In [None]:
!pip install transformers



In [None]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EncoderDecoderModel, EncoderDecoderConfig
from transformers import BertModel, BertLMHeadModel, BertConfig

In [None]:
# Single seed shared by every source of randomness in the notebook.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
# The encoder/decoder weights are randomly initialised by torch further down,
# so torch's generator has to be seeded as well for runs to be repeatable.
torch.manual_seed(RANDOM_SEED)

# Fixed token length that every source and target sequence is padded/truncated to.
MAX_LENGTH = 41

In [None]:
# Read the training and test tables from CSV files in the working directory.
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Hold out 10% of the training rows for validation; the split is seeded
# with RANDOM_SEED so the same rows are selected on every run.
df_train, df_val = train_test_split(
    df,
    random_state=RANDOM_SEED,
    test_size=0.1,
)

In [None]:
# Load the tokenizer of the "distilbert-base-multilingual-cased" checkpoint.
# The same tokenizer is applied to both the source ('data') and target ('label')
# columns below, and its length defines the vocabulary size of the model.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

In [None]:
def encode_texts(column):
    """Tokenize one DataFrame column to fixed-length sequences.

    Every text is truncated or padded to exactly MAX_LENGTH tokens, so all
    encodings produced here have the same length.
    """
    return tokenizer(
        column.tolist(),
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
    )


# Source side: the 'data' column of each split.
train_data_tokens = encode_texts(df_train['data'])
val_data_tokens = encode_texts(df_val['data'])
test_data_tokens = encode_texts(df_test['data'])

# Target side: the 'label' column (only tokenized for train and validation).
train_target_tokens = encode_texts(df_train['label'])
val_target_tokens = encode_texts(df_val['label'])

In [None]:
class LangDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source rows with tokenized target rows.

    Both ``data`` and ``targets`` must return, for an integer index, an object
    exposing ``.ids`` (``data`` rows also need ``.attention_mask``). ``data``
    must additionally support ``data['input_ids']``, which determines the
    dataset length.
    """

    def __init__(self, data, targets):
        self.data, self.targets = data, targets

    def __len__(self):
        # One example per encoded source sequence.
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        """Return the tensors for example ``idx`` as a dict.

        Keys are 'input_ids', 'attention_mask' and 'labels'. The labels are the
        raw target token ids; no positions are masked out here.
        """
        source, target = self.data[idx], self.targets[idx]
        return {
            'input_ids': torch.tensor(source.ids),
            'attention_mask': torch.tensor(source.attention_mask),
            'labels': torch.tensor(target.ids),
        }

In [None]:
# Wrap each tokenized split in a LangDataset (source encodings + target encodings).
train_dataset = LangDataset(data=train_data_tokens, targets=train_target_tokens)
val_dataset = LangDataset(data=val_data_tokens, targets=val_target_tokens)
# The test split reuses its own source encodings in the `targets` slot
# (presumably just a placeholder so the dataset can be constructed).
test_dataset = LangDataset(data=test_data_tokens, targets=test_data_tokens)

# NOTE(review): these loaders are not referenced anywhere else in the visible
# notebook; the trainer is given the datasets directly.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=64)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=16)

In [None]:
# Encoder: a BERT stack built from a fresh config (no pretrained weights are
# loaded). The vocabulary matches the tokenizer; the position-embedding table
# is sized with 64 positions of headroom beyond MAX_LENGTH.
encoder_config = BertConfig(vocab_size = len(tokenizer),
                    max_position_embeddings = MAX_LENGTH + 64,
                    num_attention_heads = 6,
                    num_hidden_layers = 6)

encoder = BertModel(config=encoder_config)


# Decoder: same dimensions as the encoder, but configured as a decoder with
# cross-attention layers so it can attend to the encoder output.
# Both flags are set once here instead of being re-assigned after construction.
decoder_config = BertConfig(vocab_size = len(tokenizer),
                    max_position_embeddings = MAX_LENGTH + 64,
                    num_attention_heads = 6,
                    num_hidden_layers = 6,
                    is_decoder=True,
                    add_cross_attention=True)

decoder = BertLMHeadModel(config=decoder_config)


model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# Special-token wiring for training (label shifting) and generation.
# Decoding starts from [CLS] and stops at [SEP]; without an EOS id, generation
# only ends when the maximum length is reached.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size


In [None]:
# Hyper-parameters for the seq2seq trainer, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',

    # Optimisation schedule.
    num_train_epochs=30,
    warmup_steps=500,
    weight_decay=0.01,

    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_accumulation_steps=16,

    # Log and evaluate every 10 optimisation steps.
    logging_strategy='steps',
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=10,

    # Write a checkpoint every 400 steps.
    save_strategy='steps',
    save_steps=400,
)

In [None]:
# Bind the model, its training arguments and the train/validation datasets
# into a single trainer object.
trainer = Seq2SeqTrainer(model=model, args=training_args,
                         train_dataset=train_dataset, eval_dataset=val_dataset)

In [None]:
# Run training with the arguments configured above. The returned TrainOutput
# (global step, mean training loss, runtime metrics) is shown as the cell result.
trainer.train()

***** Running training *****
  Num examples = 985
  Num Epochs = 30
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 930


Step,Training Loss,Validation Loss
10,11.8077,11.744199
20,11.6892,11.535938
30,11.4806,11.311883
40,11.2817,11.127597
50,11.1075,10.951204
60,10.9427,10.761515
70,10.7671,10.560139
80,10.5703,10.354213
90,10.3705,10.155472
100,10.1681,9.95682


***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evaluation *****
  Num examples = 110
  Batch size = 64
***** Running Evalua

TrainOutput(global_step=930, training_loss=3.0981063771832695, metrics={'train_runtime': 1356.6324, 'train_samples_per_second': 21.782, 'train_steps_per_second': 0.686, 'total_flos': 730878880031100.0, 'train_loss': 3.0981063771832695, 'epoch': 30.0})

In [None]:
batch_size = 16

# Generate with the plain trained model; `model_wrapped` may be a
# distributed/parallel wrapper that does not expose `generate`.
gen_model = trainer.model
gen_model.eval()  # make sure dropout is disabled while generating
device = gen_model.device  # follow the model instead of hard-coding 'cuda'

# NOTE: `id` shadows the builtin of the same name; the name is kept because a
# later cell reads `id`, `data` and `label` when assembling the results frame.
id = df_test['id'].tolist()
data = df_test['data'].tolist()
label = []

# Step through the test encodings in slices of `batch_size`. Using a stepped
# range (instead of `len // batch_size` full batches) also covers the final,
# shorter batch, so every test row receives a prediction.
num_rows = len(test_data_tokens.input_ids)
for start in range(0, num_rows, batch_size):
    stop = start + batch_size
    input_ids = torch.tensor(test_data_tokens.input_ids[start:stop]).to(device)
    # Inputs are padded to a fixed length, so pass the mask to keep the
    # encoder from attending to padding (as it was during training).
    attention_mask = torch.tensor(test_data_tokens.attention_mask[start:stop]).to(device)
    generated = gen_model.generate(input_ids,
                                   attention_mask=attention_mask,
                                   decoder_start_token_id=tokenizer.cls_token_id,
                                   eos_token_id=tokenizer.sep_token_id,
                                   pad_token_id=tokenizer.pad_token_id)
    # One row of generated token ids per input; move to CPU before storing.
    label.extend(generated.cpu())


In [None]:
# Turn each generated id sequence back into text, dropping special tokens.
labels = []
for token_ids in label:
    labels.append(tokenizer.decode(token_ids, skip_special_tokens=True))

In [None]:
# Collect ids, source texts and decoded predictions into one results frame.
df_res = pd.DataFrame(dict(id=id, data=data, label=labels))

In [None]:
# Remove every space character from the decoded predictions, overwriting the
# 'label' column in place (presumably to undo the spaces the tokenizer's decode
# step puts between word pieces; verify against the expected label format).
df_res['label'] = df_res['label'].str.replace(' ', '')

In [None]:
# Export only the 'id' and 'label' columns, without the DataFrame index.
df_res.to_csv('submission.csv', columns=['id', 'label'], index=False)

In [None]:
# Display the results frame (id, source text, predicted label) as the cell output.
df_res

Unnamed: 0,id,data,label
0,0,05/10/2007,05-10-2007
1,1,elfter september 2007,11-09-2007
2,2,09 sa'wol 2077,09-04-2077
3,3,le vingt-huit mai 2077,28-05-2077
4,4,chile gu'wol 2007,07-09-2007
...,...,...,...
4651,4651,осмог јуна 2077,08-06-2077
4652,4652,sipil sibirwol 2077,11-11-2077
4653,4653,yuke i'wol 2049,06-02-2049
4654,4654,четырнадцатого 07 2049,14-07-2049
