In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/last_last_try_bart_model_params/pytorch/default/1/last_last_try_bart_model_params.pth
/kaggle/input/val-data/val
/kaggle/input/test-ref/test_no_reference
/kaggle/input/train-data/train


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

from transformers import BartForConditionalGeneration, BartTokenizer, GenerationConfig, Trainer, TrainingArguments

import json
import gc

In [3]:
train = []
with open('/kaggle/input/train-data/train', 'r') as file:
    for line in file:
        new_line = json.loads(line.strip())
        train.append(new_line)

valid = []
with open('/kaggle/input/val-data/val', 'r') as file:
    for line in file:
        new_line = json.loads(line.strip())
        valid.append(new_line)

test = []
with open('/kaggle/input/test-ref/test_no_reference', 'r') as file:
    for line in file:
        new_line = json.loads(line.strip())
        test.append(new_line['src'])

In [4]:
train[0], len(train)

({'dst': '- Intriguing.', 'src': '◄▴◓◠▨ ◨▽◠▦◈◬◓▪▼◬▵'}, 300000)

### Filter empty, diff length, diff digits, diff parenthesis or quotation mark sentences

In [5]:
empty_sentences_inds = []
for i in range(len(train)):
    if train[i]['dst'] == '' or train[i]['src'] == '':
        empty_sentences_inds.append(i)

len(empty_sentences_inds)

0

In [6]:
diff_size_inds = []
for i in range(len(train)):
    if len(train[i]['dst'].split()) >= 2 * len(train[i]['src'].split()) or len(train[i]['src'].split()) >= 2 * len(train[i]['dst'].split()):
        diff_size_inds.append(i)

len(diff_size_inds)

86634

In [7]:
for i in range(len(train), -1, -1):
    if i in diff_size_inds:
        del train[i]

In [8]:
len(train)

213366

In [9]:
diff_digits_inds = []
for i in range(len(train)):
    dst = train[i]['dst'].split()
    src = train[i]['src'].split()
    for token in src:
        if token.isdigit() and token not in src:
                diff_digits_inds.append(i)

len(diff_digits_inds)

0

In [28]:
unique_src = set()
unique_src_inds = []
for i in range(len(train)):
    if train[i]['src'] not in unique_src:
        unique_src.add(train[i]['src'])
        unique_src_inds.append(i)

len(unique_src)

193572

In [32]:
train = [train[i] for i in unique_src_inds]
len(train)

193572

In [33]:
unique_dst = set()
unique_dst_inds = []
for i in range(len(train)):
    if train[i]['dst'] not in unique_dst:
        unique_dst.add(train[i]['dst'])
        unique_dst_inds.append(i)

len(unique_dst)

189762

In [34]:
train = [train[i] for i in unique_dst_inds]
len(train)

189762

In [35]:
diff_parenthesis_quotation_mark_inds = []
check = set(['(', ')', '[', ']', '{', '}', '"', "'"])
for i in range(len(train)):
    dst = train[i]['dst'].split()
    src = train[i]['src'].split()
    for token in src:
        if token in check and token not in src:
            diff_parenthesis_quotation_mark_inds.append(i)

len(diff_parenthesis_quotation_mark_inds)

0

### Using Bart

In [8]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

gen_config = GenerationConfig(num_beams=4,
                              early_stopping=True,
                              no_repeat_ngram_size=4,
                              forced_bos_token_id=0)

model.generation_config = gen_config

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [37]:
def encode_data(data, tokenizer, max_length=50):
    input_encodings = tokenizer([item['src'] for item in data], truncation=True, padding=True, max_length=max_length)
    target_encodings = tokenizer([item['dst'] for item in data], truncation=True, padding=True, max_length=max_length)
    
    return input_encodings, target_encodings


input_encodings, target_encodings = encode_data(train, tokenizer)
eval_input_encodings, eval_target_encodings = encode_data(valid, tokenizer)

In [38]:
class TranslationDataset(Dataset):
    def __init__(self, input_encodings, target_encodings):
        self.input_encodings = input_encodings
        self.target_encodings = target_encodings

    def __len__(self):
        return len(self.input_encodings['input_ids'])

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_encodings['input_ids'][idx])
        attention_mask = torch.tensor(self.input_encodings['attention_mask'][idx])
        labels = torch.tensor(self.target_encodings['input_ids'][idx])
        
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

train_dataset = TranslationDataset(input_encodings, target_encodings)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

valid_dataset = TranslationDataset(eval_input_encodings, eval_target_encodings)
valid_dataloader = DataLoader(valid_dataset, batch_size=8, shuffle=False)

In [39]:
training_args = TrainingArguments(output_dir="./results",          # куда будут сохранены результаты 
                                  eval_strategy="epoch",           # оценка модели по завершению каждой эпохи
                                  learning_rate=1e-4,              # скорость обучения
                                  per_device_train_batch_size=8,   # размер батча
                                  per_device_eval_batch_size=8,
                                  weight_decay=0.03,               # регуляризация L2
                                  save_total_limit=2,              # хранить только последние 2 сохранённые модели
                                  num_train_epochs=4,              # количество эпох
                                  logging_steps=500,
                                  logging_first_step=True,
                                  save_steps=10_000,               # сохранять модель каждые 10000 шагов
                                  report_to="tensorboard")

trainer = Trainer(model=model,                   # модель для дообучения
                  args=training_args,            # параметры обучения
                  train_dataset=train_dataset,   # датасет для обучения
                  eval_dataset=valid_dataset,
                  tokenizer=tokenizer)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.8,2.737467
2,0.686,2.694982
3,0.6128,2.691104
4,0.5305,2.710722




TrainOutput(global_step=94884, training_loss=0.680077860093148, metrics={'train_runtime': 9620.887, 'train_samples_per_second': 78.896, 'train_steps_per_second': 9.862, 'total_flos': 2.2598595661824e+16, 'train_loss': 0.680077860093148, 'epoch': 4.0})

In [40]:
torch.save(model, 'last_try_bart_model.pth')
torch.save(model.state_dict(), 'last_try_bart_model_params.pth')

In [9]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
model.load_state_dict(torch.load('/kaggle/input/last_last_try_bart_model_params/pytorch/default/1/last_last_try_bart_model_params.pth'))

  model.load_state_dict(torch.load('/kaggle/input/last_last_try_bart_model_params/pytorch/default/1/last_last_try_bart_model_params.pth'))


<All keys matched successfully>

In [10]:
gc.collect()

178

In [11]:
device = torch.device('cpu')
device

device(type='cpu')

In [13]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device('cpu')
model.to(device)

test_encodings = tokenizer(test, return_tensors="pt", padding=True, truncation=True, max_length=50).to(device)

input_ids = test_encodings['input_ids'].to(device)
attention_mask = test_encodings['attention_mask'].to(device)

In [None]:
# Генерация перевода
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=test_encodings['input_ids'],
        attention_mask=test_encodings['attention_mask'],
        num_beams=4,
        max_length=50,
        early_stopping=True,
        decoder_start_token_id=tokenizer.bos_token_id  # Указываем токен начала последовательности
    )

# Декодирование сгенерированных токенов в текст
translations = [tokenizer_.decode(ids, skip_special_tokens=True) for ids in generated_ids]

In [None]:
data = [{"dst": translated, "src": source} for translated, source in zip(translations, test)]

with open('dst_src_for_test.json', 'w') as file:
    for entry in data:
        json.dump(entry, file, ensure_ascii=False)
        file.write("\n")