In [None]:
# Mount Google Drive inside the Colab runtime and add the project folder to the
# import path, so that modules stored in that folder can be imported later.
from google.colab import drive
import sys
drive.mount('/content/drive')
sys.path.append('/content/drive/My Drive/VKR')

In [None]:
!nvidia-smi --query-gpu=gpu_name,driver_version,memory.total --format=csv

In [None]:
!pip install pytorch_transformers
!pip install -U torchtext==0.10.0

In [None]:
import json
import math
import os
import pickle
import random
import time

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext.vocab
from pytorch_transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from torchtext.data.metrics import bleu_score
from torchtext.legacy.data import Field, BucketIterator
from tqdm.notebook import tqdm

SEED = 12

In [None]:
# Display the installed NLTK version.
nltk.__version__

In [None]:
from text_error_generator import WordShuffle, LetterShuffling, Typo, CommonMistake, TextErrorGenerator
from transformer import Encoder, Decoder, Transformer
from utils import DataFrameDataset

In [None]:
# Seed every random number generator used below (Python, NumPy, PyTorch CPU and
# CUDA) with the shared SEED, and ask cuDNN to use deterministic algorithms.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of visible CUDA devices (0 on a CPU-only runtime).
n_gpu = torch.cuda.device_count()
# Compare `device.type` rather than the device object: a torch.device does not
# compare equal to the plain string 'cpu', so the CPU branch was never taken.
if device.type == 'cpu':
    print('cpu')
else:
    print(n_gpu)
    if n_gpu > 0:
        print(torch.cuda.get_device_name(0))

In [None]:
def load_dataframes(add_generated: bool):
    """Load the train/test corpora and, optionally, the external sentence set.

    Every file is a ';'-separated CSV read from './drive/My Drive/VKR/'.
    The two training files are concatenated, shuffled and re-indexed; the same
    is done for the two test files.

    Args:
        add_generated: when true, 'books.csv' is also loaded, its "sentence"
            column is renamed to "trg", and the rows are shuffled.

    Returns:
        (train_df, test_df, external_df); external_df is None unless
        add_generated is true.
    """
    data_dir = './drive/My Drive/VKR/'

    def read(file_name):
        return pd.read_csv(data_dir + file_name, sep=';')

    def merge_and_shuffle(first, second):
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        return pd.concat([first, second]).sample(frac=1).reset_index(drop=True)

    train_df = merge_and_shuffle(read('rulang8_train.csv'), read('spell_ru_eval_train.csv'))
    test_df = merge_and_shuffle(read('rulang8_test.csv'), read('spell_ru_eval_test.csv'))

    external_df = None
    if add_generated:
        external_df = (
            read('books.csv')
            .rename(columns={"sentence": "trg"})
            .sample(frac=1)
            .reset_index(drop=True)
        )
    return train_df, test_df, external_df

def generateErrors(parameters, train_df, external_df):
    """Corrupt the external sentences and merge them into the training set.

    A TextErrorGenerator is built from WordShuffle, CommonMistake,
    LetterShuffling and Typo, configured from ``parameters``. Its
    ``generateErrors`` method is mapped over ``external_df.trg`` (with a tqdm
    progress bar) and the result is stored in a new 'src' column.

    Args:
        parameters: dict with keys 'word_shuffle', 'common_mistake',
            'letter_shuffling', 'typo_a' and 'typo_b'.
        train_df: existing training frame.
        external_df: frame with a 'trg' column. NOTE: it is modified in place
            (the 'src' column is added), as in the original implementation.

    Returns:
        The rows of train_df and external_df combined, shuffled and re-indexed.
    """
    generator = TextErrorGenerator(
        [
            WordShuffle(parameters['word_shuffle']),
            CommonMistake(parameters['common_mistake']),
            LetterShuffling(parameters['letter_shuffling']),
            Typo(parameters['typo_a'], parameters['typo_b']),
        ]
    )
    print("generating errors...")
    tqdm.pandas()  # registers DataFrame/Series.progress_map
    external_df['src'] = external_df.trg.progress_map(generator.generateErrors)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    augmented_df = pd.concat([train_df, external_df])
    return augmented_df.sample(frac=1).reset_index(drop=True)

def save_vocab(vocab, path):
    """Pickle ``vocab`` into the file at ``path`` (overwriting it).

    The file is opened in a ``with`` block so the handle is closed even when
    ``pickle.dump`` raises.
    """
    with open(path, 'wb') as output:
        pickle.dump(vocab, output)

def create_vocab_and_tokenize(iteration, train_data, test_data):
    """Build the source/target fields, datasets and vocabularies.

    Both fields tokenize with the multilingual cased BERT tokenizer, wrap each
    sequence in '<sos>'/'<eos>', use a fixed length of 80 and are batch-first.
    The vocabularies are built from the training dataset only and pickled to
    the 'random_search' folder on Drive (overwriting earlier files).

    Args:
        iteration: index of the current search iteration (not used here).
        train_data: training frame passed to DataFrameDataset.
        test_data: test frame passed to DataFrameDataset.

    Returns:
        (train_ds, test_ds, SRC, TRG)
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

    def make_field():
        # Source and target sides share one field configuration.
        return Field(
            tokenize=tokenizer.tokenize,
            init_token='<sos>',
            eos_token='<eos>',
            fix_length=80,
            lower=False,
            batch_first=True,
        )

    SRC = make_field()
    TRG = make_field()
    fields = {'trg': TRG, 'src': SRC}

    train_ds = DataFrameDataset(train_data, fields)
    test_ds = DataFrameDataset(test_data, fields)

    for field in (SRC, TRG):
        field.build_vocab(train_ds, min_freq=1, vectors="glove.6B.100d")

    save_vocab(SRC.vocab, './drive/My Drive/VKR/random_search/SRC_transformer_best.pickle')
    save_vocab(TRG.vocab, './drive/My Drive/VKR/random_search/TRG_transformer_best.pickle')

    print("Number of words in source vocabulary", len(SRC.vocab))
    print("Number of words in target vocabulary", len(TRG.vocab))
    return train_ds, test_ds, SRC, TRG

def create_iterator(train_ds, test_ds, batch_size=128):
    """Wrap the train and test datasets in BucketIterators on ``device``.

    Args:
        train_ds: training dataset.
        test_ds: test dataset.
        batch_size: examples per batch; defaults to 128, the value that was
            previously hard-coded.

    Returns:
        (train_iterator, test_iterator)
    """
    train_iterator, test_iterator = BucketIterator.splits(
        (train_ds, test_ds),
        batch_size=batch_size,
        device=device,
    )
    return train_iterator, test_iterator


def randomize_parameters():
    """Draw one random value for each error-generator parameter.

    Every parameter is picked with ``random.choice`` from the grid
    ``np.arange(0.0, 0.2, 0.005)``, in the order word_shuffle, common_mistake,
    letter_shuffling, typo_a, typo_b.

    Returns:
        dict mapping parameter name to the sampled float.
    """
    names = ('word_shuffle', 'common_mistake', 'letter_shuffling', 'typo_a', 'typo_b')
    sampled = {}
    for name in names:
        candidates = np.arange(0.0, 0.2, 0.005).tolist()
        sampled[name] = random.choice(candidates)
    return sampled
                    

In [None]:
def load_existing_vocab(src_field=None, trg_field=None):
    """Load the pickled source/target vocabularies and attach them to fields.

    The original version assigned to module-level ``SRC``/``TRG`` objects that
    this notebook never defines at module level, so calling it raised
    NameError. The fields can now be passed in explicitly; when omitted, a
    module-level ``SRC``/``TRG`` is still used if one exists.

    Args:
        src_field: object whose ``vocab`` attribute receives the source vocab.
        trg_field: object whose ``vocab`` attribute receives the target vocab.

    Returns:
        (src_vocab, trg_vocab) as loaded from disk.
    """
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open("./drive/My Drive/VKR/latest_SRC_real.pickle", "rb") as f:
        src_vocab = pickle.load(f)
    with open("./drive/My Drive/VKR/latest_TRG_real.pickle", "rb") as f:
        trg_vocab = pickle.load(f)

    # Backward compatibility with the zero-argument call style.
    if src_field is None:
        src_field = globals().get('SRC')
    if trg_field is None:
        trg_field = globals().get('TRG')

    if src_field is not None:
        src_field.vocab = src_vocab
    if trg_field is not None:
        trg_field.vocab = trg_vocab
    return src_vocab, trg_vocab


In [None]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def initialize_weights(m):
    """Xavier-uniform initialise the ``weight`` of module ``m`` in place.

    Intended for ``model.apply``. Only weights with more than one dimension
    are touched. Modules without a ``weight`` attribute, or whose ``weight`` is
    ``None`` (e.g. normalisation layers created with ``affine=False``), are
    skipped instead of raising AttributeError.
    """
    weight = getattr(m, 'weight', None)
    if weight is not None and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)

def create_transformer(SRC, TRG):
    """Build the encoder-decoder Transformer for the given fields.

    Vocabulary sizes and padding indices are taken from ``SRC``/``TRG``. The
    model is moved to ``device``, its trainable-parameter count is printed and
    its weights are re-initialised with ``initialize_weights``.

    Returns:
        The initialised Transformer model.
    """
    src_pad_idx = SRC.vocab.stoi[SRC.pad_token]
    trg_pad_idx = TRG.vocab.stoi[TRG.pad_token]

    hid_dim = 256
    # (hidden dim, layers, attention heads, feed-forward dim, dropout)
    encoder_cfg = (hid_dim, 3, 8, 512, 0.1)
    decoder_cfg = (hid_dim, 3, 8, 512, 0.1)

    encoder = Encoder(len(SRC.vocab), *encoder_cfg, device)
    decoder = Decoder(len(TRG.vocab), *decoder_cfg, device)

    model = Transformer(encoder, decoder, src_pad_idx, trg_pad_idx, device)
    model = model.to(device)

    print(f'The model has {count_parameters(model):,} trainable parameters')
    model.apply(initialize_weights)
    return model


In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean batch loss.

    For each batch the model is fed the source and the target without its last
    token; the loss is computed against the target without its first token.
    Gradients are clipped to norm ``clip`` before the optimizer step.

    Args:
        model: callable returning ``(output, _)`` for ``(src, trg_input)``.
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking flattened outputs and flattened targets.
        clip: maximum gradient norm.

    Returns:
        Sum of batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0

    with tqdm(total=len(iterator), file=sys.stdout) as pbar:
        for batch in iterator:
            src, trg = batch.src, batch.trg

            optimizer.zero_grad()

            # logits = [batch size, trg len - 1, output dim]
            logits, _ = model(src, trg[:, :-1])

            # Flatten the batch and time axes for the criterion.
            flat_logits = logits.contiguous().view(-1, logits.shape[-1])
            flat_targets = trg[:, 1:].contiguous().view(-1)

            loss = criterion(flat_logits, flat_targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            running_loss += loss.item()

            pbar.set_description('train processed')
            pbar.update(1)

    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Compute the mean loss over ``iterator`` without gradient tracking.

    Also collects the arg-max predictions and the target index lists of every
    batch.

    NOTE(review): predictions cover ``trg len - 1`` positions while the
    collected targets keep the full ``trg`` rows, so the two lists are offset
    by one position - confirm before comparing them element-wise.

    Returns:
        (mean_loss, (predict, target))
    """
    model.eval()
    total_loss = 0
    predict, target = [], []

    with torch.no_grad(), tqdm(total=len(iterator), file=sys.stdout) as pbar:
        for batch in iterator:
            src, trg = batch.src, batch.trg

            # logits = [batch size, trg len - 1, output dim]
            logits, _ = model(src, trg[:, :-1])
            predict.extend(logits.argmax(2).tolist())
            target.extend(trg.tolist())

            # Flatten the batch and time axes for the criterion.
            flat_logits = logits.contiguous().view(-1, logits.shape[-1])
            flat_targets = trg[:, 1:].contiguous().view(-1)

            total_loss += criterion(flat_logits, flat_targets).item()

            pbar.set_description('evaluate processed')
            pbar.update(1)

    return total_loss / len(iterator), (predict, target)

def translate_sentence_vectorized(src_tensor, src_field, trg_field, model, device, max_len=80):
    """Greedy-decode a whole batch of source sequences at once.

    Args:
        src_tensor: batch of source token indices (a torch.Tensor, batch first).
        src_field: unused; kept for interface compatibility.
        trg_field: field providing ``vocab.stoi``/``vocab.itos`` and the
            ``init_token``/``eos_token`` strings.
        model: model exposing ``make_src_mask``, ``make_trg_mask``, ``encoder``
            and ``decoder``.
        device: device the decoder input is moved to.
        max_len: maximum number of decoding steps.

    Returns:
        (pred_sentences, attention): one list of token strings per input row,
        cut before the first <eos>, and the attention returned by the last
        decoder call (``None`` when ``max_len`` is 0).
    """
    assert isinstance(src_tensor, torch.Tensor)

    model.eval()
    # Look the special-token indices up once instead of on every comparison.
    sos_idx = trg_field.vocab.stoi[trg_field.init_token]
    eos_idx = trg_field.vocab.stoi[trg_field.eos_token]
    batch_size = len(src_tensor)

    src_mask = model.make_src_mask(src_tensor)
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [[sos_idx] for _ in range(batch_size)]
    # Rows that already produced <eos> are still fed through the decoder
    # because the batch is decoded as a whole; decoding stops once every row
    # has produced <eos>.
    finished = [False] * batch_size
    attention = None  # stays None if the loop body never runs
    for _ in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)
        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        # Convert to plain ints in one call so the index lists never hold tensors.
        next_tokens = output.argmax(2)[:, -1].tolist()
        for row, token in enumerate(next_tokens):
            trg_indexes[row].append(token)
            if token == eos_idx:
                finished[row] = True
        if all(finished):
            break

    # Drop the leading <sos> and everything from the first <eos> onwards.
    pred_sentences = []
    for sentence in trg_indexes:
        tokens = []
        for idx in sentence[1:]:
            if idx == eos_idx:
                break
            tokens.append(trg_field.vocab.itos[idx])
        pred_sentences.append(tokens)

    return pred_sentences, attention

def calculate_bleu_alt(iterator, src_field, trg_field, model, device, max_len = 80):
    """Greedy-decode every batch and score the predictions with BLEU.

    Args:
        iterator: iterable of batches exposing ``.src`` and ``.trg`` tensors.
        src_field: passed through to ``translate_sentence_vectorized``.
        trg_field: field providing the target vocabulary and special tokens.
        model: the model to decode with.
        device: device used while decoding.
        max_len: maximum number of decoding steps; it is now forwarded to the
            decoder (it used to be accepted but ignored).

    Returns:
        (pred_trgs, trgs, bleu): predicted token lists, reference token lists
        (each wrapped in a one-element list) and the BLEU score.
    """
    vocab = trg_field.vocab
    # A reference ends at the first <eos> or padding token.
    stop_indices = {vocab.stoi[trg_field.eos_token], vocab.stoi[trg_field.pad_token]}

    trgs = []
    pred_trgs = []
    with torch.no_grad():
        for batch in iterator:
            # One tolist() per batch instead of comparing tensor elements one by one.
            for sentence in batch.trg.tolist():
                reference = []
                # Skip the first position, which holds the start token.
                for idx in sentence[1:]:
                    if idx in stop_indices:
                        break
                    reference.append(vocab.itos[idx])
                trgs.append([reference])
            pred_trg, _ = translate_sentence_vectorized(
                batch.src, src_field, trg_field, model, device, max_len
            )
            pred_trgs += pred_trg
    return pred_trgs, trgs, bleu_score(pred_trgs, trgs)

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        (minutes, seconds), both truncated to int.
    """
    total = end_time - start_time
    whole_minutes = int(total / 60)
    leftover = total - (whole_minutes * 60)
    return whole_minutes, int(leftover)

def fit_model(iteration, model, n_epochs, train_iterator, test_iterator, SRC, TRG):
    """Train ``model`` for ``n_epochs`` and track the best loss and BLEU.

    Uses Adam (lr 0.0005), cross-entropy that ignores the target padding index,
    and gradient clipping at 1. BLEU is computed only from the fourth epoch on
    (``epoch > 2``); whenever it improves, the model weights are saved to
    'transformer_best_params.pth' in the 'random_search' folder.

    Args:
        iteration: index of the current search iteration (not used here).
        model: model to train.
        n_epochs: number of epochs.
        train_iterator: batches used for training.
        test_iterator: batches used for validation loss and BLEU.
        SRC: source field.
        TRG: target field.

    Returns:
        (best validation loss, best BLEU); BLEU stays 0 if it was never scored.
    """
    clip = 1
    learning_rate = 0.0005

    pad_idx = TRG.vocab.stoi[TRG.pad_token]
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    lowest_loss = float('inf')
    highest_bleu = 0

    for epoch in range(n_epochs):
        started = time.time()

        train_loss = train(model, train_iterator, optimizer, criterion, clip)
        valid_loss, (_predictions, _targets) = evaluate(model, test_iterator, criterion)

        lowest_loss = min(lowest_loss, valid_loss)

        if epoch > 2:
            print('scoring BLEU...')
            _, _, bleu = calculate_bleu_alt(test_iterator, SRC, TRG, model, device)
            print(f'BLEU score = {bleu*100:.2f}')
            if bleu > highest_bleu:
                highest_bleu = bleu
                torch.save(model.state_dict(), './drive/My Drive/VKR/random_search/transformer_best_params.pth')

        epoch_mins, epoch_secs = epoch_time(started, time.time())
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

    return lowest_loss, highest_bleu


In [None]:
# Number of random error-generator configurations to try.
RANDOM_SEARCH_TRIES = 10
# Training epochs per configuration.
N_EPOCHS = 8
# Whether generated errors are added to the training data.
GENERATE_ERRORS = True
def random_search(add_errors, iterations, epochs):
    """Random search over the error-generator parameters.

    Each iteration reloads the data, samples generator parameters, optionally
    augments the training set with generated errors, trains a fresh
    Transformer and keeps the configuration with the highest BLEU. A summary
    is printed at the end; nothing is returned.

    NOTE(review): `fit_model` and `create_vocab_and_tokenize` write to fixed
    file names, so the checkpoint and vocabulary left on disk belong to the
    most recent iteration that saved them, not necessarily to the best one.

    Args:
        add_errors: when false, no errors are generated and a single iteration
            is run regardless of ``iterations``.
        iterations: number of parameter configurations to try.
        epochs: training epochs per configuration.
    """
    best_params = {}
    best_iteration = 0
    best_valid_loss = float('inf')
    best_valid_bleu = 0

    if not add_errors:
        iterations = 1

    for i in range(iterations):
        print("ITERATION", i)
        # Local names avoid shadowing the module-level train() function.
        train_df, test_df, external_df = load_dataframes(add_errors)  # read real data
        generator_params = randomize_parameters()
        print("current_params: ", generator_params)
        if add_errors:
            train_df = generateErrors(generator_params, train_df, external_df)

        # randrange excludes the upper bound; randint(0, len(df)) could pick
        # len(df) and make .iloc raise IndexError.
        print("train data sample:")
        train_idx = random.randrange(len(train_df))
        print("src = ", train_df['src'].iloc[train_idx])
        print("trg = ", train_df['trg'].iloc[train_idx])
        test_idx = random.randrange(len(test_df))
        print("test data sample:")
        print("src = ", test_df['src'].iloc[test_idx])
        print("trg = ", test_df['trg'].iloc[test_idx])
        print()
        print(f"train size: {len(train_df)}, test size: {len(test_df)}")

        train_ds, test_ds, SRC, TRG = create_vocab_and_tokenize(i, train_df, test_df)
        train_iterator, test_iterator = create_iterator(train_ds, test_ds)
        model = create_transformer(SRC, TRG)
        valid_loss, valid_bleu = fit_model(i, model, epochs, train_iterator, test_iterator, SRC, TRG)
        if valid_bleu >= best_valid_bleu:
            best_valid_bleu = valid_bleu
            best_iteration = i  # was `iterations`, which always reported the total
            best_valid_loss = valid_loss
            best_params = generator_params

    print(f"BEST_VALID_BLEU: = {best_valid_bleu*100:.2f}")
    print(f"BEST ITERATION = {best_iteration}")
    print(f"BEST LOSS = {best_valid_loss}")
    print(f"BEST PARAMS = {best_params}")

### Random Search со сгенерированными данными

In [None]:
# Run the random search using the GENERATE_ERRORS, RANDOM_SEARCH_TRIES and
# N_EPOCHS settings defined earlier in the notebook.
random_search(GENERATE_ERRORS, RANDOM_SEARCH_TRIES, N_EPOCHS)

### Без генератора на реальных данных

In [None]:
# Baseline: no generated errors, 15 epochs. random_search runs a single
# iteration whenever add_errors is False.
random_search(False, 1, 15)

### Генератор с оптимальными параметрами из Random Search

In [None]:
# Number of runs with the fixed generator parameters.
WITH_BEST_PARAMS_TRIES = 1
# NOTE: N_EPOCHS and GENERATE_ERRORS are reassigned here, replacing the values
# set in the earlier random-search cell.
N_EPOCHS = 25
GENERATE_ERRORS = True
def random_search(add_errors, iterations, epochs):
    """Train with a fixed set of error-generator parameters.

    NOTE: this redefines the `random_search` from the earlier cell. The only
    intended difference is that the generator parameters are the fixed values
    below instead of being sampled. A summary is printed at the end; nothing
    is returned.

    Args:
        add_errors: when false, no errors are generated and a single iteration
            is run regardless of ``iterations``.
        iterations: number of training runs.
        epochs: training epochs per run.
    """
    # Fixed generator parameters, used unchanged in every iteration.
    generator_params = {
        'word_shuffle': 0.045,
        'common_mistake': 0.1,
        'letter_shuffling': 0.105,
        'typo_a': 0.015,
        'typo_b': 0.065,
    }

    best_params = {}
    best_iteration = 0
    best_valid_loss = float('inf')
    best_valid_bleu = 0

    if not add_errors:
        iterations = 1

    for i in range(iterations):
        print("ITERATION", i)
        # Local names avoid shadowing the module-level train() function.
        train_df, test_df, external_df = load_dataframes(add_errors)  # read real data
        print("current_params: ", generator_params)
        if add_errors:
            train_df = generateErrors(generator_params, train_df, external_df)

        # randrange excludes the upper bound; randint(0, len(df)) could pick
        # len(df) and make .iloc raise IndexError.
        print("train data sample:")
        train_idx = random.randrange(len(train_df))
        print("src = ", train_df['src'].iloc[train_idx])
        print("trg = ", train_df['trg'].iloc[train_idx])
        test_idx = random.randrange(len(test_df))
        print("test data sample:")
        print("src = ", test_df['src'].iloc[test_idx])
        print("trg = ", test_df['trg'].iloc[test_idx])
        print()
        print(f"train size: {len(train_df)}, test size: {len(test_df)}")

        train_ds, test_ds, SRC, TRG = create_vocab_and_tokenize(i, train_df, test_df)
        train_iterator, test_iterator = create_iterator(train_ds, test_ds)
        model = create_transformer(SRC, TRG)
        valid_loss, valid_bleu = fit_model(i, model, epochs, train_iterator, test_iterator, SRC, TRG)
        if valid_bleu >= best_valid_bleu:
            best_valid_bleu = valid_bleu
            best_iteration = i  # was `iterations`, which always reported the total
            best_valid_loss = valid_loss
            best_params = generator_params

    print(f"BEST_VALID_BLEU: = {best_valid_bleu*100:.2f}")
    print(f"BEST ITERATION = {best_iteration}")
    print(f"BEST LOSS = {best_valid_loss}")
    print(f"BEST PARAMS = {best_params}")

In [None]:
# Train with the fixed generator parameters; this calls the random_search
# redefined in the previous cell.
random_search(GENERATE_ERRORS, WITH_BEST_PARAMS_TRIES, N_EPOCHS)

In [None]:
 def correct_sentence(sentence, src_field, trg_field, model, device, max_len = 80):
    """Greedy-decode the correction of a single sentence.

    Args:
        sentence: raw string (tokenized with the multilingual cased BERT
            tokenizer) or an iterable of already tokenized strings.
        src_field: source field providing ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        trg_field: target field providing ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos``.
        model: model exposing ``make_src_mask``, ``make_trg_mask``, ``encoder``
            and ``decoder``.
        device: device the input tensors are moved to.
        max_len: maximum number of decoding steps.

    Returns:
        (tokens, attention): the predicted tokens without the leading start
        token (the final <eos> is included when one was produced) and the
        attention of the last decoder call (``None`` when ``max_len`` is 0).
    """
    model.eval()

    if isinstance(sentence, str):
        tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
        tokens = list(tokenizer.tokenize(sentence))
    else:
        tokens = list(sentence)

    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)

    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    # Look the <eos> index up once instead of on every decoding step.
    eos_idx = trg_field.vocab.stoi[trg_field.eos_token]
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    attention = None  # stays None if the loop body never runs

    for _ in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)

        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

        pred_token = output.argmax(2)[:, -1].item()
        trg_indexes.append(pred_token)

        if pred_token == eos_idx:
            break

    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    return trg_tokens[1:], attention

In [None]:
def beautify(text_lst):
    """Join word-piece tokens back into a readable string.

    Pieces starting with "##" are glued to the previous piece without the
    marker, the punctuation marks , . ? ! are attached without a space, and
    every other token is preceded by a single space. The result no longer
    starts with a stray space.

    Args:
        text_lst: iterable of token strings.

    Returns:
        The reconstructed text ("" for empty input).
    """
    punctuation = {",", ".", "?", "!"}
    pieces = []
    for item in text_lst:
        if item.startswith("##"):
            pieces.append(item[2:])
        elif item in punctuation:
            pieces.append(item)
        else:
            pieces.append(" " + item)
    # Joining once avoids repeated string concatenation; lstrip removes the
    # space that would otherwise precede the first word.
    return "".join(pieces).lstrip()
    

In [None]:
def try_correct(sentence):
    """Load the saved vocabularies and model, correct ``sentence`` and print it.

    The fields are rebuilt with the same configuration as in training, the
    pickled vocabularies and the saved weights are read from the
    'random_search' folder on Drive, and the decoded tokens are joined with
    `beautify` before printing. Everything is reloaded on every call.

    Args:
        sentence: raw string or iterable of tokens, as accepted by
            `correct_sentence`.
    """
    tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

    def make_field():
        # Source and target sides share one field configuration.
        return Field(
            tokenize=tokenizer.tokenize,
            init_token='<sos>',
            eos_token='<eos>',
            fix_length=80,
            lower=False,
            batch_first=True,
        )

    SRC = make_field()
    TRG = make_field()

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open("./drive/My Drive/VKR/random_search/SRC_transformer_best.pickle", "rb") as f:
        SRC.vocab = pickle.load(f)
    with open("./drive/My Drive/VKR/random_search/TRG_transformer_best.pickle", "rb") as f:
        TRG.vocab = pickle.load(f)

    model = create_transformer(SRC, TRG)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
    state_dict = torch.load(
        './drive/My Drive/VKR/random_search/transformer_best_params.pth',
        map_location=device,
    )
    model.load_state_dict(state_dict)

    translation, _ = correct_sentence(sentence, SRC, TRG, model, device)
    # Drop the trailing <eos> only when decoding actually produced one;
    # otherwise the last real token would be cut off.
    if translation and translation[-1] == TRG.eos_token:
        translation = translation[:-1]
    print(beautify(translation))

In [None]:
# Try the saved model on a sentence with several misspelled words.
try_correct("Мй любзный друуг, я тк по теебе скуал!")

In [None]:
# Try the saved model on a sentence with unusual word order and punctuation.
try_correct("Привет пожалуйста принесешь, чаю ?")

In [None]:
# Try the saved model on a single heavily garbled word.
try_correct("Ппзнййся")