# Requirements

In [1]:
!pip install razdel



# Mount colab

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!mkdir models

# Imports

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import os
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
from razdel import tokenize
import gdown
import pandas as pd
import json
import random
import copy

%matplotlib inline

# Download train data from Google Drive

In [5]:
gdown.download(url='https://drive.google.com/uc?id=1iFPkClVHzYC2werYdzIly38EOvP4HGFd', output='qa_data.json', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1iFPkClVHzYC2werYdzIly38EOvP4HGFd
To: /content/qa_data.json
100%|██████████| 1.31G/1.31G [00:08<00:00, 153MB/s]


'qa_data.json'

# Build functions for load data to memory and load data

In [3]:
def load_jsonl_data(path_to_jsonl: str) -> pd.DataFrame:
    with open(path_to_jsonl) as f:
        data = [json.loads(line) for line in f]
    print(f'Loaded {len(data)} examples')
    result_dict = {}
    for i in range(len(data)):
        result_dict[i] = data[i]
    df = pd.DataFrame().from_dict(result_dict, orient='index')
    del result_dict
    del data
    return df

In [4]:
def filter_df_jsonl(df: pd.DataFrame) -> pd.DataFrame:
    df['responses_len'] = df['responses'].apply(lambda x: len(x))
    df['question_len'] = df['question'].apply(lambda x: len(x.split(' ')))
    
    df = df[(df['responses_len'] >= 2)&(df['question_len'] >= 10)]

    del df['question_len']
    del df['responses_len']

    def select_max_response(responses: list) -> str:
        max_len = 0
        index_for_return = None
        for i in range(len(responses)):
            if len(responses[i]) > max_len:
                max_len = len(responses[i])
                index_for_return = copy.copy(i)
        
        return responses[index_for_return]
    
    df['responses'] = df['responses'].apply(select_max_response)
    
    return df

In [5]:
df = load_jsonl_data('qa_data.json')

Loaded 2808811 examples


In [6]:
print(len(df))

2808811


In [7]:
df_filtred = filter_df_jsonl(df)
del df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [8]:
df_filtred.tail(10)

Unnamed: 0,question,category,responses
2808782,"как пользоваьбся шпорой, куда ее сувать если з...",Образование,пиши на линейке... деревянной.. карандашом.. в...
2808783,"хочу приобрести крысу , но у меня аллергия , м...","Животные, Растения",вот лысая: но не думаю что у вас алергия ...
2808785,как называется песня я только помню что она бы...,Искусство и Культура,new world sound &amp; thomas newson — flute j...
2808788,"*** почему ...многие не признаются, что у них ...","Знакомства, Любовь, Отношения","ну я лично понимаю, что настроил себе иллюзий...."
2808790,удар если удар составляет 500 кг. при росте 18...,Спорт,"смотря чем? если кувалдой кг на 20, то да, а е..."
2808795,"вас жестоко передавали те, кому вы верили? и ч...","Знакомства, Любовь, Отношения","сделала и сделала. главное, что чел этого не з..."
2808797,в каком месте колоть язык?? ? главное в вены н...,Красота и Здоровье,"идиотизм, а если потеряете дар речи, повредив ..."
2808798,"можно ли завести рыбок гуппи, если у тебя алле...","Животные, Растения",вообще гуппи едят корм но раз аллергия можно д...
2808802,"можно ли использовать реплику чатского ""а судь...",Искусство и Культура,"можно. там у островского, насколько помню, одн..."
2808810,какой вам начальник больше подойдёт?глупый - н...,"Работа, Карьера","глупому начальнику можно что угодно ""втереть"",..."


In [9]:
len(df_filtred)

841064

In [10]:
# Выбираем небольшой набор данных случайным образом чтобы проверить что модель обучается
df_filtred = df_filtred.sample(10000)

In [11]:
def preprocessing_sentence(sentence: str) -> list:
    s = sentence.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^0-9a-zA-Zа-яёА-ЯЁ]+", r" ", s)
    tokens = list(tokenize(s))
    res_sent = [_.text for _ in tokens]
    return res_sent

In [12]:
questions = df_filtred['question'].apply(preprocessing_sentence).tolist()
responses = df_filtred['responses'].apply(preprocessing_sentence).tolist()

In [13]:
questions[0]

['зачем',
 'парни',
 'так',
 'делают',
 'когда',
 'обнимаюсь',
 'или',
 'целуюсь',
 'с',
 'парнем',
 'он',
 'берет',
 'меня',
 'за',
 'бедра',
 'и',
 'прижимает',
 'к',
 'себе']

In [14]:
responses[0]

['укусит',
 'за',
 'шею',
 'попьёт',
 'твоей',
 'крови',
 'и',
 'потом',
 'ты',
 'сама',
 'станешь',
 'такой',
 'видишь',
 'тут',
 'многих',
 'бараны',
 'покусали']

In [15]:
max_len_all = 0
for sen in questions:
    if len(sen) > max_len_all:
        max_len_all = len(sen)

for sen in responses:
    if len(sen) > max_len_all:
        max_len_all = len(sen)

In [16]:
max_len_all

38

In [17]:
max_length = 60
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'myseq2seq'

In [18]:
pad_token = 0
sos_token = 1
eos_token = 2

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {pad_token:'PAD', sos_token:'SOS', eos_token : 'EOS'}
        self.numword = 3
        
    def add_sentence(self, sentence):
        s = sentence.lower().strip()
        s = re.sub(r"([.!?])", r" \1", s)
        s = re.sub(r"[^0-9a-zA-Zа-яёА-ЯЁ]+", r" ", s)
        tokens = list(tokenize(s))
        res_sent = [_.text for _ in tokens]
        for word in res_sent:
            self.addword(word)
            
    def addword(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.numword
            self.word2count[word] = 1
            self.index2word[self.numword] = word
            self.numword += 1
        else:
            self.word2count[word] += 1

In [19]:
voc = Voc(model_name)

for sent in questions:
    for word in sent:
        voc.addword(word)

for sent in responses:
    for word in sent:
        voc.addword(word)

pairs = list(zip(questions, responses))

voc, trimmed_pair =  voc, pairs

In [20]:
trimmed_pair[0]

(['зачем',
  'парни',
  'так',
  'делают',
  'когда',
  'обнимаюсь',
  'или',
  'целуюсь',
  'с',
  'парнем',
  'он',
  'берет',
  'меня',
  'за',
  'бедра',
  'и',
  'прижимает',
  'к',
  'себе'],
 ['укусит',
  'за',
  'шею',
  'попьёт',
  'твоей',
  'крови',
  'и',
  'потом',
  'ты',
  'сама',
  'станешь',
  'такой',
  'видишь',
  'тут',
  'многих',
  'бараны',
  'покусали'])

In [21]:
def index_from_sentence(voc, sentence):
    return [voc.word2index[word] for word in sentence] + [eos_token]

In [22]:
def zeroPadding(l,fillvalue=pad_token):
    return list(itertools.zip_longest(*l, fillvalue = fillvalue))

def binaryMatrix(l, value=pad_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if token == pad_token:
                m[i].append(0)
            else:
                m[i].append(1)
    return m

def input_to_torch(l, voc):
    indexes_batch = [index_from_sentence(voc, sentence) for sentence in l]
    padded_list_index = zeroPadding(indexes_batch)
    padded_tensor_index = torch.LongTensor(padded_list_index)
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    return padded_tensor_index, lengths

In [23]:
def output_to_torch(l, voc):
    indexes_batch = [index_from_sentence(voc, sentence) for sentence in l]
    padded_list_index = zeroPadding(indexes_batch)
    padded_tensor_index = torch.LongTensor(padded_list_index)
    max_output_length = max([len(indexes) for indexes in indexes_batch])
    mask = binaryMatrix(padded_list_index)
    mask = torch.ByteTensor(mask)
    return padded_tensor_index, mask, max_output_length

In [24]:
def get_batch_pair(voc, batch_pair):
    batch_pair.sort(key = lambda x: len(x[0]), reverse = True )
    input_batch, response_batch = [], []
    for pair in batch_pair:
        input_batch.append(pair[0])
        response_batch.append(pair[1])
    
    input_tensor, length_input = input_to_torch(input_batch, voc)
    output_tensor, mask, max_length = output_to_torch(response_batch, voc)
    return input_tensor, length_input, output_tensor, mask, max_length

In [25]:
class EncoderRNN(nn.Module):
    def __init__(self, embedding, hidden_size, num_layers = 1,dropout = 0):
        super(EncoderRNN, self).__init__()
        self.num_layers = num_layers
        self.embedding = embedding
        self.hidden_size = hidden_size
        self.gru = nn.GRU(  input_size = hidden_size
                          , hidden_size = hidden_size
                          , num_layers = num_layers
                          , dropout = (0 if num_layers == 1 else dropout)
                          , bidirectional = True)
    def forward(self, input_seq, input_length, hidden = None):
        embedding = self.embedding(input_seq)
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(embedding, input_length)
        output, hidden_cell = self.gru(packed_input, hidden)
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(output)
        output = output[:,:,:self.hidden_size] + output[:,:,self.hidden_size:]
        return output, hidden_cell
    

In [26]:
class AttentionLayer(nn.Module):
    def __init__(self, hidden_size):
        super(AttentionLayer, self).__init__()
        self.hidden_size = hidden_size
        self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
        self.weight = nn.Parameter(torch.FloatTensor(hidden_size))
            
    def get_dot_score(self, hidden, encoder_outputs):
        return torch.sum(hidden*encoder_outputs, dim=2)
    
    def get_general_score(self, hidden, encoder_outputs):
        energy = self.attn(encoder_outputs)
        return torch.sum(hidden * energy, dim=2)
    
    def get_concat_score(self, hidden, encoder_outputs):
        concat = torch.cat((hidden.expand(encoder_outputs.size(0),-1,-1), encoder_outputs), dim=2)
        energy = torch.tanh(self.attn(concat))
        return torch.sum(self.weight * energy, dim=2)
                           
    def forward(self, hidden, encoder_outputs):
        attn_energy = self.get_concat_score(hidden, encoder_outputs)
        attn_energy = attn_energy.t()
        return F.softmax(attn_energy, dim=1).unsqueeze(1)

In [27]:
class AttentionDecoder(nn.Module):
    def __init__(self, embedding, hidden_size, output_size, n_layers=1, dropout = 0.1):
        super(AttentionDecoder, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))

        self.concat = nn.Linear(hidden_size*2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.attention = AttentionLayer(hidden_size)
        
    def forward(self, input_step, last_hidden, encoder_outputs):
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        rnn_output, hidden = self.gru(embedded, last_hidden)
        attention_weights = self.attention(rnn_output, encoder_outputs)
        context = attention_weights.bmm(encoder_outputs.transpose(0,1))
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input =  torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        return output, hidden

In [28]:
def maskNLLLoss(input, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(input, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [29]:
np.random.seed(42)
def train(input_variable, lengths, target_variable, embedding, encoder, decoder, encoder_optimizer, decoder_optimizer, max_target_lens
            , batch_size, clip, mask,max_length = max_length):
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)

    mask = mask.bool()
    mask = mask.to(device)

    loss = 0
    print_loss = []
    n_totals = 0

    output_encoders,  hidden_encoders = encoder(input_variable, lengths)

    input_decoders = torch.LongTensor([[sos_token for _ in range(batch_size)]])
    input_decoders = input_decoders.to(device)

    hidden_decoders = hidden_encoders[:decoder.n_layers]
    teacher_forcing = True if random.random() < teacher_forcing_rate else False
    

    if teacher_forcing:
        for t in range(max_target_lens):
            output_decoders, hidden_decoders = decoder(input_decoders, hidden_decoders, output_encoders)
            input_decoders = target_variable[t].view(1, -1)
            mask_loss, nTotal = maskNLLLoss(output_decoders, target_variable[t], mask[t])
            loss+=mask_loss
            print_loss.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_lens):
            output_decoders, hidden_encoders = decoder(input_decoders, hidden_decoders, output_encoders)
            _, topi = output_decoders.topk(1)
            input_decoders = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            input_decoders = input_decoders.to(device)
            mask_loss, nTotal = maskNLLLoss(output_decoders, target_variable[t], mask[t])
            loss += mask_loss
            print_loss.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    loss.backward()
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
    encoder_optimizer.step()
    decoder_optimizer.step()
    return sum(print_loss) / n_totals
    

In [30]:
def trainIters(model_name, voc, trimmed_pair, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers,
               decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name,):
    training_batches = [get_batch_pair(voc, [random.choice(trimmed_pair) for _ in range(batch_size)]) for _ in range(n_iteration)]
    print('initializing...')
    start_iteration = 1
    print_loss = 0
    
    print('Tranining...')
    for iteration in range(start_iteration, n_iteration +1):
        training_batch = training_batches[iteration-1]
        input_variable, lengths, target_variable, mask, max_target_lens = training_batch
        loss = train(input_variable, lengths, target_variable, embedding, encoder, decoder, encoder_optimizer, decoder_optimizer, max_target_lens
            , batch_size, clip, mask)
        print_loss += loss
        if (iteration % print_every) == 0:
            print_loss_avg = print_loss / print_every
            print(f'loss_avg at {iteration} is: {print_loss_avg}, in {100 * iteration / n_iteration } % progress complete')
            print_loss = 0
        if (iteration % save_every) == 0:
           directory = os.path.join(path_save, model_name, corpus_name, f'{encoder_n_layers}-{decoder_n_layers}_{hidden_size}')
           if not os.path.exists(directory):
               os.makedirs(directory)
           torch.save({
               'iteration': iteration,
               'encoder' : encoder.state_dict(),
               'decoder' : decoder.state_dict(),
               'encoder_optimizer': encoder_optimizer.state_dict(),
               'decoder_optimizer': decoder_optimizer.state_dict(),
               'loss' : loss,
               'voc_dict'  : voc.__dict__,
               'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))            

In [31]:
class Greedysearch_decoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(Greedysearch_decoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, input_seq, input_length, max_length):
        output_encoder, hidden_encoder = self.encoder(input_seq, input_length)
        hidden_decoder = hidden_encoder[:decoder.n_layers]
        input_decoder = torch.ones(1,1,device = device, dtype = torch.long) * sos_token
        all_tokens = torch.zeros([0], device=device, dtype = torch.long)
        all_score  = torch.zeros([0], device=device)
        for _ in range(max_length):
            output_decoder, hidden_decoder = self.decoder(input_decoder, hidden_decoder, output_encoder)
            max_score, output_index = torch.max(output_decoder, dim = 1)
            all_tokens = torch.cat((all_tokens, output_index), dim = 0)
            all_score = torch.cat((all_score, max_score), dim = 0)
            input_decoder = torch.unsqueeze(output_index, 0)
        return all_tokens, all_score

In [32]:
def evaluate(encoder, decoder, searcher, voc, sentence, max_length = max_length):
    index_sentence_list = [index_from_sentence(voc, sentence)]
    input_lengths = torch.tensor([len(index) for index in index_sentence_list])
    index_sentence = torch.LongTensor(index_sentence_list)
    input_batch = index_sentence.transpose(0,1)
    input_batch = input_batch.to(device)
    output_tokens, output_scores = searcher(input_batch, input_lengths, max_length) 
    words_decoder = [voc.index2word[index.item()] for index in output_tokens]
    return words_decoder


def predict_answer(encoder, decoder, search, voc, input_sentence):
    input_sentence = preprocessing_sentence(input_sentence)
    res_input_sentence = list()
    for word in input_sentence:
        if word in voc.word2index:
            res_input_sentence.append(word)
    words_decoder = evaluate(encoder, decoder, search, voc, res_input_sentence)
    words_decoder[:]  = [word for word in words_decoder if word not in ['PAD','EOS']]
    print(' '.join(words_decoder))

In [37]:
hidden_size = 512
encoder_n_layers = 3
decoder_n_layers = 3
dropout = 0.1
batch_size = 64
checkpoint_iter = 10000

In [38]:
embedding = nn.Embedding(voc.numword, hidden_size)
encoder = EncoderRNN(embedding, hidden_size, encoder_n_layers, dropout)
decoder = AttentionDecoder(embedding, hidden_size, voc.numword, decoder_n_layers, dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)

In [39]:
device

'cuda'

In [40]:
clip = 50.0
teacher_forcing_rate = 1.0
learning_rate = 3e-4
decoder_learning_rate = 5.0
n_iteration = 10000
print_every = 1000
save_every = 5000
path_save = './models/'

encoder.train()
decoder.train()

print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_rate)

print("Starting Training!")
trainIters(model_name, voc, trimmed_pair, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, encoder_n_layers, decoder_n_layers, path_save, n_iteration, batch_size,
           print_every, save_every, clip, model_name)

Building optimizers ...
Starting Training!
initializing...
Tranining...
loss_avg at 1000 is: 6.710489610945203, in 10.0 % progress complete
loss_avg at 2000 is: 3.949417522309355, in 20.0 % progress complete
loss_avg at 3000 is: 2.3891349722606465, in 30.0 % progress complete
loss_avg at 4000 is: 1.4062898633463614, in 40.0 % progress complete
loss_avg at 5000 is: 0.8255642938884654, in 50.0 % progress complete
loss_avg at 6000 is: 0.5283654809083402, in 60.0 % progress complete
loss_avg at 7000 is: 0.41006448695574993, in 70.0 % progress complete
loss_avg at 8000 is: 0.2963281483093786, in 80.0 % progress complete
loss_avg at 9000 is: 0.23319580083737337, in 90.0 % progress complete
loss_avg at 10000 is: 0.276209027614968, in 100.0 % progress complete


In [41]:
!cp "./models/myseq2seq/myseq2seq/3-3_512/10000_checkpoint.tar" "/content/drive/MyDrive/10000_checkpoint_3_3_512.tar"

In [47]:
sub_smp = df_filtred.sample(5)
sent_q = sub_smp['question'].tolist()
sent_a = sub_smp['responses'].tolist()

In [50]:
encoder.eval()
decoder.eval()

searcher = Greedysearch_decoder(encoder, decoder)

for i in range(len(sent_q)):
    print('Вопрос', sent_q[i])
    print('Ответ')
    predict_answer(encoder, decoder, searcher, voc, sent_q[i])
    print()

Вопрос а вы бы завели лошадь???конечно если бы били условия и деньги.
Ответ
канеш очень красивое и умное животное я бы на ней на рыбалку и охоту гонял

Вопрос какой немецкий игрок в матче с португалией забил гол, сделал пару голевых передач и стал героем дня?
Ответ
швайни я думаю лучшим игроком фильмы лучшим игроком матча игроком матча лучшим игроком матча михаэля баллака игроком матча я игроком

Вопрос если в описании дивана еврокнижки сказано "требует сборки" это в каком виде он приходит? как две половинки или как?
Ответ
вопрос некорректен смотря у кого покупаете неужели так сложно выясни все вопросы у продавца

Вопрос кто-то все же разбудил спящую собаку))) но!! это к лучшему)) а у вас что к лучшему!?)) если не секрет))
Ответ
у меня нет собаки а у вас видать проблемы воры пробрались через окно и это муж ваш его разбудил

Вопрос сегодня резко во время движения руль стал очень тугим. остановился, выключил, включил. больше не повторялось.
Ответ
гур проверь может уровень масла низкий а 