# Requirements

In [1]:
!pip install razdel

Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0


# Mount colab

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!mkdir models

# Imports

In [4]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
import os
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math
from razdel import tokenize
import gdown
import pandas as pd
import json
import random
import copy
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

%matplotlib inline

# Download train data from Google Drive

In [5]:
gdown.download(url='https://drive.google.com/uc?id=1iFPkClVHzYC2werYdzIly38EOvP4HGFd', output='qa_data.json', quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1iFPkClVHzYC2werYdzIly38EOvP4HGFd
To: /content/qa_data.json
100%|██████████| 1.31G/1.31G [00:05<00:00, 254MB/s]


'qa_data.json'

# Build functions for load data to memory and load data

In [6]:
def load_jsonl_data(path_to_jsonl: str) -> pd.DataFrame:
    with open(path_to_jsonl) as f:
        data = [json.loads(line) for line in f]
    print(f'Loaded {len(data)} examples')
    result_dict = {}
    for i in range(len(data)):
        result_dict[i] = data[i]
    df = pd.DataFrame().from_dict(result_dict, orient='index')
    del result_dict
    del data
    return df

In [7]:
def filter_df_jsonl(df: pd.DataFrame) -> pd.DataFrame:
    df['responses_len'] = df['responses'].apply(lambda x: len(x))
    df['question_len'] = df['question'].apply(lambda x: len(x.split(' ')))
    
    df = df[(df['responses_len'] >= 2)&(df['question_len'] >= 10)]

    del df['question_len']
    del df['responses_len']

    def select_max_response(responses: list) -> str:
        max_len = 0
        index_for_return = None
        for i in range(len(responses)):
            if len(responses[i]) > max_len:
                max_len = len(responses[i])
                index_for_return = copy.copy(i)
        
        return responses[index_for_return]
    
    df['responses'] = df['responses'].apply(select_max_response)
    
    return df

In [8]:
df = load_jsonl_data('qa_data.json')

Loaded 2808811 examples


In [9]:
print(len(df))

2808811


In [10]:
df_filtred = filter_df_jsonl(df)
del df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
df_filtred.tail(10)

Unnamed: 0,question,category,responses
2808782,"как пользоваьбся шпорой, куда ее сувать если з...",Образование,пиши на линейке... деревянной.. карандашом.. в...
2808783,"хочу приобрести крысу , но у меня аллергия , м...","Животные, Растения",вот лысая: но не думаю что у вас алергия ...
2808785,как называется песня я только помню что она бы...,Искусство и Культура,new world sound &amp; thomas newson — flute j...
2808788,"*** почему ...многие не признаются, что у них ...","Знакомства, Любовь, Отношения","ну я лично понимаю, что настроил себе иллюзий...."
2808790,удар если удар составляет 500 кг. при росте 18...,Спорт,"смотря чем? если кувалдой кг на 20, то да, а е..."
2808795,"вас жестоко передавали те, кому вы верили? и ч...","Знакомства, Любовь, Отношения","сделала и сделала. главное, что чел этого не з..."
2808797,в каком месте колоть язык?? ? главное в вены н...,Красота и Здоровье,"идиотизм, а если потеряете дар речи, повредив ..."
2808798,"можно ли завести рыбок гуппи, если у тебя алле...","Животные, Растения",вообще гуппи едят корм но раз аллергия можно д...
2808802,"можно ли использовать реплику чатского ""а судь...",Искусство и Культура,"можно. там у островского, насколько помню, одн..."
2808810,какой вам начальник больше подойдёт?глупый - н...,"Работа, Карьера","глупому начальнику можно что угодно ""втереть"",..."


In [12]:
len(df_filtred)

841064

In [13]:
# Выбираем небольшой набор данных случайным образом чтобы проверить что модель обучается
df_filtred = df_filtred.sample(10000)

In [14]:
def preprocessing_sentence(sentence: str) -> list:
    s = sentence.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^0-9a-zA-Zа-яёА-ЯЁ]+", r" ", s)
    tokens = list(tokenize(s))
    res_sent = [_.text for _ in tokens]
    return res_sent

In [15]:
questions = df_filtred['question'].apply(preprocessing_sentence).tolist()
responses = df_filtred['responses'].apply(preprocessing_sentence).tolist()

In [16]:
pad_token = 0
sos_token = 1
eos_token = 2

word2index = {}
word2count = {}
index2word = {pad_token:'PAD', sos_token:'SOS', eos_token : 'EOS'}
numword = 3

for sent in questions:
    for word in sent:
        if word not in word2index:
            word2index[word] = numword
            word2count[word] = 1
            index2word[numword] = word
            numword += 1
        else:
            word2count[word] += 1


for sent in responses:
    for word in sent:
        if word not in word2index:
            word2index[word] = numword
            word2count[word] = 1
            index2word[numword] = word
            numword += 1
        else:
            word2count[word] += 1

In [17]:
from keras.preprocessing.sequence import pad_sequences
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [18]:
class SeqDataset(Dataset):
    def __init__(self, questions, responses, word2index, word2count, index2word, n_words, max_len):
        self.questions = questions
        self.responses = responses
        self.word2index = word2index
        self.word2count = word2count
        self.index2word = index2word
        self.n_words = n_words
        self.max_len = max_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        q = self.questions[idx]
        r = self.responses[idx]

        qt = [sos_token] + [self.word2index[word] for word in q]
        qt = pad_sequences([qt], maxlen=self.max_len-1, padding='post', value=pad_token)
        
        rt = [sos_token] + [self.word2index[word] for word in r]
        rt = pad_sequences([rt], maxlen=self.max_len-1, padding='post', value=pad_token)
        
        qt = qt[0]
        rt = rt[0]

        qt = np.append(qt, eos_token)
        rt = np.append(rt, eos_token)

        qt = torch.tensor(qt, dtype=torch.long, device=device)
        rt = torch.tensor(rt, dtype=torch.long, device=device)

        return (qt, rt)

In [74]:
max_len = 30

In [75]:
mydata = SeqDataset(questions=questions,
                    responses=responses,
                    word2index=word2index,
                    word2count=word2count,
                    index2word=index2word,
                    n_words=numword,
                    max_len=max_len)

In [76]:
batch_size = 16

In [77]:
mydata[500]

(tensor([   1,  219,  429,   97,   13, 2422,   79, 3145,  101, 3146,   18, 2847,
          163, 2848,   20, 3147,  949,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    2], device='cuda:0'),
 tensor([    1,    18,   128,   158,    69, 29611,   983, 11382,   780,   158,
          8794,    26, 29612,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     2],
        device='cuda:0'))

In [78]:
train_iterator = DataLoader(mydata, batch_size=batch_size, shuffle=True)

In [79]:
class Encoder(nn.Module):
    def __init__(self, hidden_size, number_of_words, num_layers = 1):
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        self.embedding = nn.Embedding(number_of_words, hidden_size, padding_idx=pad_token)
        self.hidden_size = hidden_size
        self.gru = nn.GRU(input_size = hidden_size,
                          hidden_size = hidden_size,
                          num_layers = num_layers,
                          dropout = 0.2,
                          bidirectional = False)
        
    def forward(self, input_seq, hidden = None):
        embedding = self.embedding(input_seq)
        output, hidden_cell = self.gru(embedding, hidden)
        return output, hidden_cell

In [80]:
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(output_size, hidden_size, padding_idx=pad_token)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size,
                          hidden_size,
                          n_layers,
                          dropout=0.1)

        self.linear = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, input_step, last_hidden, encoder_outputs):
        embedded = self.embedding(input_step)
        rnn_output, hidden = self.gru(embedded, last_hidden)
        prediction = self.linear(rnn_output)
        return prediction, hidden

In [87]:
def train_step(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size, max_len):

    #print('input_variable', input_variable)
    #print('input_variable', input_variable.shape)


    #print('target_variable', target_variable)
    #print('target_variable', target_variable.shape)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_variable = input_variable.to(device)
    target_variable = target_variable.to(device)

    output_encoders,  hidden_encoders = encoder(input_variable)
    input_decoders = torch.LongTensor([[sos_token for _ in range(batch_size)]])
    input_decoders = input_decoders.to(device)

    hidden_decoders = hidden_encoders[:decoder.n_layers]


    teacher_forcing_ratio = 0.5

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        for t in range(max_len):
            output_decoders, hidden_decoders = decoder(input_decoders, hidden_decoders, output_encoders)
            input_decoders = target_variable[t].view(1, -1)
            tar = target_variable[t]
            loss = criterion(torch.squeeze(output_decoders), tar)

    else:
        for t in range(max_len):
            output_decoders, hidden_decoders = decoder(input_decoders, hidden_decoders, output_encoders)
            tar = target_variable[t]
            output_decoders = torch.squeeze(output_decoders)
            loss = criterion(torch.squeeze(output_decoders), tar)
            top1 = F.softmax(output_decoders)
            top1 = top1.argmax(1)
            input_decoders = top1.view(1, -1)


    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss

In [88]:
hidden_size = 128
encoder_n_layers = 3
decoder_n_layers = 3
dropout = 0.1
checkpoint_iter = 10000

start_iteration = 1
print_loss = 0

encoder = Encoder(hidden_size=hidden_size, number_of_words=numword, num_layers=encoder_n_layers)
decoder = Decoder(hidden_size=hidden_size, output_size=numword, n_layers=decoder_n_layers)
encoder = encoder.to(device)
decoder = decoder.to(device)

criterion = nn.CrossEntropyLoss()

learning_rate = 0.001
decoder_learning_rate = 2.0
n_iteration = 10000
print_every = 1000
save_every = 5000
path_save = './models/'

encoder.train()
decoder.train()

print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_rate)

Building optimizers ...


In [89]:
import tqdm

In [91]:
num_epoh = 10
all_iter = 0

for ep in tqdm.tqdm(range(num_epoh)):
#for ep in range(num_epoh):
    for iteration, batch in enumerate(train_iterator):
        loss = train_step(batch[0].transpose(0, 1), batch[1].transpose(0, 1), encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, batch_size, max_len)
        print_loss += loss
        all_iter += 1

    print_loss_avg = print_loss / all_iter
    print(f'end of {ep} epoh')
    print(f'loss_avg at {iteration} is: {print_loss_avg}')
    print_loss = 0
    all_iter = 0

 10%|█         | 1/10 [00:24<03:36, 24.10s/it]

end of 0 epoh
loss_avg at 624 is: 3.126020908355713


 20%|██        | 2/10 [00:48<03:13, 24.15s/it]

end of 1 epoh
loss_avg at 624 is: 2.9536900520324707


 30%|███       | 3/10 [01:12<02:48, 24.06s/it]

end of 2 epoh
loss_avg at 624 is: 2.9283530712127686


 40%|████      | 4/10 [01:36<02:24, 24.00s/it]

end of 3 epoh
loss_avg at 624 is: 2.877788543701172


 50%|█████     | 5/10 [02:00<01:59, 23.97s/it]

end of 4 epoh
loss_avg at 624 is: 2.843484878540039


 60%|██████    | 6/10 [02:23<01:35, 23.93s/it]

end of 5 epoh
loss_avg at 624 is: 2.8436455726623535


 70%|███████   | 7/10 [02:47<01:11, 23.84s/it]

end of 6 epoh
loss_avg at 624 is: 2.828920841217041


 80%|████████  | 8/10 [03:11<00:47, 23.83s/it]

end of 7 epoh
loss_avg at 624 is: 2.824098587036133


 90%|█████████ | 9/10 [03:35<00:23, 23.84s/it]

end of 8 epoh
loss_avg at 624 is: 2.8136589527130127


100%|██████████| 10/10 [03:59<00:00, 23.91s/it]

end of 9 epoh
loss_avg at 624 is: 2.817549228668213





In [92]:
sub_smp = df_filtred.sample(5)
sent_q = sub_smp['question'].tolist()
sent_a = sub_smp['responses'].tolist()

In [93]:
sent_q[0]

'как в кратчайшие сроки научиться садиться на ...шпагат ? ))...'

In [94]:
def predict_answer(encoder, decoder, input_sentence, word2index, max_len, index2word):
    input_sentence = preprocessing_sentence(input_sentence)
    res_input_sentence = list()
    for word in input_sentence:
        if word in word2index:
            res_input_sentence.append(word)
    print(res_input_sentence)
    sent = [sos_token] + [word2index[word] for word in res_input_sentence]
    sent = pad_sequences([sent], maxlen=max_len-1, padding='post', value=pad_token)
    sent = sent[0]
    sent = np.append(sent, eos_token)
    print(sent)
    sent = torch.tensor([sent], dtype=torch.long, device=device)
    sent = sent.transpose(0, 1)
    input_batch = sent.to(device)

    output_encoder, hidden_encoder = encoder(input_batch)
    hidden_decoder = hidden_encoder[:decoder.n_layers]
    input_decoder = torch.LongTensor([[sos_token for _ in range(1)]]).to(device)

    all_tokens = torch.zeros([0], device=device, dtype = torch.long)

    for _ in range(max_len):
            output_decoder, hidden_decoder = decoder(input_decoder, hidden_decoder, output_encoder)
            output_decoder = torch.squeeze(output_decoder)
            output_decoder = F.softmax(output_decoder)
            output_index = torch.argmax(output_decoder)
            output_index = torch.unsqueeze(output_index, 0)
            all_tokens = torch.cat((all_tokens, output_index), dim = 0)
            input_decoder = torch.unsqueeze(output_index, 0)

    words_decoder = [index2word[index.item()] for index in all_tokens]

    print(words_decoder)



In [95]:
predict_answer(encoder, decoder, 'как в кратчайшие сроки научиться садиться на ...шпагат', word2index, max_len, index2word)

['как', 'в', 'кратчайшие', 'сроки', 'научиться', 'садиться', 'на', 'шпагат']
[    1    16    20 25104 25105   259 25106    18 16260     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     2]
['PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']


