In [1]:
import pandas as pd
import numpy as np
import requests
import tqdm
import pymorphy2
from ast import literal_eval
from model.config import Config
from model.data_utils import UNK, NUM, BEGIN, END, \
    get_glove_vocab, write_vocab, load_vocab, \
    export_trimmed_glove_vectors, get_processing_word, \
    get_vocab, get_unique_column_words, correct_sentence, \
    change_letter, unk_to_normal_form, sentence_to_indices, \
    merge_context_and_reply

In [2]:
# Project configuration object: the input paths, column names and label mapping
# used in the cells below are all read from it.
# NOTE(review): the meaning of load=False is defined in model.config — confirm before changing it.
config = Config(load=False)

In [3]:
# Load the three arrays of context_id values that define the
# train / test / validation split (used further down with `isin`).
train_ids, test_ids, val_ids = (
    np.load(split_path)
    for split_path in (config.train_indices, config.test_indices, config.val_indices)
)

In [4]:
def _read_dialogue_table(path, column_names):
    """Read a headerless, tab-separated dialogue file into a DataFrame.

    The delimiter is a tab optionally preceded by exactly one space or one of
    the characters '.', '?', ',', '!'; that preceding character is consumed as
    part of the delimiter and therefore dropped from the field. Rows the
    parser cannot split consistently are skipped instead of aborting the load.

    Args:
        path: location of the file to read.
        column_names: names assigned to the parsed columns, in order.

    Returns:
        pandas.DataFrame whose columns are `column_names`.
    """
    import inspect

    read_options = dict(
        sep='[  . ? , !]?\t',
        header=None,
        # Regex separators are only supported by the python parser. Naming it
        # explicitly avoids the ParserWarning pandas emits when it falls back.
        engine='python',
    )
    if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
        # pandas >= 1.3: `error_bad_lines` is deprecated (removed in 2.0).
        # 'warn' matches the old error_bad_lines=False default of reporting
        # every skipped row.
        read_options['on_bad_lines'] = 'warn'
    else:
        # Older pandas only knows the original keyword.
        read_options['error_bad_lines'] = False

    frame = pd.read_csv(path, **read_options)
    frame.columns = column_names
    return frame


# Labelled training dialogues and the unlabelled public test dialogues.
data = _read_dialogue_table(config.path_to_train_dataframe, config.train_column_names)
public = _read_dialogue_table(config.path_to_test_dataframe, config.test_column_names)

  
  


In [5]:
# Preview the raw training rows while the utterance columns still contain text.
data.head()

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply,label,confidence
0,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу","ладно , повесь трубку",0,не могу,good,0.875352
1,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу","ладно , повесь трубку",1,"нет , звонить буду я",neutral,0.900968
2,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу","ладно , повесь трубку",2,"слушай , я не мог уйти",bad,0.88432
3,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу","ладно , повесь трубку",3,я не прекращу звонить,good,0.98253
4,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу","ладно , повесь трубку",4,я звоню им,good,0.838054


In [6]:
# Load the vocabulary from the file named in the config; it is passed to
# sentence_to_indices below.
# NOTE(review): presumably a word -> index mapping — confirm in model.data_utils.
vocab = load_vocab(config.filename_words)

In [7]:
# The file holds a single Python object wrapped in a 0-d array; `.item()` unwraps it
# (presumably a dict saved with np.save — confirm against the cell that wrote it).
# NumPy >= 1.16.3 refuses to unpickle object arrays unless allow_pickle=True is
# passed explicitly, so without it this cell raises ValueError.
# SECURITY: unpickling can execute arbitrary code — only load files produced by this project.
unk_dict = np.load(config.unk_dict, allow_pickle=True).item()

In [8]:
# Replace every utterance string with the output of sentence_to_indices
# (lists of vocabulary indices, as the preview in the next cell shows).
# NOTE: the text columns are overwritten in place; re-running this cell without
# reloading `data` would feed the index lists back into sentence_to_indices.
for text_column in ('context_2', 'context_1', 'context_0', 'reply'):
    data[text_column] = data[text_column].apply(
        lambda sentence: sentence_to_indices(sentence, vocab, unk_dict))

# Translate each textual label into the vector stored for it in config.mapping.
data['one_hot_label'] = data['label'].apply(lambda label: config.mapping[label])

# Scale each label vector by the `confidence` value of the same row. Walking the
# two columns in lockstep replaces two label-based `.loc` lookups per row, which
# were slow and would misbehave if the index ever contained duplicate labels.
data['weighted_label'] = [
    list(np.multiply(one_hot, confidence))
    for one_hot, confidence in zip(data['one_hot_label'], data['confidence'])
]

In [9]:
# Inspect the result: the utterance columns now hold index lists, and
# `one_hot_label` / `weighted_label` have been added.
data.head()

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply,label,confidence,one_hot_label,weighted_label
0,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",0,"[39653, 20692]",good,0.875352,"[0, 0, 1]","[0.0, 0.0, 0.8753516175]"
1,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",1,"[25647, 7323, 7587, 9585, 21487]",neutral,0.900968,"[0, 1, 0]","[0.0, 0.9009682112999999, 0.0]"
2,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",2,"[7289, 7323, 21487, 39653, 23581, 40946]",bad,0.88432,"[1, 0, 0]","[0.8843202145, 0.0, 0.0]"
3,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",3,"[21487, 39653, 20211, 7587]",good,0.98253,"[0, 0, 1]","[0.0, 0.0, 0.9825304673]"
4,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",4,"[21487, 9009, 37056]",good,0.838054,"[0, 0, 1]","[0.0, 0.0, 0.8380535095999999]"


In [10]:
# Encode the utterance columns of the public test frame with the same
# vocabulary and unknown-word dictionary that are used for the training frame.
for text_column in ('context_2', 'context_1', 'context_0', 'reply'):
    public[text_column] = public[text_column].apply(
        lambda sentence: sentence_to_indices(sentence, vocab, unk_dict))

In [11]:
# Inspect the encoded public test rows (this frame has no label/confidence columns).
public.head()

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply
0,138920940977,"[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[39653, 22909, 10226]",[25647],0,"[1292, 30318]"
1,138920940977,"[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[39653, 22909, 10226]",[25647],1,"[25647, 7323, 39653, 21278, 30318]"
2,138920940977,"[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[39653, 22909, 10226]",[25647],2,"[25647, 30318]"
3,138920940977,"[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[39653, 22909, 10226]",[25647],3,"[34555, 7323, 25647, 30318]"
4,138920940977,"[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[39653, 22909, 10226]",[25647],4,"[24734, 7323, 25647, 30318]"


In [12]:
# For every training row, combine the three context columns (reply excluded)
# with merge_context_and_reply. A fresh list is passed on each call, exactly as
# a list literal would be, in case the helper modifies its argument.
context_columns = ('context_2', 'context_1', 'context_0')
data['merged_contexts'] = [
    merge_context_and_reply(data, row_label, list(context_columns))
    for row_label in tqdm.tqdm(data.index)
]

100%|██████████| 97533/97533 [02:55<00:00, 556.58it/s]


In [13]:
# For every training row, call merge_context_and_reply with its default column
# selection (presumably contexts plus reply, going by the target column name —
# confirm in model.data_utils).
data['contexts_and_reply'] = list(
    map(lambda row_label: merge_context_and_reply(data, row_label),
        tqdm.tqdm(data.index))
)

100%|██████████| 97533/97533 [03:42<00:00, 437.50it/s]


In [14]:
# For every public test row, combine the three context columns (reply excluded)
# with merge_context_and_reply, collecting the results in row order.
public_merged_contexts = []
for row_label in tqdm.tqdm(public.index):
    public_merged_contexts.append(
        merge_context_and_reply(public, row_label, ['context_2', 'context_1', 'context_0']))
public["merged_contexts"] = public_merged_contexts

100%|██████████| 9968/9968 [00:16<00:00, 592.55it/s]


In [15]:
# For every public test row, call merge_context_and_reply with its default
# column selection (presumably contexts plus reply — confirm in model.data_utils).
public_rows = tqdm.tqdm(public.index)
public['contexts_and_reply'] = [
    merge_context_and_reply(public, row_label) for row_label in public_rows
]

100%|██████████| 9968/9968 [00:22<00:00, 449.69it/s]


In [16]:
# Check that `merged_contexts` and `contexts_and_reply` were added to the training frame.
data.head()

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply,label,confidence,one_hot_label,weighted_label,merged_contexts,contexts_and_reply
0,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",0,"[39653, 20692]",good,0.875352,"[0, 0, 1]","[0.0, 0.0, 0.8753516175]","[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[18081, 40644, 13317, 10226, 2857, 40644, 6788..."
1,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",1,"[25647, 7323, 7587, 9585, 21487]",neutral,0.900968,"[0, 1, 0]","[0.0, 0.9009682112999999, 0.0]","[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[18081, 40644, 13317, 10226, 2857, 40644, 6788..."
2,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",2,"[7289, 7323, 21487, 39653, 23581, 40946]",bad,0.88432,"[1, 0, 0]","[0.8843202145, 0.0, 0.0]","[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[18081, 40644, 13317, 10226, 2857, 40644, 6788..."
3,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",3,"[21487, 39653, 20211, 7587]",good,0.98253,"[0, 0, 1]","[0.0, 0.0, 0.9825304673]","[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[18081, 40644, 13317, 10226, 2857, 40644, 6788..."
4,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",4,"[21487, 9009, 37056]",good,0.838054,"[0, 0, 1]","[0.0, 0.0, 0.8380535095999999]","[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[18081, 40644, 13317, 10226, 2857, 40644, 6788..."


In [17]:
# Check that `merged_contexts` and `contexts_and_reply` were added to the public test frame.
public.head()

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply,merged_contexts,contexts_and_reply
0,138920940977,"[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[39653, 22909, 10226]",[25647],0,"[1292, 30318]","[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[24350, 7323, 21487, 19771, 15667, 7323, 27441..."
1,138920940977,"[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[39653, 22909, 10226]",[25647],1,"[25647, 7323, 39653, 21278, 30318]","[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[24350, 7323, 21487, 19771, 15667, 7323, 27441..."
2,138920940977,"[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[39653, 22909, 10226]",[25647],2,"[25647, 30318]","[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[24350, 7323, 21487, 19771, 15667, 7323, 27441..."
3,138920940977,"[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[39653, 22909, 10226]",[25647],3,"[34555, 7323, 25647, 30318]","[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[24350, 7323, 21487, 19771, 15667, 7323, 27441..."
4,138920940977,"[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[39653, 22909, 10226]",[25647],4,"[24734, 7323, 25647, 30318]","[24350, 7323, 21487, 19771, 15667, 7323, 27441...","[24350, 7323, 21487, 19771, 15667, 7323, 27441..."


In [18]:
# Persist the fully preprocessed training frame (row index omitted).
# NOTE: CSV has no list type, so the list-valued columns are written as their
# string form and must be parsed back on load (e.g. with ast.literal_eval).
data.to_csv(config.path_to_preprocessed_train, index=False)

In [19]:
# Persist the preprocessed public test frame (row index omitted).
# NOTE: CSV has no list type, so the list-valued columns are written as their
# string form and must be parsed back on load (e.g. with ast.literal_eval).
public.to_csv(config.path_to_preprocessed_test, index=False)

In [20]:
# Partition the preprocessed rows by dialogue: a row belongs to a split when
# its context_id is listed in that split's id array, so all replies of one
# context stay together.
in_split = data['context_id'].isin
train = data.loc[in_split(train_ids)]
test = data.loc[in_split(test_ids)]
val = data.loc[in_split(val_ids)]

In [21]:
# Save the training split (row index omitted).
# NOTE(review): this output path is hard-coded, unlike the other paths in this
# notebook, which come from `config`.
train.to_csv("../data/train_splitted.csv", index=False)

In [22]:
# Save the held-out test split (row index omitted).
# NOTE(review): this output path is hard-coded, unlike the other paths in this
# notebook, which come from `config`.
test.to_csv("../data/test_splitted.csv", index=False)

In [23]:
# Save the validation split (row index omitted).
# NOTE(review): this output path is hard-coded, unlike the other paths in this
# notebook, which come from `config`.
val.to_csv("../data/val_splitted.csv", index=False)

In [24]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply,label,confidence,one_hot_label,weighted_label,merged_contexts,contexts_and_reply
0,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",0,"[39653, 20692]",good,0.875352,"[0, 0, 1]","[0.0, 0.0, 0.8753516175]","[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[18081, 40644, 13317, 10226, 2857, 40644, 6788..."
1,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",1,"[25647, 7323, 7587, 9585, 21487]",neutral,0.900968,"[0, 1, 0]","[0.0, 0.9009682112999999, 0.0]","[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[18081, 40644, 13317, 10226, 2857, 40644, 6788..."
2,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",2,"[7289, 7323, 21487, 39653, 23581, 40946]",bad,0.88432,"[1, 0, 0]","[0.8843202145, 0.0, 0.0]","[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[18081, 40644, 13317, 10226, 2857, 40644, 6788..."
3,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",3,"[21487, 39653, 20211, 7587]",good,0.98253,"[0, 0, 1]","[0.0, 0.0, 0.9825304673]","[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[18081, 40644, 13317, 10226, 2857, 40644, 6788..."
4,22579918886,"[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[30578, 7323, 21487, 7226, 24834]","[34273, 7323, 7921, 38444]",4,"[21487, 9009, 37056]",good,0.838054,"[0, 0, 1]","[0.0, 0.0, 0.8380535095999999]","[18081, 40644, 13317, 10226, 2857, 40644, 6788...","[18081, 40644, 13317, 10226, 2857, 40644, 6788..."


In [25]:
# Preview the first rows of the test split (the original row labels of `data` are kept).
test.head()

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply,label,confidence,one_hot_label,weighted_label,merged_contexts,contexts_and_reply
24,127768564286,"[32116, 28301, 38394, 17608, 35699, 17608, 281...","[21487, 12780, 20040, 9510, 7323, 20040, 37590...","[2152, 6985, 12005, 37776]",0,"[28301, 2152, 14142, 16293, 7323, 23617]",good,0.867679,"[0, 0, 1]","[0.0, 0.0, 0.8676785568000001]","[32116, 28301, 38394, 17608, 35699, 17608, 281...","[32116, 28301, 38394, 17608, 35699, 17608, 281..."
25,127768564286,"[32116, 28301, 38394, 17608, 35699, 17608, 281...","[21487, 12780, 20040, 9510, 7323, 20040, 37590...","[2152, 6985, 12005, 37776]",1,"[2152, 14142]",neutral,0.653608,"[0, 1, 0]","[0.0, 0.6536082455, 0.0]","[32116, 28301, 38394, 17608, 35699, 17608, 281...","[32116, 28301, 38394, 17608, 35699, 17608, 281..."
26,127768564286,"[32116, 28301, 38394, 17608, 35699, 17608, 281...","[21487, 12780, 20040, 9510, 7323, 20040, 37590...","[2152, 6985, 12005, 37776]",2,"[10226, 21487, 12780, 17608, 6956, 17608]",good,0.903552,"[0, 0, 1]","[0.0, 0.0, 0.9035521986]","[32116, 28301, 38394, 17608, 35699, 17608, 281...","[32116, 28301, 38394, 17608, 35699, 17608, 281..."
27,127768564286,"[32116, 28301, 38394, 17608, 35699, 17608, 281...","[21487, 12780, 20040, 9510, 7323, 20040, 37590...","[2152, 6985, 12005, 37776]",3,"[5603, 2152, 14142, 17072, 12005]",bad,0.94458,"[1, 0, 0]","[0.9445797611, 0.0, 0.0]","[32116, 28301, 38394, 17608, 35699, 17608, 281...","[32116, 28301, 38394, 17608, 35699, 17608, 281..."
28,127768564286,"[32116, 28301, 38394, 17608, 35699, 17608, 281...","[21487, 12780, 20040, 9510, 7323, 20040, 37590...","[2152, 6985, 12005, 37776]",4,"[4617, 7323, 5603, 2152, 14142]",good,0.87135,"[0, 0, 1]","[0.0, 0.0, 0.8713497773000001]","[32116, 28301, 38394, 17608, 35699, 17608, 281...","[32116, 28301, 38394, 17608, 35699, 17608, 281..."


In [26]:
# Preview the first rows of the validation split (the original row labels of `data` are kept).
val.head()

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply,label,confidence,one_hot_label,weighted_label,merged_contexts,contexts_and_reply
121,521831731666,[27854],"[19015, 9575, 2178]","[7699, 22909, 21012, 12703, 5473]",0,"[11172, 25445, 39653, 24279]",neutral,0.936427,"[0, 1, 0]","[0.0, 0.9364273958, 0.0]","[27854, 19015, 9575, 2178, 7699, 22909, 21012,...","[27854, 19015, 9575, 2178, 7699, 22909, 21012,..."
122,521831731666,[27854],"[19015, 9575, 2178]","[7699, 22909, 21012, 12703, 5473]",1,"[25647, 7583]",good,0.586733,"[0, 0, 1]","[0.0, 0.0, 0.5867328328]","[27854, 19015, 9575, 2178, 7699, 22909, 21012,...","[27854, 19015, 9575, 2178, 7699, 22909, 21012,..."
123,521831731666,[27854],"[19015, 9575, 2178]","[7699, 22909, 21012, 12703, 5473]",2,"[25445, 22909, 1216, 5473, 7323, 5603, 25445, ...",good,0.958358,"[0, 0, 1]","[0.0, 0.0, 0.9583579443000001]","[27854, 19015, 9575, 2178, 7699, 22909, 21012,...","[27854, 19015, 9575, 2178, 7699, 22909, 21012,..."
124,521831731666,[27854],"[19015, 9575, 2178]","[7699, 22909, 21012, 12703, 5473]",3,"[5566, 27854, 35304]",bad,0.965069,"[1, 0, 0]","[0.9650693929, 0.0, 0.0]","[27854, 19015, 9575, 2178, 7699, 22909, 21012,...","[27854, 19015, 9575, 2178, 7699, 22909, 21012,..."
125,521831731666,[27854],"[19015, 9575, 2178]","[7699, 22909, 21012, 12703, 5473]",4,"[25647, 7323, 39653, 5473]",good,0.865941,"[0, 0, 1]","[0.0, 0.0, 0.86594092]","[27854, 19015, 9575, 2178, 7699, 22909, 21012,...","[27854, 19015, 9575, 2178, 7699, 22909, 21012,..."
