# Data preparation
1. Trash cleaning
2. Deduplication

In [1]:
import re

def format_string(string: str) -> str:
    string = string.strip()
    if len(string) >= 2 and string[-1] == "." and string[-2] != "." or string[-1] == '/':
        string = string[:-1]
        string = string.strip()
    string = re.sub(r"[ ]+(?=(\.|\?|\!|,))", "", string)
    string = re.sub(r"(\.){4,}", "...", string)
    string = re.sub(r"[ ]{2,}", " ", string)
    string = re.sub(r"(?<=\\w)\\.\\.(?=(\b|$| ))", "...", string)
    string = re.sub(r"\.(?=(\?|\!|\)))", "", string)
    string = re.sub(r"(?<=(\?|\!))( |\.)", "", string)
    string = re.sub(r"(?<=\w(\.|\?|\!))(?=\w)", " ", string)
    return string

In [2]:
import pandas as pd

txt_df = pd.read_csv('data/train.txt', sep='\t', names=['question', 'answer'])
txt_df = txt_df[~txt_df['question'].str.len() < 15]

txt_df = txt_df[~txt_df['question'].isna()]
txt_df = txt_df[~txt_df['answer'].isna()]

In [3]:
%%time
import json

json_data = []
with open('data/qa_data.jsonl') as f:
    for line in f:
        json_data.append(json.loads(line))

CPU times: user 21.3 s, sys: 1.5 s, total: 22.8 s
Wall time: 22.9 s


In [18]:
%%time
json_df = pd.DataFrame(json_data)
json_df = json_df[~json_df['question'].str.len() < 15]
# json_df = json_df.explode('responses')
json_df = json_df.groupby('question', as_index=False).head(1)

json_df = json_df[json_df['responses'].str.len() > 0]

json_df['responses'] = json_df['responses'].apply(lambda x: x[0])

json_df = json_df[~json_df['responses'].isna()]
json_df = json_df[~json_df['question'].isna()]

CPU times: user 8.73 s, sys: 314 ms, total: 9.04 s
Wall time: 9.08 s


In [19]:
from tqdm.auto import tqdm

tqdm.pandas()

In [20]:
json_df['question'] = json_df['question'].progress_apply(lambda x: format_string(x))
json_df['responses'] = json_df['responses'].progress_apply(lambda x: format_string(x))

  0%|          | 0/710162 [00:00<?, ?it/s]

  0%|          | 0/710162 [00:00<?, ?it/s]

In [21]:
txt_df['question'] = txt_df['question'].progress_apply(lambda x: format_string(x))
txt_df['answer'] = txt_df['answer'].progress_apply(lambda x: format_string(x))

  0%|          | 0/5880497 [00:00<?, ?it/s]

  0%|          | 0/5880497 [00:00<?, ?it/s]

In [22]:
txt_df = txt_df[txt_df['question'].str.len() < 60]
txt_df = txt_df[txt_df['answer'].str.len() < 60]
json_df = json_df[json_df['question'].str.len() < 60]
json_df = json_df[json_df['responses'].str.len() < 60]

In [23]:
def up_first_letter(string):
    if string[0].isalpha():
        return string[0].upper() + string[1:]
    return string

txt_df['question'] = txt_df['question'].progress_apply(up_first_letter)
txt_df['answer'] = txt_df['answer'].progress_apply(up_first_letter)
json_df['question'] = json_df['question'].progress_apply(up_first_letter)
json_df['responses'] = json_df['responses'].progress_apply(up_first_letter)

  0%|          | 0/5173187 [00:00<?, ?it/s]

  0%|          | 0/5173187 [00:00<?, ?it/s]

  0%|          | 0/258247 [00:00<?, ?it/s]

  0%|          | 0/258247 [00:00<?, ?it/s]

In [24]:
json_df = json_df[~json_df['responses'].isna()]
json_df = json_df[~json_df['question'].isna()]
txt_df = txt_df[~txt_df['question'].isna()]
txt_df = txt_df[~txt_df['answer'].isna()]

In [25]:
json_df.columns = ['question', 'category', 'answer']
total = pd.concat([json_df[['question', 'answer']], txt_df[['question', 'answer']]])

In [26]:
from collections import Counter

c = Counter(total['question'])
trash = [el[0] for el in c.most_common()[:30]]

In [40]:
emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u4e00-\u9fff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  
        u"\u3030"
                      "]+", re.UNICODE)

def has_trash(text):
    if re.search(emoj, text):
        return True
    if re.search(r"""[^А-яё\d\s:\?\!:;'"\.\,\\\/\)\(\]\[-–-_]""", text):
        return True
    return False

total = total[~total['question'].progress_apply(has_trash)]
total = total[~total['answer'].progress_apply(has_trash)]

  0%|          | 0/5347698 [00:00<?, ?it/s]

  0%|          | 0/5317929 [00:00<?, ?it/s]

In [41]:
# total.to_csv('data/seq2seq_data.csv', index=False, sep='\t')

In [42]:
%%time
total.to_csv('data/seq2seq_data.zip', index=False, sep='\t', compression='gzip')

In [None]:
ंहხგ

In [44]:
total[total['question'].str.contains('ं')]

Unnamed: 0,question,answer


In [45]:
total[total['answer'].str.contains('ं')]

Unnamed: 0,question,answer
3325292,Что в голове у умной женщины?,त्वां राम कामयामि - вот это
3364087,"Как будет по-бенгальски ""я тебя люблю""?",Вот так - संस्कृतम्
3835724,На каком языке я тебя люблю звучит особенно кр...,मैं तुमसे प्यार करता हूँ (хинди)


In [50]:
re.search(r"""[^А-яё\d\s:\?\!:;'"\.\,\\\/\)\(\]\[-–-_]""", 'मैं तुमसे प्यार करता हूँ (хинди)')

In [None]:
def has_trash(text):
    if re.search(emoj, text):
        return True
    if re.search(r"""[^А-яё\d\s:\?\!:;'"\.\,\\\/\)\(\]\[-–-_]""", text):
        return True
    return False

total = total[~total['question'].progress_apply(has_trash)]

In [None]:
total = total[~total['answer'].progress_apply(has_trash)]

In [51]:
total

Unnamed: 0,question,answer
0,Долго ли идут деньги с яндексденег на карту visa?,Нет. прорыв 35 ;)
3,В чем отличие медитации от йоги?,Букв в йоге меньше
4,Когда начнут линейку фильмов про лигу чемпионов?,А не фильм? жалко... а я то думал - хорошая ко...
5,А цветы нуждаются в оценке проходящего мимо?),Только для этого и растут
6,Что это значит?,Грива наверное растёт)))
...,...,...
5880492,Как вы заманили мужчину? ваш сикрет?,Он сам пришёл - добровольно)
5880493,Что может быть пиком эмоционального напряжения...,"Поход в горы, вернее обратно"
5880494,Каким законом мы должны жить чтобы бог был дов...,Так его же и законами. вы что не знали?
5880495,А. ты много. фаетазируешь?:):):),Много и даже претворяю фантазии в жизнь)
