In [1]:
from pathlib import Path

import pandas as pd

In [52]:
seed = 24666 # 23 42 24666
init_df_path = Path(f"../../data/classification/baseline/train_{seed}.csv")
output_folder = Path(f"../../data/classification/synonyms")
text_column = "Текст"
label_column = "Тональность"

# Base Synonyms

In [40]:
# !pip install git+https://github.com/ahmados/rusynonyms.git

In [41]:
# !pip install spacy==3.1.3

In [53]:
from spacy.lang.ru import Russian

import random
from ru_synonyms import AntonymsGraph, SynonymsGraph

nlp = Russian()
# Initialize both synonyms and antonyms graph
sg = SynonymsGraph()
ag = AntonymsGraph()

In [54]:
def change_random_word(text):
    tokenized = [token.text for token in nlp(text) if not token.is_punct]
    
    if len(tokenized) == 0:
        return "No luck"
    
    random_word = random.choice(range(len(tokenized)))
    if sg.is_in_dictionary(tokenized[random_word]):
        synonyms = list(sg.get_list(tokenized[random_word]))
        if len(synonyms):
            random_word_syn = random.choice(synonyms)
            tokenized[random_word] = random_word_syn
            return ' '.join(tokenized)
    else:
        list_copy = [i for i in range(len(tokenized))]      
        random.shuffle(list_copy)
        for token in list_copy:
            if sg.is_in_dictionary(tokenized[token]):
                synonyms = list(sg.get_list(tokenized[token]))
                if len(synonyms):
                    word_syn = random.choice(synonyms)
                    tokenized[token] = word_syn
                    return ' '.join(tokenized)
    return "No luck"

In [55]:
df = pd.read_csv(init_df_path)
print(df.shape)

df_done = pd.DataFrame()

texts, labels = [], []
texts.extend(df[text_column].tolist())
labels.extend(df[label_column].tolist())

fails = 0
for i, raw in df.iterrows():
    
    text, label = raw[text_column], raw[label_column]
    new_text = change_random_word(text)
    
    if new_text == "No luck":
        fails += 1
        continue
        
    texts.append(new_text)
    labels.append(label)
    
print(f"Failed {fails} times.")
print(len(texts), len(labels))

(11668, 4)
Failed 1725 times.
21611 21611


# W2V Synonyms

In [29]:
# !wget http://vectors.nlpl.eu/repository/20/185.zip
# !unzip -d 180/ 185.zip
# !wget https://raw.githubusercontent.com/akutuzov/universal-pos-tags/4653e8a9154e93fe2f417c7fdb7a357b7d6ce333/ru-rnc.map

In [56]:
mapping = {}

for line in open('ru-rnc.map'):
    ms, ud = line.strip('\n').split()
    mapping[ms] = ud

In [57]:
import re

import gensim
from pymystem3 import Mystem
from thefuzz import fuzz

m = Mystem()
model = gensim.models.KeyedVectors.load_word2vec_format('180/model.bin', binary=True)

In [58]:
def normalize_mystem_wo_filter(text):
    tokens = []
    norm_words = m.analyze(text)
    
    for norm_word in norm_words:
        
        if 'analysis' not in norm_word and 'text' in norm_word:
            lemma = norm_word["text"]
            if lemma not in [' ', '\n']:
                pos = 'NUMPUNCT'
                tokens.append(lemma+'_'+pos)
            
        elif not len(norm_word['analysis']):
            lemma = norm_word['text']
            pos = 'UNKN'
            tokens.append(lemma+'_'+pos)
        else:
            for w in norm_word["analysis"]:
                if "lex" in w:
                    lemma = w["lex"].lower().strip()
                    pos = w["gr"].split(',')[0]
                    pos = pos.split('=')[0].strip()
                    pos = mapping[pos]
                    tokens.append(lemma+'_'+pos)
                else:
                    lemma = w["text"]
                    pos = 'UNKN'
                    tokens.append(lemma+'_'+pos)

    return tokens

In [59]:
def change_random_word_w2v(text):
    
    tokenized_normalized = normalize_mystem_wo_filter(text)
    num = random.choice(range(len(tokenized_normalized)))
    list_copy = [i for i in range(len(tokenized_normalized)) if '_NUMPUNCT' not in tokenized_normalized[i]]      
    random.shuffle(list_copy)

    i = 0
    for token in list_copy:

        if i != num:

            if tokenized_normalized[token] in model.key_to_index:
                synonyms = model.most_similar(tokenized_normalized[token])
                synonyms = [syn[0] for syn in synonyms \
                            if fuzz.token_sort_ratio(syn[0], tokenized_normalized[token]) < 100]

                if len(synonyms):
                    word_syn = random.choice(synonyms)                                
                    tokenized_normalized[token] = word_syn
                    tokenized_normalized = [re.sub("_.+", "", w) for w in tokenized_normalized]
                    i +=1
        if i > 0:            
            return " ".join(tokenized_normalized)
    return "No luck"

In [60]:
fails = 0
for i, raw in df.iterrows():
    
    text, label = raw[text_column], raw[label_column]
    new_text = change_random_word_w2v(text)
    
    if new_text == "No luck":
        fails += 1
        continue
        
    texts.append(new_text)
    labels.append(label)
    
print(f"Failed {fails} times.")
print(len(texts), len(labels))

Failed 2669 times.
30610 30610


In [61]:
df_done[text_column] = texts
df_done[label_column] = labels

print(df_done.shape)
df_done.tail(10)

(30610, 2)


Unnamed: 0,Текст,Тональность
30600,все -всегд оперативно и качественно,Положительный
30601,нужно окно помыть .,Предложение
30602,улучшать качество расчистка .,Предложение
30603,"пол у мы страшный , скоро быть запинаться .",Отрицательный
30604,вообще не организовать данный процесс \n,Отрицательный
30605,"спасибо , что предоставлять троллейбус \n",Положительный
30606,"в офис долинск , ужасный работа клиринговый к...",Отрицательный
30607,"сервис удобный , но здание необходимый всесто...",Положительный
30608,долго ждать машина возвращаться от клиентура ....,Отрицательный
30609,все сделать весьма хорошо,Положительный


In [62]:
df_done.to_csv(output_folder / f"train_{seed}.csv", index=False, encoding="utf-8")