# An algorithm to extract candidate monosemous relatives and then filter them with a word2vec model

In [1]:
import pandas as pd
import os
from collections import Counter
from lxml import etree
from tqdm import tqdm_notebook, tqdm
import gensim
from gensim.models import Word2Vec
import re
import xml.etree.cElementTree as ET
from pytorch_pretrained_bert import BertTokenizer, BertModel, BasicTokenizer
from xml.dom import minidom
import sklearn
from sklearn import metrics
import operator
import pymorphy2
import json
import copy
import nltk
import pickle
morph = pymorphy2.MorphAnalyzer()

In [None]:
"""
Loading pre-trained word2vec model
"""
# The path is relative to the current working directory and is written with
# Windows separators. The loaded model is bound to the module-level name
# `model_2`, which relatives_extraction queries through `model_2.wv`.
model_2 =  Word2Vec.load(r'Taiga_1billion\Taiga_1billion\word2vec_win3_proza_ru.model')

In [None]:
"""
Dictionary with monosemous nouns
"""
# Only key membership of `mono_dict_all` is used by the code in this file
# (relats_def and relatives_extraction test `word in mono_dict_all.keys()`).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a pickle that comes from a trusted source.
with open(r'monosemous_words.pkl', 'rb') as f:
    mono_dict_all = pickle.load(f)

In [None]:
# Target polysemous words mapped to the synset ids of the senses to process.
# Every (word, synset id) pair is passed to relatives_extraction by the driver
# loop further down in this file.
# NOTE(review): synset 'N25033' is listed under both 'доля' and 'жребий'.
russe_synsets = {
    
    'замок':['N29241', 'N24173'],
    'лук':['N41975', 'N12915'],
    'бор':['N31212', 'N13050'],
    'дар':['N17912', 'N31026'],
    'двигатель':['N27057', 'N16946'],
    'дедушка':['N20410', 'N35355'],
    'декрет':['N13903', 'N34626'],
    'дерево':['N40123', 'N33095'],
    'диалог':['N34886', 'N38215'],
    'диплом':['N35864', 'N40323', 'N20982'],
    'доктор':['N28236', 'N30160'],
    'доля':['N25033', 'N25297'],
    'достижение':['N38801', 'N34449'],
    'жестокость':['N33820', 'N41393'],
    'жребий':['N21712', 'N25033'],
    'затея':['N18425', 'N20053'],
    'застой':['N25942', 'N37078'],
    'затишье':['N39421', 'N12860'],
    'затмение':['N19420', 'N36219'],
    'капот': ['N13120', 'N15899'],
    'таз': ['N14586', 'N30033'],
    'слог': ['N15152', 'N21947'],
    'байка': ['N16141', 'N39858'],
    'гусеница': ['N19345', 'N21860'],
    'стопка': ['N25286', 'N26126'],
    'гвоздика': ['N26662', 'N31219'],
    'крона': ['N28683', 'N30465', 'N33001', 'N37840'],# Norwegian, Swedish and Danish krone; crown of a tree
    'акция': ['N29853', 'N41588'],
    'такса': ['N35039', 'N36673'],
    'рок': ['N36575', 'N40621']
}

In [None]:
# Synset id -> human-readable sense label (WORD_SENSE).
# The last cell of this file uses it to turn a CSV file name into a sense label.
mapping_to_sense_definitions = {
    
'N29241':'ЗАМОК_СТРОЕНИЕ',
'N24173':'ЗАМОК_ЗАПОР',
'N12915':'ЛУК_ОРУЖИЕ',
'N41975':'ЛУК_ОВОЩ',
'N31212': 'БОР_ЭЛЕМЕНТ',
'N13050': 'БОР_ЛЕС',
'N17912':'ДАР_ВРОЖДЕННЫЙ',
'N31026':'ДАР_ПОДАРОК',
'N27057': 'ДВИГАТЕЛЬ_АГРЕГАТ',
'N16946': 'ДВИГАТЕЛЬ_ДВИЖУЩ_СИЛА',
'N20410': 'ДЕДУШКА_СТАРИК',
'N35355': 'ДЕДУШКА_РОДСТВЕНН',
'N13903': 'ДЕКРЕТ_ОТПУСК',
'N34626': 'ДЕКРЕТ_ДОКУМЕНТ',
'N40123': 'ДЕРЕВО_ДЕРЕВЦЕ',
'N33095': 'ДЕРЕВО_ДРЕВЕСИНА',
'N34886': 'ДИАЛОГ_БЕСЕДА',
'N38215': 'ДИАЛОГ_МЖД_СТОРОНАМИ',
'N35864': 'ДИПЛОМ_ВУЗА',
'N40323': 'ДИПЛОМ_ПАМЯТНЫЙ',
'N20982': 'ДИПЛОМ_РАБОТА',
'N28236': 'ДОКТОР_НАУК',
'N30160': 'ДОКТОР_ВРАЧ',
# NOTE(review): the key 'N25033' is repeated a few lines below. In a dict
# literal the later entry wins, so this value is discarded and
# mapping_to_sense_definitions['N25033'] is 'ЖРЕБИЙ_СУДЬБА'. The last cell of
# this file hard-codes 'ДОЛЯ_УЧАСТЬ' for the 'доля' file to work around that.
'N25033': 'ДОЛЯ_УЧАСТЬ',
'N25297': 'ДОЛЯ_ЧАСТЬ',
'N38801': 'ДОСТИЖЕНИЕ_ЦЕЛИ',
'N34449': 'ДОСТИЖЕНИЕ_УРОВНЯ',
'N33820': 'ЖЕСТОКОСТЬ_БЕСПОЩАДНОСТЬ',
'N41393': 'ЖЕСТОКОСТЬ_ОБРАЩЕНИЕ',
'N21712': 'ЖРЕБИЙ_РЕШЕНИЕ',
# Duplicate of the 'N25033' key above; this is the value that is kept.
'N25033': 'ЖРЕБИЙ_СУДЬБА',
'N18425': 'ЗАТЕЯ_ЗАБАВА',
'N20053': 'ЗАТЕЯ_НАЧИНАНИЕ',
'N25942': 'ЗАСТОЙ_ЗАСТОЙН_ЯВЛЕНИЕ',
'N37078': 'ЗАСТОЙ_СТАГНАЦИЯ_РАЗВИТ',
'N39421': 'ЗАТИШЬЕ_СНИЖ_АКТИВНОСТИ',
'N12860': 'ЗАТИШЬЕ_БЕЗВЕТР_ТИШЬ',
'N19420': 'ЗАТМЕНИЕ_ОДУРЕНИЕ',
'N36219': 'ЗАТМЕНИЕ_СВЕТИЛА',
    'N13120':'КАПОТ_ОДЕЖДА',
    'N15899': 'КАПОТ_МАШИНЫ',
    'N14586':'ТАЗ_КОСТЬ',
    'N30033': 'ТАЗ_ПОСУДА',
    'N15152':'СЛОГ_ЗВУК', 
    'N21947': 'СЛОГ_СТИЛЬ',
    'N16141': 'БАЙКА_ЛОЖЬ',
    'N39858': 'БАЙКА_ТКАНЬ',
    'N19345': 'ГУСЕНИЦА_МЕХАНИЗМ',
    'N21860': 'ГУСЕНИЦА_ЛИЧИНКА',
    'N25286': 'СТОПКА_КУЧА',
    'N26126': 'СТОПКА_ПОСУДА',
    'N26662': 'ГВОЗДИКА_ПРИПРАВА',
    'N31219': 'ГВОЗДИКА_РАСТЕНИЕ',
    # Three different synsets share the single label 'КРОНА_ДЕНЬГИ'.
    'N28683': 'КРОНА_ДЕНЬГИ',
    'N30465': 'КРОНА_ДЕНЬГИ',
    'N33001': 'КРОНА_ДЕНЬГИ',
    'N37840': 'КРОНА_ДЕРЕВА',
    'N29853': 'АКЦИЯ_КОМПАНИИ',
    'N41588': 'АКЦИЯ_ДЕЙСТВИЕ',
    'N35039': 'ТАКСА_СОБАКА',
    'N36673': 'ТАКСА_ОПЛАТА',
    'N36575': 'РОК_МУЗЫКА',
    'N40621': 'РОК_СУДЬБА',
    
}

In [None]:
# Build `relatives_dict_new`, the dictionary of direct synset relations:
#     parent synset id -> relation name -> list of child synset ids
# Each child element of the XML root is read through its `parent_id`, `name`
# and `child_id` attributes.
relations_file_names = [r'Synsets_xml\synset_relations.N.xml', r'Synsets_xml\synset_relations.A.xml', 
                       r'Synsets_xml\synset_relations.V.xml']

relatives_dict_new = {}

for file in relations_file_names:
    doc_N = etree.parse(file)
    root = doc_N.getroot()
    for child in tqdm_notebook(root):
        attrs = child.attrib
        # setdefault merges into an existing entry instead of recreating it, so
        # relations of a parent are kept even when its records are not
        # contiguous in a file or are spread over several files.
        relations_of_parent = relatives_dict_new.setdefault(attrs['parent_id'], {})
        relations_of_parent.setdefault(attrs['name'], []).append(attrs['child_id'])

In [None]:
def dict_up_list(d1, d2, n=1):
    """
    Expand a relation dictionary into a nested dictionary of relatives.

    Args:

    d1 (dict): synset relations, synset id -> relation name -> list of synset ids

    d2 (dict): relations to expand, synset id -> relation name -> list of synset ids

    n (int): number of expansion steps to perform

    Returns:

    nested dictionary in which every list of synset ids has been replaced,
    n times in a row, by a dictionary synset id -> d1[synset id]

    Raises:

    KeyError if a referenced synset id is missing from d1
    """
    # One expansion step: every id list becomes {id: relations of that id}.
    # The innermost values are the very objects stored in d1, not copies.
    expanded = {}
    for synset_id, relations in d2.items():
        per_relation = {}
        for relation_name, child_ids in relations.items():
            per_relation[relation_name] = {child_id: d1[child_id] for child_id in child_ids}
        expanded[synset_id] = per_relation

    if n == 1:
        return expanded

    # The {id: relations} dictionaries built above have the same shape as d2,
    # so the remaining n-1 steps are applied to each of them recursively.
    deeper = {}
    for synset_id, per_relation in expanded.items():
        deeper[synset_id] = {
            relation_name: dict_up_list(d1, children, n - 1)
            for relation_name, children in per_relation.items()
        }
    return deeper
    
def relats_def(rel_lst, mono_dict_all = mono_dict_all, synset_out = False, target_word=''):
    """
    Look up the words of the given synsets in the global `synset_words`.

    Args:

    rel_lst (list): synset ids of the relatives

    mono_dict_all (dict): dictionary whose keys are the monosemous words

    synset_out (bool): if not False, return the lemmatized words of every
    synset (without splitting them into monosemous and polysemous)

    target_word (str): target polysemous word; only used when synset_out is
    set, where it is left out of the lemmatized lists

    Returns:
    when synset_out is False: tuple (monosemous words, other words);
    otherwise: dict synset id -> list of lemmatized words
    """
    if synset_out != False:
        # Per-synset output: lemmatize each synset separately, skipping the target word.
        return {
            synset_id: normalize_words(copy.deepcopy(synset_words[synset_id]),
                                       excl_root=True, target=target_word)
            for synset_id in rel_lst
        }

    # Flat output: pool the words of all synsets ...
    pooled_words = []
    for synset_id in rel_lst:
        pooled_words += copy.deepcopy(synset_words[synset_id])

    # ... and split them by membership in the monosemous dictionary.
    monosemous = []
    polysemous = []
    for candidate in pooled_words:
        bucket = monosemous if candidate in mono_dict_all.keys() else polysemous
        bucket.append(candidate)

    return monosemous, polysemous

def normalize_words(syn_1, excl_root = True, target=''):
    """
    Lemmatize a list of words or multi-word expressions.

    Args:

    syn_1 (list): words to lemmatize

    excl_root (bool): when True, entries equal to `target` are left out

    target (str): the word to leave out when excl_root is True

    Returns:
    list with one string per kept entry: the entry is split on whitespace,
    every token is replaced by the normal form of its first pymorphy2 parse,
    and the tokens are joined with '#'
    """
    lemmatized = []
    for phrase in tqdm_notebook(syn_1):
        # The target word itself is skipped only when exclusion is requested.
        if excl_root == True and phrase == target:
            continue
        lemmas = []
        for token in phrase.split():
            lemmas.append(morph.parse(token)[0].normal_form.strip(' '))
        lemmatized.append('#'.join(lemmas))
    return lemmatized

In [None]:
def _collect_relative_synsets(relatives_3_step, root_concept, available_relations):
    """
    Walk the nested relation dictionary and collect relative synsets by distance.

    Args:
    relatives_3_step (dict): output of dict_up_list(..., n=3) for the root synset

    root_concept (str): id of the root synset

    available_relations (list): relation names that may be followed

    Returns:
    four disjoint sets of synset ids reached in one, two, three and four
    relation steps (named close/one_step/two_step/three_step as elsewhere in
    this file)
    """
    close_relatives = set()
    one_step_relatives = set()
    two_step_relatives = set()
    three_step_relatives = set()

    for relations_of_root in relatives_3_step.values():
        for relation, level_1 in relations_of_root.items():
            if relation not in available_relations:
                continue
            for synset_1, relations_1 in level_1.items():
                # directly connected relatives (e.g. hyponyms, hypernyms)
                close_relatives.add(synset_1)
                for relation_1, level_2 in relations_1.items():
                    if relation_1 not in available_relations:
                        continue
                    for synset_2, relations_2 in level_2.items():
                        # skip the root itself and synsets already seen as close
                        if synset_2 == root_concept or synset_2 in close_relatives:
                            continue
                        one_step_relatives.add(synset_2)
                        # from here on only the relation type of the second
                        # step is followed
                        for synset_3, relations_3 in relations_2.get(relation_1, {}).items():
                            if synset_3 == synset_2 or synset_3 in one_step_relatives:
                                continue
                            two_step_relatives.add(synset_3)
                            for synset_4 in relations_3.get(relation_1, ()):
                                if (synset_4 not in two_step_relatives
                                        and synset_4 not in one_step_relatives):
                                    three_step_relatives.add(synset_4)

    # a synset is kept only in the set of its shortest distance
    one_step_relatives -= close_relatives
    two_step_relatives -= one_step_relatives | close_relatives
    three_step_relatives -= two_step_relatives | one_step_relatives | close_relatives

    return close_relatives, one_step_relatives, two_step_relatives, three_step_relatives


def _candidate_weight(candidate, neighbours, nest_words, nest_synsets):
    """
    Weight of a candidate: for every nest synset take the highest similarity
    among the candidate's neighbours that belong to it, then sum over synsets.

    Args:
    candidate (str): the candidate word; it never counts as its own neighbour

    neighbours (list): (word, similarity) pairs

    nest_words (set): lemmatized words of the synset nest

    nest_synsets (dict): synset id -> list of lemmatized words

    Returns:
    the summed similarity (0 when no neighbour belongs to the nest)
    """
    best_per_synset = {}
    for neighbour, similarity in neighbours:
        if neighbour == candidate or neighbour not in nest_words:
            continue
        for synset_id, lemmas in nest_synsets.items():
            if neighbour in lemmas and similarity > best_per_synset.get(synset_id, float('-inf')):
                best_per_synset[synset_id] = similarity
    return sum(best_per_synset.values())


def _rows_to_drop(weights, threshold=1.5, fallback_threshold=0.9, min_rows=3):
    """
    Choose the row positions to remove from the candidate table.

    Rows with weight 0 are always removed. Rows with weight <= threshold are
    removed too, unless that would leave at most min_rows rows; in that case
    the lower fallback_threshold is tried, and if it is still too strict only
    the zero-weight rows are removed.

    Returns:
    sorted list of row positions
    """
    n_rows = len(weights)
    low = [i for i, weight in enumerate(weights) if weight <= threshold]
    very_low = [i for i, weight in enumerate(weights) if weight <= fallback_threshold]
    zero = [i for i, weight in enumerate(weights) if weight == 0]

    if n_rows <= min_rows:
        filtered = []
    elif len(low) >= n_rows - min_rows:
        filtered = very_low if len(very_low) <= n_rows - min_rows else []
    else:
        filtered = low

    return sorted(set(filtered) | set(zero))


def relatives_extraction(multinom_word, synset_num, path_to_save_file, nest = False):
    """
    An algorithm of monosemous relative extraction

    Candidate monosemous relatives of a sense are collected from the synset
    itself and from synsets within a 4-step relation path, each candidate is
    weighted by how many synsets of the sense's "nest" (the synset plus its
    one- and two-step relatives) appear among its 100 nearest word2vec
    neighbours, and low-weight candidates are filtered out.

    Args:
    multinom_word (str): the target polysemous word

    synset_num (str): the synset the target word belongs to

    path_to_save_file (str): prefix of the output file; the file name is
    path_to_save_file + word + '_' + synset + '.csv'

    nest (bool): whether to use only close candidate relatives instead of the
    ones within 4-step relation path

    Returns:
    None; a ';'-separated CSV with the columns 'word', '100_similar_words'
    and 'weight_sum' is written

    Side effects:
    sets the module-level name `target_word` to multinom_word
    """
    # NOTE(review): the global is kept because code outside this function may read it.
    global target_word
    target_word = multinom_word

    file_name_csv = path_to_save_file+target_word+'_'+synset_num+'.csv'
    root_concept = synset_num

    # collecting relatives within 4-step path
    ex_relats = {root_concept: copy.deepcopy(relatives_dict_new[root_concept])}
    relatives_3_step = dict_up_list(relatives_dict_new, ex_relats, n=3)

    # list of the relations under consideration
    available_relations = ['hypernym', 'hyponym', 'instance hyponym', 'instance hypernym']

    close_relatives, one_step_relatives, two_step_relatives, three_step_relatives = \
        _collect_relative_synsets(relatives_3_step, root_concept, available_relations)

    # words of the target synset itself, without the target word
    # (the original removed the not-yet-assigned local `word`, which raised
    # UnboundLocalError; a missing target no longer raises ValueError either)
    root_syn = copy.deepcopy(synset_words[root_concept])
    if target_word in root_syn:
        root_syn.remove(target_word)

    # obtaining monosemous and polysemous words from the synsets of different distances
    close_rel_def_mono, close_rel_def_mult = relats_def(list(close_relatives))
    one_step_def_mono, one_step_def_mult = relats_def(list(one_step_relatives))
    two_step_def_mono, _ = relats_def(list(two_step_relatives))
    three_step_def_mono, _ = relats_def(list(three_step_relatives))

    # leaving only monosemous words from a target synset
    root_syn_mono = [w for w in root_syn if w in mono_dict_all.keys()]

    # choosing the scope of candidate monosemous relatives
    if nest:
        # only close relatives and the synset itself
        all_relatives_list = list(set(close_rel_def_mono))+root_syn_mono
    else:
        # candidate monosemous relatives within 4-step relation path
        all_relatives_list = list(set(close_rel_def_mono))+list(set(one_step_def_mono))+\
        list(set(two_step_def_mono))+list(set(three_step_def_mono))+root_syn_mono

    # normalizing obtained list
    all_relatives_list_norm = normalize_words(all_relatives_list)

    # lemmatized words of the nest synsets (one- and two-step relatives and the
    # root synset), keyed by synset id; the target word is excluded
    relats_0_1 = relats_def(list(one_step_relatives)+list(close_relatives)+[root_concept],
                            synset_out=True, target_word=target_word)

    # defining a synset nest; a set gives constant-time membership tests
    relatives_first_circle = root_syn+list(set(close_rel_def_mono))+\
    list(set(close_rel_def_mult))+list(set(one_step_def_mult)) +list(set(one_step_def_mono))
    nest_words = set(normalize_words(relatives_first_circle))

    words = []
    res_100 = []
    weights = []

    # for every candidate in candidate list
    for lemma in tqdm_notebook(all_relatives_list_norm):
        parts = lemma.split()
        if not parts:
            # nothing to look up for an empty lemma
            continue
        candidate = parts[0]

        # `in model_2.wv` works both before and after gensim 4, where the
        # `vocab` attribute was removed
        if candidate in model_2.wv:
            neighbours = model_2.wv.most_similar(positive=candidate, topn=100)
            res = neighbours
        else:
            # out-of-vocabulary candidates get no neighbours, hence weight 0
            neighbours = []
            res = 'word is not in w2v vocab'

        words.append(candidate)
        res_100.append(res)
        weights.append(_candidate_weight(candidate, neighbours, nest_words, relats_0_1))

    similar_100_action = pd.DataFrame(columns=['word', '100_similar_words'])
    similar_100_action['word'] = pd.Series(words)
    similar_100_action['100_similar_words'] = pd.Series(res_100)
    # the whole column is assigned at once instead of chained
    # `df[col].iloc[i] = value`, which may write to a temporary copy
    similar_100_action['weight_sum'] = pd.Series(weights)

    # filtering words according to weight and the number of relatives found
    # NOTE(review): the original evaluated the threshold fallback inside the
    # per-row loop and its `elif weight <= 0.9` branch was unreachable; the
    # decision is now taken once, after all weights are known.
    similar_100_action = similar_100_action.drop(_rows_to_drop(weights))
    similar_100_action = similar_100_action.reset_index(drop=True)

    similar_100_action.to_csv(file_name_csv, sep=';', encoding='utf-8-sig')

    return None
          

In [None]:
# Run the extraction for every (word, synset) pair of `russe_synsets`.
# `path_to_csv_folder` is a file-name PREFIX rather than a directory:
# relatives_extraction appends '<word>_<synset>.csv' to it, so the files are
# named 'relatives_<word>_<synset>.csv'.
path_to_csv_folder = r'C:\Users\Filtering_relatives_TOLOKA_TAIGA\relatives_'

for key, val in russe_synsets.items():
    
    for syns in val:
        
        # default nest=False: candidates from the whole 4-step relation path
        relatives_extraction(key, syns, path_to_csv_folder)

In [None]:
# Mapping from a sense label to its filtered relatives:
#     tuples[sense label] = {relative word: weight_sum}
# read back from the CSV files written by relatives_extraction.

# Directory with the CSV files; an empty string means the current working directory.
csv_dir = r''
tuples = {}

# sorted() makes the processing order, and therefore the result, reproducible
for file in sorted(os.listdir(csv_dir or os.curdir)):

    # anything that is not a CSV file cannot be parsed into a sense label
    if not file.endswith('.csv'):
        continue

    if file in ('relatives_доля_N25033.csv', 'доля_N25033.csv'):
        # synset N25033 is shared by two words and the mapping keeps only the
        # label of the other one, so this label is set by hand
        relat_name = 'ДОЛЯ_УЧАСТЬ'
    else:
        # '<word>_<synset id>.csv' -> synset id -> sense label
        relat_name = file.replace('relatives_', '').replace('.csv', '')
        relat_name = mapping_to_sense_definitions[relat_name.split('_')[1]]

    # os.path.join inserts the separator that plain concatenation would drop;
    # 'utf-8-sig' matches the encoding used when the files are written
    df = pd.read_csv(os.path.join(csv_dir, file), sep=';', encoding='utf-8-sig')

    if df.empty:
        # report senses for which no relative survived the filtering
        print(relat_name)
        continue

    # NOTE(review): several synsets may share one label (e.g. 'КРОНА_ДЕНЬГИ');
    # as in the original code, a later file replaces the entry of an earlier one.
    tuples[relat_name] = dict(zip(df['word'].values, df['weight_sum'].values))
    
    