In [1]:
import codecs
import networkx as nx
import pandas as pd
import pickle
import re
from gensim.models import KeyedVectors
from os import listdir
from os.path import isfile, join
from utils import printProgressBar

# Column numbers: indices into the tab-separated fields of one line of a
# family file (used further down as `elements[<name>]`).
# graph_1, graph_2, complexite, orientation and fichier_origine are not
# referenced by the cells shown in this notebook.
graph_1 = 3
graph_2 = 6
cat_1 = 8  # field read into `category1`
cat_2 = 10  # field read into `category2`
cstr_1 = 14  # field read into `construction1`
cstr_2 = 17  # field read into `construction2`
complexite = 19
orientation = 21
fichier_origine = 43



# Extract all valid construction patterns (cstr), e.g. Xeur, Xette, reX, ...

In [9]:
# Walk through every family file and collect, for each line where exactly one
# of the two constructions is the bare base 'X', the key
# '<other construction>-<category 1>-<category 2>' into valid_cstr.
input_dir = 'families'
input_files = [name for name in listdir(input_dir) if isfile(join(input_dir, name))]
family_name_dict = dict()
valid_cstr = {'X'}
drop_indices = str.maketrans('', '', '12')  # removes the digits 1 and 2 from a construction
counter = 0
for input_file in input_files:
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        # The file name is split on whitespace: first token -> family id,
        # second token (up to its first dot) -> family name.
        name_parts = input_file.split()
        family_name_dict[name_parts[0]] = name_parts[1].split('.')[0]  # for setting hyperlink in excel
        for line_num, line in enumerate(f):
            if line_num < 2:
                continue  # the first two lines of each file are not data rows
            elements = line.replace(' ', '').split('\t')
            construction1 = elements[cstr_1].translate(drop_indices)
            construction2 = elements[cstr_2].translate(drop_indices)
            if (construction1, construction2) == ('X', 'X'):
                continue
            # Every category starting with 'N' is collapsed to 'N', except 'Num'.
            categories = []
            for column in (cat_1, cat_2):
                raw_category = elements[column]
                if raw_category[0] == 'N' and raw_category != 'Num':
                    raw_category = 'N'
                categories.append(raw_category)
            category1, category2 = categories
            if construction1 == 'X':
                derived = construction2
            elif construction2 == 'X':
                derived = construction1
            else:
                continue  # neither side is the bare base: nothing to record
            valid_cstr.add('-'.join((derived, category1, category2)))
    counter += 1
    printProgressBar(counter, len(input_files), prefix='Progress:', suffix='complete', length=50, decimals=2)

Progress: |██████████████████████████████████████████████████| 100.00% complete


In [10]:
# Display the collected patterns in sorted order.
# (A manual addition of 'rX' and 'réX' was left disabled here.)
print(sorted(valid_cstr))

['X', 'Xable-Adj-N', 'Xable-Adj-V', 'Xade-N-V', 'Xade-V-N', 'Xage-N-N', 'Xage-N-V', 'Xage-V-N', 'Xagier-N-Adj', 'Xagier-N-N', 'Xaie-N-N', 'Xaille-N-V', 'Xaille-V-N', 'Xailler-N-Adj', 'Xailler-N-N', 'Xaillette-N-N', 'Xainat-N-N', 'Xaire-Adj-Adj', 'Xaire-Adj-N', 'Xaire-Adj-Num', 'Xaire-Adj-V', 'Xaire-Adv-Adj', 'Xaire-Adv-N', 'Xaire-N-Adj', 'Xaire-N-N', 'Xaire-N-V', 'Xaire-Num-Adj', 'Xaire-Num-N', 'Xaire-V-Adj', 'Xaire-V-N', 'Xairie-N-Adj', 'Xairie-N-N', 'Xaison-N-V', 'Xaison-V-N', 'Xal-Adj-N', 'Xalat-N-N', 'Xalette-N-N', 'Xaliser-V-N', 'Xanat-N-N', 'Xance-N-V', 'Xance-V-N', 'Xande-N-V', 'Xande-V-N', 'Xange-N-V', 'Xange-V-N', 'Xant-N-N', 'Xariat-N-N', 'Xariat-V-N', 'Xariser-V-Adj', 'Xassier-N-Adj', 'Xassier-N-N', 'Xassier-V-Adj', 'Xassier-V-N', 'Xat-Adj-N', 'Xat-N-N', 'Xataire-N-Adj', 'Xataire-N-N', 'Xataire-V-Adj', 'Xataire-V-N', 'Xate-Adj-N', 'Xaticat-N-N', 'Xatique-Adj-V', 'Xaume-N-N', 'Xel-Adj-N', 'Xelat-N-N', 'Xelet-Adj-Adj', 'Xelet-Adj-N', 'Xelet-N-Adj', 'Xelet-N-N', 'Xelet-V-N', 'X

# Extract all lexemes in each family

In [11]:
# Map each family id (first column of the summary file) to the set of its
# lexemes. The third column holds a printed Python set such as
# "{'a_N', 'b_V'}"; braces, quotes and the newline are stripped before splitting.
family_dict = {}
set_repr_chars = str.maketrans('', '', "{}'\n")
with codecs.open('summary_of_families.txt', 'r', encoding='utf-8') as f:
    for line_num, line in enumerate(f):
        if line_num > 0:  # line 0 is skipped
            cols = line.split('\t')
            family_dict[cols[0]] = set(cols[2].translate(set_repr_chars).split(', '))
print(len(family_dict))

13178


# String comparison

In [12]:
# section to obtain families that contain two given families
# Builds the directed graph L from the .dot file:
#   * a concept line (contains 'shape') maps the dot node id (first token) to
#     the attribute number found in its label, and adds that number as a node;
#   * an edge line 'a -> b' is stored reversed, i.e. as an edge from b's
#     attribute number to a's attribute number.
L = nx.DiGraph()
vertex_attribute_dict = dict()
# Raw string: '\|' is a regex escape for a literal '|'. In a normal string
# literal it is an invalid escape sequence (SyntaxWarning on Python 3.12+).
attribute_pattern = re.compile(r'Attribute (.*)\|')
with codecs.open(join('posets', 'families_simplified_maxgraph.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:  # a line showing edges between concepts
            elements = line.split()
            L.add_edge(vertex_attribute_dict[elements[2]], vertex_attribute_dict[elements[0]])
        elif 'shape' in line:  # a line describing a concept
            # Literal '\n' sequences inside the label are removed before matching.
            match = attribute_pattern.search(line.replace('\\n', ''))
            if match is None:
                # Fail with the offending line instead of an AttributeError on None.
                raise ValueError('no "Attribute <number>|" label in concept line: ' + line.strip())
            attribute_number = match.group(1)
            vertex_attribute_dict[line.split()[0]] = attribute_number
            L.add_node(attribute_number)

def common_superfamily(family_number_1, family_number_2):
    """Describe how two families relate through the module-level graph ``L``.

    Each family number (e.g. ``'F03359-01'``) is reduced to its group: the part
    before the first ``'-'``, without the ``'F'`` and without leading zeros
    (``'3359'``). The nodes reachable from a group in ``L`` are treated as its
    superfamilies.

    Returns one of:
      * ``'same family group'`` when both numbers share the same group;
      * ``'one family is a superfamily of the other'`` when either group is
        reachable from the other;
      * ``'no common superfamily'`` when the two groups reach no common node;
      * otherwise the common superfamilies as a space-separated, sorted list of
        ids of the form ``'F'`` + number left-padded with zeros to 5 characters.

    Raises whatever ``nx.descendants`` raises when a group is not a node of ``L``.
    """
    def _group(family_number):
        return str(int(family_number.split('-')[0].replace('F', '')))

    group_1 = _group(family_number_1)
    group_2 = _group(family_number_2)
    if group_1 == group_2:
        return 'same family group'
    children1 = nx.descendants(L, group_1)
    children2 = nx.descendants(L, group_2)
    # Containment has to be tested before the intersection: when the containing
    # group has no descendants of its own the intersection is empty, and the
    # pair would wrongly be reported as having no common superfamily.
    if group_1 in children2 or group_2 in children1:
        return 'one family is a superfamily of the other'
    common_children = children1 & children2
    if not common_children:
        return 'no common superfamily'
    # Sorted so the result does not depend on set iteration order.
    return ' '.join(sorted('F' + child.rjust(5, '0') for child in common_children))

In [9]:
# Spot check of common_superfamily on one pair of family numbers.
common_superfamily('F03359-01', 'F03902-14')

'one family is a superfamily of the other'

In [15]:
# with valid cstr
# For every unordered pair of families, look for one pair of lexemes (one from
# each family) that links them:
#   * identical written forms, or
#   * one form contained in the other, where the longer form with the shorter
#     one replaced by 'X', together with both coarse categories, is a key of
#     valid_cstr.
# The first link found for a pair of families becomes one row of the sheet.
valid_columns = ['family_id_1', 'family_id_2', 'graph_1', 'cat_1', 'graph_2', 'cat_2',
                 'cstr_1_presumed', 'cstr_2_presumed', 'superfamilies']


def _coarse_cat(cat):
    """Collapse every category starting with 'N' (except 'Num') to 'N'."""
    return 'N' if cat.startswith('N') and cat != 'Num' else cat


def _family_hyperlink(family_id):
    """Excel HYPERLINK formula pointing at the .dot file of ``family_id``."""
    return ('=HYPERLINK("graph_visualization\\' + family_id + ' ' + family_name_dict[family_id]
            + '.dot", "' + family_id + '")')


def _find_valid_link(lexemes_1, lexemes_2):
    """Return the first link between two lexeme sets, or None.

    A link is ``(swapped, cstr, base_graph, derived_graph, base_cat, derived_cat)``;
    ``swapped`` is True when the base form comes from ``lexemes_2``.
    Identical forms are reported with their original categories, pattern
    matches with the coarse categories used for the valid_cstr lookup.
    """
    for s1 in lexemes_1:
        g1, c1 = s1.split('_')
        # Kept in a separate name: overwriting c1 here would leak the coarse
        # category into later iterations and make the output order-dependent.
        n1 = _coarse_cat(c1)
        for s2 in lexemes_2:
            g2, c2 = s2.split('_')
            if g1 == g2:
                return False, 'X', g1, g2, c1, c2
            n2 = _coarse_cat(c2)
            if g1 in g2:
                cstr = g2.replace(g1, 'X')
                if cstr + '-' + n1 + '-' + n2 in valid_cstr:
                    return False, cstr, g1, g2, n1, n2
            elif g2 in g1:
                cstr = g1.replace(g2, 'X')
                if cstr + '-' + n2 + '-' + n1 in valid_cstr:
                    return True, cstr, g2, g1, n2, n1
    return None


family_dict_keys = list(family_dict.keys())
# Rows are collected as dicts and turned into a DataFrame once at the end:
# DataFrame.append copies the whole frame on every call and no longer exists
# in pandas 2.0.
valid_rows = []
for k1 in range(len(family_dict_keys)):
    key1 = family_dict_keys[k1]
    for k2 in range(k1 + 1, len(family_dict_keys)):
        key2 = family_dict_keys[k2]
        link = _find_valid_link(family_dict[key1], family_dict[key2])
        if link is None:
            continue
        swapped, cstr, graph1, graph2, cat1, cat2 = link
        family_id_1, family_id_2 = (key2, key1) if swapped else (key1, key2)
        valid_rows.append({
            'family_id_1': _family_hyperlink(family_id_1),
            'family_id_2': _family_hyperlink(family_id_2),
            'graph_1': graph1,
            'graph_2': graph2,
            'cat_1': cat1,
            'cat_2': cat2,
            'cstr_1_presumed': 'X',
            'cstr_2_presumed': cstr,
            'superfamilies': common_superfamily(key1, key2)
        })
    printProgressBar(k1 + 1, len(family_dict), prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)
df_valid = pd.DataFrame(valid_rows, columns=valid_columns)
df_valid.to_excel('unconnected,_valid_cstr,_valid_cat.xlsx', index=False)

Progress: |██████████████████████████████████████████████████| 100.00% complete




In [None]:
# valid + invalid cstr
family_dict_keys = list(family_dict.keys())
min_ratio = 0.5
df_valid_invalid = pd.DataFrame(columns=['family_id_1', 'family_id_2', 'graph_1', 'cat_1', 'cstr_1',\
                                         'graph_2', 'cat_2', 'cstr_2', 'ratio'])
for k1 in range(0, len(family_dict)):
    for k2 in range(k1+1, len(family_dict)):
        set1 = family_dict[family_dict_keys[k1]]
        set2 = family_dict[family_dict_keys[k2]]
        connected = False
        for s1 in set1:
            [g1, c1] = s1.split('_')
            for s2 in set2:
                [g2, c2] = s2.split('_')
                if g1 in g2 and len(g1) / len(g2) > min_ratio:
                    family_id_1 = family_dict_keys[k1]
                    family_id_2 = family_dict_keys[k2]
                    cstr = g2.replace(g1, 'X')
                    graph1 = g1
                    graph2 = g2
                    cat1 = c1
                    cat2 = c2
                    ratio = len(g1) / len(g2)
                    connected = True
                    break
                elif g2 in g1 and len(g2) / len(g1) > min_ratio:
                    family_id_1 = family_dict_keys[k2]
                    family_id_2 = family_dict_keys[k1]
                    cstr = g1.replace(g2, 'X')
                    graph1 = g2
                    graph2 = g1
                    cat1 = c2
                    cat2 = c1
                    ratio = len(g2) / len(g1)
                    connected = True
                    break
            if connected:
                break
        if connected:
            df_valid_invalid = df_valid_invalid.append(pd.Series({
                'family_id_1': family_id_1,
                'family_id_2': family_id_2,
                'graph_1': graph1,
                'graph_2': graph2,
                'cat_1': cat1,
                'cat_2': cat2,
                'cstr_1': 'X',
                'cstr_2': cstr,
                'ratio': ratio
            }), ignore_index=True)
    printProgressBar(k1 + 1, len(family_dict), prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)
df_valid_invalid.to_excel('unconnected.xls', index=False)

In [6]:
# invalid cstr
# Anti-join: keep the rows of the ratio-based sheet whose family pair does not
# occur in the valid-construction sheet, and export them.
# NOTE(review): the valid sheet is read from 'unconnected,_valid_cstr.xls',
# whereas the valid-cstr cell of this notebook writes
# 'unconnected,_valid_cstr,_valid_cat.xlsx' with HYPERLINK formulas as ids -
# confirm which file is meant to be used here.
df_valid_invalid = pd.read_excel('unconnected.xls')
df_valid = pd.read_excel('unconnected,_valid_cstr.xls')
# Order-independent key for a pair of families: the two ids sorted and joined by '+'.
for frame in (df_valid_invalid, df_valid):
    frame['family_id_join'] = frame.apply(
        lambda row: '+'.join(sorted([row['family_id_1'], row['family_id_2']])),
        axis=1,
    )
df_valid = df_valid[['family_id_join']]
merged = df_valid.merge(df_valid_invalid, how='outer', on=['family_id_join'], indicator=True)
# 'right_only' marks pairs present in the ratio-based sheet only.
df_invalid = merged[merged['_merge'] == 'right_only'].drop(['family_id_join', '_merge'], axis=1)
df_invalid.to_excel('unconnected,_invalid_cstr.xls', index=False)



# word2vec

In [None]:
# Load word vectors stored in the binary word2vec format; undecodable bytes in
# the vocabulary are ignored (unicode_errors="ignore").
model = KeyedVectors.load_word2vec_format('frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin', binary=True, unicode_errors="ignore")

In [None]:
# Nearest neighbours of one word according to the loaded vectors.
model.most_similar('approchement')

In [None]:
# Similarity score between a word and its 'dé'-prefixed form.
model.similarity('boucher', 'déboucher')