In [1]:
import codecs
import os
import pickle
from os import listdir
from os.path import isfile, join

import networkx as nx

from utils import printProgressBar

In [2]:
# Directory holding the tab-separated input tables.
input_dir = 'pairs'

# Input file name -> one-letter tag identifying the file of origin
# (the tag is appended to every row copied into a family file).
input_files = {
    'converts-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'A',
    'demonette1-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'B',
    'denomXaire-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'C',
    'denomXal-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'D',
    'denomXique-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'E',
    'dimocXaie-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'F',
    'dimocXat-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'G',
    'dimocXet-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'H',
    'dimocXier-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'I',
    'derifde1X-formate-V4-nosem-utf8-lid-newtagset-entete-new-rid.csv': 'J',
    'derifXable-formate-V4-nosem-utf8-lid-newtagset-entete-new-rid.csv': 'K',
}

# Zero-based positions of the fields read from each tab-split row.
# graph_* + cat_* together form a node id; cat_* is also the node label.
graph_1, graph_2 = 3, 6
cat_1, cat_2 = 8, 10
# cstr_* and complexite are combined into the edge label.
cstr_1, cstr_2 = 14, 17
complexite = 19
# orientation decides the direction of the edge between the two nodes.
orientation = 21

In [14]:
# Build the directed graph G from every input table.
# Nodes are "<graph>_<cat>" strings labelled with their category; each row
# contributes one directed edge, or two opposite edges when the orientation
# is neither of the explicit "source -> target" / "target -> source" values.
header = ''
G = nx.DiGraph()
first_to_second = ('as2de', 'as2des')
second_to_first = ('de2as', 'des2as')
for input_file in input_files:
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num == 0:
                # Remember the header row (the last file read wins).
                header = line.replace('\n', '')
                continue
            if line_num < 2:
                # The second row of each file is not data and is skipped.
                continue

            elements = line.replace('\n', '').replace(' ', '').split('\t')
            node_1 = elements[graph_1] + '_' + elements[cat_1]
            node_2 = elements[graph_2] + '_' + elements[cat_2]
            G.add_node(node_1, label=elements[cat_1])
            G.add_node(node_2, label=elements[cat_2])

            direction = elements[orientation]
            if direction in first_to_second:
                edge_type = '{}_{}-{}'.format(elements[complexite], elements[cstr_1], elements[cstr_2])
                G.add_edge(node_1, node_2, label=edge_type)
            elif direction in second_to_first:
                edge_type = '{}_{}-{}'.format(elements[complexite], elements[cstr_2], elements[cstr_1])
                G.add_edge(node_2, node_1, label=edge_type)
            else:
                # No privileged direction: use an order-independent label and
                # connect the two nodes both ways.
                sorted_cstr = sorted([elements[cstr_1], elements[cstr_2]])
                edge_type = '{}_{}_{}'.format(elements[complexite], direction, '-'.join(sorted_cstr))
                G.add_edge(node_1, node_2, label=edge_type)
                G.add_edge(node_2, node_1, label=edge_type)

In [15]:
# Split G into its weakly connected components (edge direction is ignored);
# each component is a set of node ids and is treated as one family.
conn_comps = list(nx.weakly_connected_components(G))
number_of_families = len(conn_comps)
# Bare expression so the notebook displays the family count.
number_of_families

13178

In [16]:
# Group families whose labelled graphs are isomorphic.
# H is a "graph of graphs": its nodes are indices into conn_comps, and an edge
# fam1-fam2 means family fam2 is isomorphic to the representative fam1
# (same node labels and same edge labels).
checked = list()        # families already attached to a representative
checked_lookup = set()  # same content as `checked`, for O(1) membership tests
H = nx.Graph() # graph of graphs

# Build each family's subgraph once instead of once per compared pair.
family_graphs = [nx.subgraph(G, members) for members in conn_comps]
family_sizes = [len(members) for members in conn_comps]


def _same_label(attrs_1, attrs_2):
    """Return True when two node/edge attribute dicts carry the same label."""
    return attrs_1['label'] == attrs_2['label']


for fam1 in range(0, number_of_families):
    if fam1 in checked_lookup:
        continue
    H.add_node(fam1)
    G1 = family_graphs[fam1]
    for fam2 in range(fam1 + 1, number_of_families):
        if fam2 in checked_lookup:
            continue
        # Isomorphic graphs necessarily have the same number of nodes, so
        # families of different sizes are skipped without running the matcher.
        if family_sizes[fam1] != family_sizes[fam2]:
            continue
        G2 = family_graphs[fam2]
        if nx.is_isomorphic(G1, G2, node_match=_same_label, edge_match=_same_label):
            checked.append(fam2)
            checked_lookup.add(fam2)
            H.add_edge(fam1, fam2)
    printProgressBar(fam1 + 1, number_of_families, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)

Progress: |█████████████████████████████████████████████████-| 99.99% complete

In [17]:
# Connected components of H, i.e. groups of mutually isomorphic families,
# ordered from the smallest group to the largest.
H_conn_comps = sorted(nx.connected_components(H), key=len)
print(len(H_conn_comps))

3903


In [15]:
# Spot check: display the member node ids of the family stored at index 65.
conn_comps[65]

{'ajour_Nm', 'ajourage_Nm', 'ajourement_Nm', 'ajourer_V'}

In [21]:
# Create one output file per family (containing only the header for now),
# write a summary table of all families, and fill lexeme_dict so that every
# lexeme maps to the file name of its family.
output_folder = 'families'
# Make sure the destination exists so the cell also works on a fresh checkout.
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

lexeme_dict = dict()
with codecs.open('summary_of_families.txt', 'w+', encoding='utf-8') as f_summary:
    f_summary.write('family_id\tnumber_of_lexemes\tlexemes\n')
    for fam_fam_id, fam_fam in enumerate(H_conn_comps):
        # Identifier shared by every family of this isomorphism group.
        group_title = 'F' + str(fam_fam_id).rjust(5, '0')
        # Width needed to zero-pad the largest per-group index.
        index_width = len(str(len(fam_fam) - 1))
        for fam_id, fam in enumerate(fam_fam):
            members = conn_comps[fam]
            # Alphabetically first lexeme, used to make the file name readable.
            first_member = min(members)
            if len(fam_fam) == 1:
                family_title = group_title
            else:
                family_title = group_title + '-' + str(fam_id).rjust(index_width, '0')
            f_summary.write(family_title + '\t'
                            + str(len(members)) + '\t' + str(members) + '\n')

            # The file name is the same for all lexemes of the family, so the
            # header file is created once per family rather than once per lexeme.
            filename = family_title + ' ' + first_member + '.txt'
            with codecs.open(join(output_folder, filename), 'w+', encoding='utf-8') as f_out:
                f_out.write(header + '\tfichier_origine' + '\n\n')
            for lexeme in members:
                lexeme_dict[lexeme] = filename

In [22]:
# Second pass over the input tables: append every data row to the file of the
# family that its first lexeme belongs to, tagged with the file of origin.
valid_complexite = ('simple', 'complexe', 'motiv-form', 'motiv-sem', 'accidentel')
for input_file in input_files:
    origin_tag = input_files[input_file]
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            # The first two rows of each file are not data.
            if line_num < 2:
                continue
            elements = line.replace(' ','').split('\t')
            lexeme1 = elements[graph_1] + '_' + elements[cat_1]
            # Flag rows whose complexity value is not one of the expected ones.
            if elements[complexite] not in valid_complexite:
                print('warning ', input_file)
            output_filename = lexeme_dict[lexeme1]
            # `with` guarantees the handle is released even if the write fails.
            with codecs.open(join(output_folder, output_filename), 'a+', encoding='utf-8') as f_out:
                f_out.write(line.strip('\n') + '\t' + origin_tag + '\n')

In [None]:
# Persist the graph and the lexeme -> family-file mapping for later reuse.
# Context managers close the files even if pickling raises.
with open('G.p', 'wb') as graph_file:
    pickle.dump(G, graph_file)
with open('lexeme_dict.p', 'wb') as dict_file:
    pickle.dump(lexeme_dict, dict_file)