In [1]:
import ast
import codecs
import pickle
from os import listdir, makedirs
from os.path import isfile, join

import networkx as nx

from utils import printProgressBar

In [21]:
# Directory that holds the tab-separated pair files.
input_dir = 'pairs'

# Pair file name -> one-letter code; the code is what gets written in the
# extra `fichier_origine` column of the per-family files.
input_files = {
    'converts-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'A',
    'demonette1-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'B',
    'denomXaire-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'C',
    'denomXal-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'D',
    'denomXique-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'E',
    'dimocXaie-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'F',
    'dimocXat-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'G',
    'dimocXet-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'H',
    'dimocXier-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv': 'I',
    'derifde1X-formate-V4-nosem-utf8-lid-newtagset-entete-new-rid.csv': 'J',
    'derifXable-formate-V4-nosem-utf8-lid-newtagset-entete-new-rid.csv': 'K',
}

# Zero-based positions of the columns read from each tab-separated row.
graph_1, graph_2 = 3, 6        # written forms of the two lexemes
cat_1, cat_2 = 8, 10           # their category tags
cstr_1, cstr_2 = 14, 17
complexite = 19
orientation = 21
fichier_origine = 43

In [14]:
# Build one directed graph over all pair files. Nodes are "<form>_<category>"
# identifiers labelled with the category; edges are labelled with a string
# describing the relation between the two lexemes.
header = ''
G = nx.DiGraph()
forward_orientations = ('as2de', 'as2des')    # edge goes lexeme 1 -> lexeme 2
backward_orientations = ('de2as', 'des2as')   # edge goes lexeme 2 -> lexeme 1
for input_file in input_files:
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num == 0:
                # Column header; reused later as the header of the family files.
                header = line.replace('\n', '')
                continue
            if line_num < 2:
                # The second line of each file is not a data row.
                continue
            elements = line.replace('\n', '').replace(' ', '').split('\t')
            node_1 = elements[graph_1] + '_' + elements[cat_1]
            G.add_node(node_1, label=elements[cat_1])
            node_2 = elements[graph_2] + '_' + elements[cat_2]
            G.add_node(node_2, label=elements[cat_2])
            direction = elements[orientation]
            if direction in forward_orientations:
                edge_type = elements[complexite] + '_' + elements[cstr_1] + '-' + elements[cstr_2]
                G.add_edge(node_1, node_2, label=edge_type)
            elif direction in backward_orientations:
                edge_type = elements[complexite] + '_' + elements[cstr_2] + '-' + elements[cstr_1]
                G.add_edge(node_2, node_1, label=edge_type)
            else:
                # Any other orientation: link both ways under one label whose
                # two cstr values are put in sorted order.
                sorted_cstr = sorted([elements[cstr_1], elements[cstr_2]])
                edge_type = (elements[complexite] + '_' + direction + '_'
                             + sorted_cstr[0] + '-' + sorted_cstr[1])
                G.add_edge(node_1, node_2, label=edge_type)
                G.add_edge(node_2, node_1, label=edge_type)

In [15]:
# Each weakly connected component of G (a set of node ids) is one family.
conn_comps = [component for component in nx.weakly_connected_components(G)]
number_of_families = len(conn_comps)
number_of_families

13178

In [16]:
# Group families whose graphs are isomorphic under matching node and edge
# labels. H is a graph of graphs: its nodes are indices into conn_comps, and
# an edge links the first family found with a given structure to every later
# family that shares it.
def _same_label(attrs_1, attrs_2):
    """Return True when two node (or edge) attribute dicts carry the same label."""
    return attrs_1['label'] == attrs_2['label']


checked = set()  # families already attached to an earlier representative
H = nx.Graph() # graph of graphs
for fam1 in range(number_of_families):
    if fam1 not in checked:
        H.add_node(fam1)
        # Invariant for the whole inner loop, so build it once per fam1.
        G1 = nx.subgraph(G, conn_comps[fam1])
        for fam2 in range(fam1 + 1, number_of_families):
            if fam2 in checked:
                continue
            G2 = nx.subgraph(G, conn_comps[fam2])
            if nx.is_isomorphic(G1, G2, node_match=_same_label, edge_match=_same_label):
                checked.add(fam2)
                H.add_edge(fam1, fam2)
    # Updated for skipped families as well, so the bar can reach 100% even
    # when the last family was already matched.
    printProgressBar(fam1 + 1, number_of_families, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)

Progress: |█████████████████████████████████████████████████-| 99.99% complete

In [17]:
# Groups of mutually isomorphic families, ordered from the smallest group to
# the largest (sorted() is stable, so equal-sized groups keep their order).
H_conn_comps = sorted(nx.connected_components(H), key=len)
print(len(H_conn_comps))

3903


In [15]:
# Spot check: display the node ids that make up the family at index 65.
conn_comps[65]

{'ajour_Nm', 'ajourage_Nm', 'ajourement_Nm', 'ajourer_V'}

In [21]:
# Write one summary row per family and create one file per family that, for
# now, holds only the header. `lexeme_dict` records which file each lexeme
# belongs to.
output_folder = 'families'
makedirs(output_folder, exist_ok=True)  # do not fail on a fresh checkout
lexeme_dict = dict()  # node id -> name of the family file that contains it
with codecs.open('summary_of_families.txt', 'w+', encoding='utf-8') as f_summary:
    f_summary.write('family_id\tnumber_of_lexemes\tlexemes\n')
    for fam_fam_id, fam_fam in enumerate(H_conn_comps):
        group_title = 'F' + str(fam_fam_id).rjust(5, '0')
        # Digits needed to zero-pad the highest member index of this group.
        member_width = len(str(len(fam_fam) - 1))
        for fam_id, fam in enumerate(fam_fam):
            members = conn_comps[fam]
            # Alphabetically first node id; used to make the file name readable.
            first_member = min(members)
            if len(fam_fam) == 1:
                family_title = group_title
            else:
                family_title = group_title + '-' + str(fam_id).rjust(member_width, '0')
            f_summary.write(family_title + '\t' + str(len(members)) + '\t' + str(members) + '\n')
            filename = family_title + ' ' + first_member + '.txt'
            # The file is the same for every lexeme of the family, so create it
            # once instead of truncating and rewriting it per lexeme. The header
            # is followed by an empty line: data rows start at the third line.
            with codecs.open(join(output_folder, filename), 'w+', encoding='utf-8') as f_out:
                f_out.write(header + '\tfichier_origine' + '\n\n')
            for lexeme in members:
                lexeme_dict[lexeme] = filename

In [22]:
# Append every pair row to the file of the family that contains its first
# lexeme, adding the one-letter source-file code as a final column.
# NOTE: expects `input_dir` / `input_files` to still describe the pair files.
known_complexities = ('simple', 'complexe', 'motiv-form', 'motiv-sem', 'accidentel')
for input_file in input_files:
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num < 2:
                # Skip the two leading non-data lines.
                continue
            elements = line.replace(' ','').split('\t')
            lexeme1 = elements[graph_1] + '_' + elements[cat_1]
            if elements[complexite] not in known_complexities:
                # Unexpected value in the `complexite` column: report the file.
                print('warning ', input_file)
            output_filename = lexeme_dict[lexeme1]
            # `with` guarantees the handle is released even if the write fails.
            with codecs.open(join(output_folder, output_filename), 'a+', encoding='utf-8') as f_out:
                f_out.write(line.strip('\n') + '\t' + input_files[input_file] + '\n')

# Verification of double arcs (lexeme pairs recorded with conflicting descriptions)

In [26]:
def reverse_direction(input_orientation):
    """Swap 'de2as' and 'as2de'; return any other value unchanged."""
    if input_orientation == 'de2as':
        return 'as2de'
    if input_orientation == 'as2de':
        return 'de2as'
    return input_orientation


def line_writer(input_line):
    """Reduce one tab-separated pair row to the columns written to the duplicates report.

    Spaces and the newline are removed, and 'des2as' / 'as2des' are rewritten
    to 'de2as' / 'as2de' anywhere in the row, before it is split on tabs.

    Returns the two lexemes with their categories, the orientation, the
    complexity, both cstr values and the source-file column, tab-separated
    and terminated by a newline.
    """
    # Parse the argument itself; the previous code read the global `line`
    # and silently ignored `input_line`.
    normalised = (input_line.replace('\n','').replace(' ','')
                  .replace('des2as', 'de2as').replace('as2des', 'as2de'))
    elements = normalised.split('\t')
    selected_columns = (graph_1, cat_1, graph_2, cat_2, orientation, complexite,
                        cstr_1, cstr_2, fichier_origine)
    return '\t'.join(elements[column] for column in selected_columns) + '\n'


# Scan every family file for lexeme pairs that occur more than once with a
# different description, and write both rows to the duplicates report.
# The cell uses its own names (`families_dir`, `family_files`, `pair_graph`)
# so that the pair-file configuration and the main graph `G` built by earlier
# cells are not overwritten and those cells stay re-runnable.
families_dir = 'families'
# Sorted so the report comes out in the same order on every run.
family_files = sorted(name for name in listdir(families_dir) if isfile(join(families_dir, name)))
with codecs.open('paires_doublons.txt', 'w+', encoding='utf-8') as f_duplicates:
    f_duplicates.write('graph_1\tcat_1\tgraph_2\tcat_2\torientation\tcomplexite\tcstr_1\tcstr_2\tfichier_origine\n')
    for counter, family_file in enumerate(family_files, start=1):
        # One undirected graph per family: an edge remembers the first
        # description seen for that pair of lexemes.
        pair_graph = nx.Graph()
        with codecs.open(join(families_dir, family_file), 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f):
                if line_num < 2:
                    # Header line and the empty line after it.
                    continue
                elements = line.replace('\n','').replace(' ','').replace('des2as', 'de2as').replace('as2des', 'as2de').split('\t')
                v1 = elements[graph_1] + '_' + elements[cat_1]
                v2 = elements[graph_2] + '_' + elements[cat_2]
                if v1 > v2:
                    # Describe the pair as seen from the smaller node id so
                    # both writing orders of a pair give the same label.
                    edge_label = elements[complexite] + '_' + reverse_direction(elements[orientation]) + '_'\
                    + elements[cstr_2] + '-' + elements[cstr_1]
                else:
                    edge_label = elements[complexite] + '_' + elements[orientation] + '_' \
                    + elements[cstr_1] + '-' + elements[cstr_2]
                if not pair_graph.has_edge(v1, v2):
                    pair_graph.add_edge(v1, v2, label=edge_label, complete_line=line_writer(line), checked=False)
                    continue
                first_seen = pair_graph[v1][v2]
                if first_seen['label'] != edge_label and not first_seen['checked']:
                    f_duplicates.write(first_seen['complete_line'])
                    # The extra newline leaves an empty line after each reported pair.
                    f_duplicates.write(line_writer(line) + '\n')
                    # Report a given pair only once.
                    first_seen['checked'] = True
        # `counter` is already 1-based, so the last file reports exactly 100%.
        printProgressBar(counter, len(family_files), prefix='Progress:', suffix='complete', length=50, decimals=2)

Progress: |██████████████████████████████████████████████████| 100.00% complete
Progress: |██████████████████████████████████████████████████| 100.01% complete

# Verification of families with non-connected graphs (families that may belong together)

In [4]:
# Reload the family summary as: family id -> set of written forms, i.e. each
# lexeme id cut at its first underscore.
family_dict = dict()
with codecs.open('summary_of_families.txt', 'r', encoding='utf-8') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            # Column header.
            continue
        cols = line.split('\t')
        # The third column holds the repr of a Python set of strings. Parse it
        # as a literal: stripping braces and quotes by hand corrupts any lexeme
        # that contains an apostrophe, because its repr is double-quoted.
        lexemes = ast.literal_eval(cols[2].strip())
        family_dict[cols[0]] = {lexeme.split('_')[0] for lexeme in lexemes}
print(len(family_dict))

13178


In [24]:
# Compare every pair of families and report those in which a form of one
# family is a substring of a form of the other (or the reverse).
def _first_overlap(forms_1, forms_2):
    """Return the first (s1, s2) where one string contains the other, else None.

    Pairs are tried in the iteration order of `forms_1`, then `forms_2`.
    """
    for s1 in forms_1:
        for s2 in forms_2:
            if s1 in s2 or s2 in s1:
                return s1, s2
    return None


family_dict_keys = list(family_dict.keys())
with codecs.open('non-connected_families.txt', 'w+', encoding='utf-8') as f_out:
    f_out.write('family_id_1\tfamily_id_2\tsuspected\tlength_1\tlength_2\n')
    for k1 in range(0, len(family_dict)):
        key1 = family_dict_keys[k1]
        set1 = family_dict[key1]  # invariant for the whole inner loop
        for k2 in range(k1+1, len(family_dict)):
            key2 = family_dict_keys[k2]
            set2 = family_dict[key2]
            overlap = _first_overlap(set1, set2)
            if overlap is not None:
                s1, s2 = overlap
                # Only the first overlapping pair of forms is reported,
                # together with the lengths of the two strings.
                suspected = s1 + '-' + s2 + '\t' + str(len(s1)) + '\t' + str(len(s2))
                f_out.write(key1 + '\t' + key2 + '\t' + suspected + '\n')
        printProgressBar(k1 + 1, len(family_dict), prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)

Progress: |██████████████████████████████████████████████████| 100.00% complete
