In [2]:
import codecs
import pickle
from os import listdir, makedirs
from os.path import isfile, join

import networkx as nx
import pandas as pd

In [4]:
# Column numbers: zero-based position of each field within a
# tab-separated row, listed in left-to-right order.
(
    lemma1, lemma2,
    cat1, cat2,
    origine_morpho, origine_def,
    BAP1, BAP2, BAPsize,
    FAP1, FAP2, FAPsize,
    radical,
    FAPtype,
) = range(14)

def FAPconverter(input_fap):
    """Return ``input_fap`` with its regex markers replaced by ``'X'``.

    Every literal occurrence of the substring ``'^(.+)'`` is replaced first,
    then every remaining ``'$'`` character; both become ``'X'``. The
    replacements are plain substring substitutions, not regex matches.
    """
    converted = input_fap
    for marker in ('^(.+)', '$'):
        converted = converted.replace(marker, 'X')
    return converted

In [15]:
# Build the undirected graph G from the tab-separated file 'glawinette.csv'.
#   * The first line is kept verbatim (minus its newline) in `header`.
#   * Every following row contributes one edge between the two nodes
#     '<lemma>_<category>'; each node is labelled with its own category and
#     the edge is labelled with the two converted FAP patterns, sorted and
#     joined by '-'.
#   * A row whose edge already exists is skipped, so `number_of_pairs`
#     counts distinct pairs only.
header = ''
G = nx.Graph()
number_of_pairs = 0
number_of_unique_pairs = 0  # never updated in this cell; kept for compatibility
with codecs.open('glawinette.csv', 'r', encoding='utf-8') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            header = line.replace('\n','')
            continue
        if not line.strip():
            # A blank line (e.g. trailing newline at end of file) has no
            # columns to index; skip it instead of raising IndexError.
            continue
        # Spaces are stripped from every field before splitting on tabs.
        elements = line.replace('\n','').replace(' ','').split('\t')
        v1 = elements[lemma1] + '_' + elements[cat1]
        v2 = elements[lemma2] + '_' + elements[cat2]
        if G.has_edge(v1, v2):
            continue
        number_of_pairs += 1
        G.add_node(v1, label=elements[cat1])
        # Fix: the second node must carry its own category (cat2), not cat1,
        # otherwise label-based node matching compares the wrong categories.
        G.add_node(v2, label=elements[cat2])
        pattern1 = FAPconverter(elements[FAP1])
        pattern2 = FAPconverter(elements[FAP2])
        # Sorting makes the edge label independent of the column order.
        sorted_pattern = sorted([pattern1, pattern2])
        G.add_edge(v1, v2, label=sorted_pattern[0] + '-' + sorted_pattern[1])
print(number_of_pairs, 'pairs')

14 pairs


In [17]:
# Each connected component of G (a set of node names) is one family.
conn_comps = [component for component in nx.connected_components(G)]
number_of_families = len(conn_comps)
print(f'{number_of_families} families')

13 families


In [18]:
# Build H, the "graph of graphs": one node per family that is the first of
# its isomorphism class, with an edge from that family to every later family
# whose subgraph of G is isomorphic to it (same node labels, same edge labels).
def _same_label(attrs1, attrs2):
    """Return True when two node/edge attribute dicts share the same 'label'."""
    return attrs1['label'] == attrs2['label']

# Families already attached to an earlier one. A set gives O(1) membership
# tests; the original list made every `in` check linear in its length.
checked = set()
H = nx.Graph() # graph of graphs
for fam1 in range(number_of_families):
    if fam1 in checked:
        continue
    H.add_node(fam1)
    # Loop-invariant: the subgraph of fam1 does not depend on fam2.
    G1 = nx.subgraph(G, conn_comps[fam1])
    for fam2 in range(fam1 + 1, number_of_families):
        if fam2 in checked:
            continue
        G2 = nx.subgraph(G, conn_comps[fam2])
        if nx.is_isomorphic(G1, G2, node_match=_same_label, edge_match=_same_label):
            checked.add(fam2)
            H.add_edge(fam1, fam2)
        # Progress indicator, overwritten in place thanks to the carriage return.
        print(str(fam1) + ' ' + str(fam2) + '   ', end='\r')

0 1   0 2   0 3   0 4   0 5   0 6   0 7   0 8   0 9   0 10   0 11   0 12   1 2   1 3   1 4   1 5   1 6   1 7   1 8   1 9   1 10   1 11   1 12   2 3   2 4   2 5   2 6   2 7   2 8   2 9   2 10   2 11   2 12   3 4   3 5   3 6   3 7   3 8   3 9   3 10   3 11   3 12   4 5   4 6   4 7   4 8   4 9   4 10   4 11   4 12   5 6   5 7   5 8   5 9   5 10   5 11   5 12   6 7   6 8   6 9   6 10   6 11   6 12   7 8   7 9   7 10   7 11   7 12   8 9   8 10   8 11   8 12   9 10   9 11   9 12   10 11   10 12   11 12   

In [10]:
# Persist the isomorphism graph H to disk. The `with` block guarantees the
# file is closed even if pickling raises.
with open('glawinette_isomorphy_graph.p', 'wb') as isomorphy_graph:
    pickle.dump(H, isomorphy_graph)

In [11]:
# Reload the isomorphism graph H from disk.
# NOTE(security): pickle.load can execute arbitrary code; only load a file
# that was produced by this notebook, never one from an untrusted source.
with open('glawinette_isomorphy_graph.p', 'rb') as isomorphy_graph_file:
    H = pickle.load(isomorphy_graph_file)

In [19]:
# Connected components of H (sets of family indices), smallest first.
H_conn_comps = sorted(nx.connected_components(H), key=len)
print(f'{len(H_conn_comps)} groups')

13 groups


In [22]:
def find_representative(K):
    """Return the representative of a family graph.

    The representative is the node with the highest degree; ties are broken
    by the shortest node name, then by the first node in iteration order.
    The returned value is the part of that node name before its first '_'.

    Parameters
    ----------
    K : graph-like
        Object exposing ``K.nodes`` (iterable of string node names) and
        ``K.degree[node]`` (degree lookup), e.g. a networkx graph.

    Returns
    -------
    str
        The selected name prefix, or ``''`` when ``K`` has no nodes.
    """
    nodes = list(K.nodes)
    if not nodes:
        return ''
    # `min` keeps the first minimal element, which reproduces the original
    # tie-breaking while also handling graphs whose nodes all have degree 0
    # (the original left the selection empty in that case).
    selected_node = min(nodes, key=lambda n: (-K.degree[n], len(n)))
    return selected_node.split('_')[0]

In [23]:
# Write one summary file listing every family, and create one (header-only)
# text file per family inside `output_folder`. `lexeme_dict` maps every
# lexeme (node name of G) to the filename of its family.
output_folder = 'glawinette_families'
# Create the destination folder if needed so a fresh run does not fail.
makedirs(output_folder, exist_ok=True)
lexeme_dict = dict()
with codecs.open('summary_of_glawinette_families.txt', 'w+', encoding='utf-8') as f_summary:
    f_summary.write('family_id\tnumber_of_lexemes\tlexemes\n')
    for fam_fam_id, fam_fam in enumerate(H_conn_comps):
        # Group identifier: 'F' followed by the group index padded to 5 digits.
        group_title = 'F' + str(fam_fam_id).rjust(5, '0')
        # Width needed to print the largest sub-index of this group.
        sub_id_width = len(str(len(fam_fam) - 1))
        for fam_id, fam in enumerate(fam_fam):
            members = conn_comps[fam]
            representative = find_representative(nx.subgraph(G, members))
            if len(fam_fam) == 1:
                family_title = group_title
            else:
                # Groups with several families get a '-<sub-index>' suffix.
                family_title = group_title + '-' + str(fam_id).rjust(sub_id_width, '0')
            # NOTE: `members` is a set, so the order of lexemes in this
            # column follows set iteration order.
            f_summary.write(family_title + '\t'
                            + str(len(members)) + '\t' + str(members) + '\n')
            # The family file is the same for every lexeme of the family:
            # create it once instead of re-truncating it per lexeme.
            filename = family_title + ' ' + representative + '.txt'
            with codecs.open(join(output_folder, filename), 'w+', encoding='utf-8') as f_out:
                f_out.write(header + '\tfichier_origine' + '\n\n')
            for lexeme in members:
                lexeme_dict[lexeme] = filename