In [None]:
import codecs
import pickle
from os import listdir, makedirs
from os.path import isfile, join

import networkx as nx
import pandas as pd
from networkx.drawing.nx_pydot import write_dot

In [None]:
# Zero-based positions of the fields obtained by splitting an input row on tabs.
(
    lemma1, lemma2,
    cat1, cat2,
    origine_morpho, origine_def,
    BAP1, BAP2, BAPsize,
    FAP1, FAP2, FAPsize,
    radical,
    FAPtype,
) = range(14)

def FAPconverter(input_fap):
    """Simplify a FAP pattern string.

    The literal substring '(.+)' becomes 'X', and every '$' and '^'
    character is removed. Replacements are applied in that order.
    """
    substitutions = (('(.+)', 'X'), ('$', ''), ('^', ''))
    converted = input_fap
    for old, new in substitutions:
        converted = converted.replace(old, new)
    return converted

In [None]:
# Build the undirected lexeme graph G from the tab-separated pairs file.
#   * node id    : '<lemma>_<category>'
#   * node label : the node's own category (compared by node_match in the
#                  isomorphism cell)
#   * edge label : the two converted FAP patterns, sorted and joined by '-'
# The first line of the file is kept verbatim (minus the newline) in `header`.
header = ''
G = nx.Graph()
number_of_pairs = 0
number_of_unique_pairs = 0  # never updated by this cell; kept so the name stays defined
with codecs.open('glawinette-series.csv', 'r', encoding='utf-8') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            header = line.replace('\n','')
            continue
        elements = line.replace('\n','').replace(' ','').split('\t')
        v1 = elements[lemma1] + '_' + elements[cat1]
        v2 = elements[lemma2] + '_' + elements[cat2]
        # G is undirected, so a pair already seen in either direction is skipped.
        if G.has_edge(v1, v2):
            continue
        number_of_pairs += 1
        G.add_node(v1, label=elements[cat1])
        # Fix: the second lexeme must carry its own category (cat2), not cat1.
        G.add_node(v2, label=elements[cat2])
        pattern1 = FAPconverter(elements[FAP1])
        pattern2 = FAPconverter(elements[FAP2])
        # Sorting makes the edge label independent of the order of the pair.
        sorted_pattern = sorted([pattern1, pattern2])
        G.add_edge(v1, v2, label=sorted_pattern[0] + '-' + sorted_pattern[1])
print(number_of_pairs, 'pairs')

In [None]:
# Every connected component of G (a set of node ids) is treated as one family.
conn_comps = [component for component in nx.connected_components(G)]
number_of_families = len(conn_comps)
print('{} families'.format(number_of_families))

In [None]:
# Build H, the "graph of graphs": one node per family index, and an edge
# between two families whose subgraphs of G are isomorphic with matching
# node labels and edge labels.
def _same_label(attrs1, attrs2):
    """Return True when two node/edge attribute dicts carry the same 'label'."""
    return attrs1['label'] == attrs2['label']

# Families already attached to an earlier family; a set gives O(1) membership.
checked = set()
H = nx.Graph() # graph of graphs
for fam1 in range(number_of_families):
    if fam1 in checked:
        continue
    H.add_node(fam1)
    # The subgraph of fam1 does not depend on fam2, so build it once per outer step.
    G1 = nx.subgraph(G, conn_comps[fam1])
    for fam2 in range(fam1 + 1, number_of_families):
        if fam2 in checked:
            continue
        G2 = nx.subgraph(G, conn_comps[fam2])
        if nx.is_isomorphic(G1, G2, node_match=_same_label, edge_match=_same_label):
            # fam2 is linked to fam1 and is never used as a comparison base itself.
            checked.add(fam2)
            H.add_edge(fam1, fam2)
        # Progress indicator, overwritten in place via the carriage return.
        print(str(fam1) + ' ' + str(fam2) + '   ', end='\r')

In [None]:
# Save H to disk; the context manager closes the file even if pickling fails.
with open('glawinette_isomorphy_graph.p', 'wb') as isomorphy_graph:
    pickle.dump(H, isomorphy_graph)

In [None]:
# Reload H from disk; the context manager makes sure the file handle is closed.
# NOTE(review): unpickling can execute arbitrary code -- only load a file that
# was written by the dump cell of this notebook.
with open('glawinette_isomorphy_graph.p', 'rb') as pickle_file:
    H = pickle.load(pickle_file)

In [None]:
# Each connected component of H is a set of family indices; smallest groups come first.
H_conn_comps = sorted(nx.connected_components(H), key=len)
print('{} groups'.format(len(H_conn_comps)))

In [None]:
def find_representative(K):
    """Pick the representative of a family graph and return its lemma part.

    The representative is the node with the highest degree; among nodes tied
    on degree the shortest node id wins, and among those the first one in
    iteration order. Only nodes with a strictly positive degree are eligible;
    when there is none, the empty string is returned.

    The return value is the part of the chosen node id before the first '_'.
    """
    degree_of = K.degree
    eligible = [node for node in K.nodes if degree_of[node] > 0]
    if not eligible:
        return ''
    # max() keeps the first maximal element, which preserves first-seen tie-breaking.
    chosen = max(eligible, key=lambda node: (degree_of[node], -len(node)))
    return chosen.split('_')[0]

# Creation of txt file for each family

In [None]:
# For every family: write one row to the families summary, create its
# (header-only) family file, and record in lexeme_dict which file each lexeme
# belongs to. Families in a multi-family group get a '-<index>' suffix.
output_folder = 'glawinette_families'
# Create the target folder so the cell also works on a fresh checkout.
makedirs(output_folder, exist_ok=True)
lexeme_dict = dict()
with codecs.open('summary_of_glawinette_families.txt', 'w+', encoding='utf-8') as f_summary:
    f_summary.write('family_id\tnumber_of_lexemes\tlexemes\n')
    for fam_fam_id, fam_fam in enumerate(H_conn_comps):
        group_title = 'F' + str(fam_fam_id).rjust(5, '0')
        # Width needed to zero-pad the largest index inside this group.
        index_width = len(str(len(fam_fam) - 1))
        # fam_fam is a set; sorting makes the per-group numbering reproducible.
        for fam_id, fam in enumerate(sorted(fam_fam)):
            members = conn_comps[fam]
            representative = find_representative(nx.subgraph(G, members))
            if len(fam_fam) == 1:
                family_title = group_title
            else:
                family_title = group_title + '-' + str(fam_id).rjust(index_width, '0')
            f_summary.write(family_title + '\t'
                            + str(len(members)) + '\t' + str(members) + '\n')
            filename = family_title + ' ' + representative + '.txt'
            for lexeme in members:
                lexeme_dict[lexeme] = filename
            # The file only needs to be (re)created once per family, not once per lexeme.
            with codecs.open(join(output_folder, filename), 'w+', encoding='utf-8') as f_out:
                f_out.write(header + '\tfichier_origine' + '\n\n')

In [None]:
# Append every data row of the input file to the family file of its first
# lexeme. Files are opened in append mode, so re-running this cell without
# re-running the cell that creates the family files duplicates the rows.
with codecs.open('glawinette-series.csv', 'r', encoding='utf-8') as f:
    for line_num, line in enumerate(f):
        if line_num < 1:
            continue  # skip the header line
        elements = line.replace(' ','').split('\t')
        lexeme1 = elements[lemma1] + '_' + elements[cat1]
        output_filename = lexeme_dict[lexeme1]
        # The context manager closes the handle even if the write fails.
        with codecs.open(join(output_folder, output_filename), 'a+', encoding='utf-8') as f_out:
            f_out.write(line)

# Creation of graph

In [None]:
def edge_writer(L):
    """Render the edges of L as one string.

    Each edge contributes '<cat> - <edge label> - <cat>; ', where <cat> is the
    second '_'-separated field of the corresponding endpoint id. The pieces are
    concatenated in edge iteration order; the trailing '; ' is left in place.
    """
    pieces = []
    for source, target, attrs in L.edges(data=True):
        source_cat = source.split('_')[1]
        target_cat = target.split('_')[1]
        pieces.append(' - '.join((source_cat, attrs['label'], target_cat)) + '; ')
    return ''.join(pieces)

# Re-read every family file, rebuild its graph, export it as a .dot file and
# write one summary row per group. A group is the run of consecutive files
# sharing the id before the first '-' of the file name, so the file list must
# be sorted. The counts and pairs of a row are taken from the last family read
# for that group.
def _write_group_summary(out, group_id, graph, n_families, families_text):
    """Write one tab-separated summary row for a finished group."""
    out.write(group_id + '\t' + str(len(graph)) + '\t' + str(graph.size()) + '\t' + str(n_families) + '\t')
    out.write(edge_writer(graph)[:-2])  # drop the trailing '; '
    out.write('\t' + families_text[:-2] + '\n')

# NOTE(review): these two lists are never filled in this cell, and the second
# one rebinds `number_of_families` (an int used by the isomorphism cell).
# Kept only so the names stay defined for anything relying on them.
number_of_edges = []
number_of_families = []
words = ''
group_prec = ''
family_count = 0
L = nx.Graph()
input_dir = 'glawinette_families'
visualization_dir = 'glawinette_visualization'
# Create the target folder so the cell also works on a fresh checkout.
makedirs(visualization_dir, exist_ok=True)
# listdir() returns entries in arbitrary order; sort so that the files of one
# group are consecutive. Entries other than .txt files are ignored.
input_files = sorted(f for f in listdir(input_dir)
                     if isfile(join(input_dir, f)) and f.endswith('.txt'))
with codecs.open('summary_of_glawinette_groups.txt', 'w+', encoding='utf-8') as f_summary:
    f_summary.write('group id\tnumber of lexemes\tnumber of pairs\tnumber of families\tpairs\tfamilies\n')
    for input_file in input_files:
        group_id = input_file.split(' ')[0].split('-')[0]
        if group_id != group_prec and group_prec != '':
            # A new group starts: flush the row of the previous one.
            _write_group_summary(f_summary, group_prec, L, family_count, words)
            family_count = 0
            words = ''
        family_count += 1
        group_prec = group_id
        L = nx.Graph()
        with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f):
                if line_num < 2:
                    continue  # header line and the blank line after it
                elements = line.replace('\n','').replace(' ','').split('\t')
                v1 = elements[lemma1] + '_' + elements[cat1]
                v2 = elements[lemma2] + '_' + elements[cat2]
                if L.has_edge(v1, v2):
                    continue
                L.add_node(v1, label=elements[lemma1] + '\n' + elements[cat1])
                L.add_node(v2, label=elements[lemma2] + '\n' + elements[cat2])
                pattern1 = FAPconverter(elements[FAP1])
                pattern2 = FAPconverter(elements[FAP2])
                sorted_pattern = sorted([pattern1, pattern2])
                L.add_edge(v1, v2, label=sorted_pattern[0] + '-' + sorted_pattern[1])
        words += str(list(L.nodes())) + '; '
        write_dot(L, join(visualization_dir, input_file.replace('.txt','.dot')))
        # Progress indicator, overwritten in place via the carriage return.
        print(input_file.split(' ')[0], end='\r')
    # Flush the last group; nothing is written when no family file was found.
    if group_prec != '':
        _write_group_summary(f_summary, group_prec, L, family_count, words)