In [1]:
import codecs
import pickle
from os import listdir, makedirs
from os.path import isfile, join

import networkx as nx
import pandas as pd

from utils import printProgressBar

In [2]:
# Folder holding the pair files; every regular file directly inside it is an input.
input_dir = 'pairs'
input_files = list(filter(lambda name: isfile(join(input_dir, name)), listdir(input_dir)))

# Short code written in the extra "fichier_origine" column, keyed by the part of
# the input file name that precedes the first '-'.
input_file_code = {
    'converts': 'C',
    'demonette1': 'D',

    'denomCONVX': 'Na',
    'denomPREFX': 'Nb',
    'denomXaire': 'Nc',
    'denomXal': 'Nd',
    'denomXique': 'Ne',
    'denomXSUF1': 'Nf',
    'denomXSUF2': 'Ng',
    'denomXSUF3': 'Nh',
    'denomXSUF4': 'Ni',
    'denomXSUF5': 'Nj',
    'denomXSUF6': 'Nk',

    'dimocXaie': 'Ma',
    'dimocXat': 'Mb',
    'dimocXet': 'Mc',
    'dimocXier': 'Md',

    'derifantiX': 'Ra',
    'derifde1X': 'Rb',
    'derifenX': 'Rc',
    'derifinX': 'Rd',
    'derifQUANTX': 'Re',
    'derifreX': 'Rf',
    'deriftriX': 'Rg',
    'derifXable': 'Rh',
    'derifXiser': 'Ri',
}

# Zero-based positions of the fields read from each tab-separated line.
graph_1, graph_2 = 3, 6            # written forms of the two lexemes of a pair
cat_1, cat_2 = 8, 10               # their categories
cstr_1, cstr_2 = 14, 17            # the two values combined into edge labels
complexite, orientation = 19, 21   # complexity value and orientation of the pair
fichier_origine = 43               # origin code, the column appended to the family files

In [3]:
def find_representative(K):
    """Return the written form of the lexeme chosen to name the family graph K.

    A node is a root when no "direct" edge points at it, i.e. no incoming edge
    whose label contains neither 'NA' nor 'indirect'.

    * exactly one root: that root is the representative;
    * several roots: the first root with the highest out-degree (the first
      root when all of them have out-degree 0);
    * no root: the first node with the highest out-degree ('' when no node has
      a positive out-degree).

    The returned value is the part of the node id before its first '_'.
    """
    # Count, for every node, the incoming edges that are neither NA nor indirect.
    direct_in = {node: 0 for node in K.nodes}
    for _source, target, attrs in K.edges.data():
        tag = attrs['label']
        if 'NA' not in tag and 'indirect' not in tag:
            direct_in[target] += 1
    roots = [node for node, count in direct_in.items() if count == 0]

    if len(roots) == 1:
        return roots[0].split('_')[0]

    # Otherwise pick by out-degree, among the roots if any, else among all nodes.
    if roots:
        candidates, winner = roots, roots[0]
    else:
        candidates, winner = list(K.nodes), ''
    best = 0
    for candidate in candidates:
        degree = K.out_degree(candidate)
        if degree > best:  # strict: ties keep the earliest candidate
            best, winner = degree, candidate
    return winner.split('_')[0]
    
def category_shortening(cat):
    """Collapse every category starting with 'N' into 'N', except 'Num'.

    Any other category, including the empty string, is returned unchanged.
    """
    # startswith() instead of cat[0]: an empty category field must not raise IndexError.
    if cat != 'Num' and cat.startswith('N'):
        return 'N'
    return cat

In [4]:
# Build the directed graph G of lexeme pairs. Nodes are "<form>_<category>" ids
# labelled with the shortened category; a pair already linked in either
# direction is counted in number_of_pairs but not added again.
header = ''
G = nx.DiGraph()
number_of_pairs = 0
number_of_unique_pairs = 0
for input_file in input_files:
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num == 0:
                # First line: column names, reused when the family files are written.
                header = line.replace('\n', '')
                continue
            if line_num < 2:
                # Second line is not treated as data.
                continue
            number_of_pairs += 1
            elements = line.replace('\n', '').replace(' ', '').split('\t')
            v1 = '_'.join((elements[graph_1], elements[cat_1]))
            v2 = '_'.join((elements[graph_2], elements[cat_2]))
            if G.has_edge(v1, v2) or G.has_edge(v2, v1):
                continue
            G.add_node(v1, label=category_shortening(elements[cat_1]))
            G.add_node(v2, label=category_shortening(elements[cat_2]))
            forward_label = elements[cstr_1] + '-' + elements[cstr_2]  # without complexity
            backward_label = elements[cstr_2] + '-' + elements[cstr_1]
            if elements[orientation] in ('as2de', 'as2des'):
                G.add_edge(v1, v2, label=forward_label)
            elif elements[orientation] in ('de2as', 'des2as'):
                G.add_edge(v2, v1, label=backward_label)
            else:
                # Any other orientation: link both ways and keep the orientation in the label.
                orientation_suffix = '_' + elements[orientation]
                G.add_edge(v1, v2, label=forward_label + orientation_suffix)
                G.add_edge(v2, v1, label=backward_label + orientation_suffix)
            number_of_unique_pairs += 1
print(number_of_pairs, 'pairs')
print(number_of_unique_pairs, 'unique pairs')

103377 pairs
51830 unique pairs


In [5]:
# Each weakly connected component of G is counted as one family.
conn_comps = [component for component in nx.weakly_connected_components(G)]
number_of_families = len(conn_comps)
number_of_families

13897

In [6]:
# Group families whose graphs are isomorphic (same node labels, same edge labels).
# H has one node per group leader and an edge from the leader to every family
# found isomorphic to it.
checked = set()  # families already attached to a leader (set: O(1) membership tests)
H = nx.Graph() # graph of graphs
for fam1 in range(0, number_of_families):
    if fam1 not in checked:
        H.add_node(fam1)
        # Built once per leader rather than once per comparison.
        G1 = nx.subgraph(G, conn_comps[fam1])
        for fam2 in range(fam1 + 1, number_of_families):
            if fam2 in checked:
                continue
            G2 = nx.subgraph(G, conn_comps[fam2])
            if nx.is_isomorphic(G1, G2, node_match=lambda v1,v2: v1['label'] == v2['label'],\
                                edge_match=lambda e1,e2: e1['label'] == e2['label']):
                checked.add(fam2)
                H.add_edge(fam1, fam2)
    # Reported for every family, including skipped ones, so the bar reaches 100%.
    printProgressBar(fam1 + 1, number_of_families, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)

Progress: |█████████████████████████████████████████████████-| 99.99% complete

In [7]:
# Cache the grouping so the expensive isomorphism cell need not be re-run.
# The context manager closes the file even if pickling fails.
with open('isomorphy_graph.p', 'wb') as isomorphy_graph:
    pickle.dump(H, isomorphy_graph)

In [8]:
# Reload the cached grouping. pickle.load can execute arbitrary code, so only
# load a file that this notebook wrote itself.
with open('isomorphy_graph.p', 'rb') as pickle_file:
    H = pickle.load(pickle_file)

In [9]:
# Isomorphism classes are the connected components of H, smallest class first.
H_conn_comps = sorted(nx.connected_components(H), key=len)
print(len(H_conn_comps))

4849


In [10]:
# Create one header-only file per family, write the summary table, and record
# in lexeme_dict which family file every lexeme belongs to.
output_folder = 'families'
makedirs(output_folder, exist_ok=True)  # the folder may not exist on a fresh run
lexeme_dict = dict()
with codecs.open('summary_of_families.txt', 'w+', encoding='utf-8') as f_summary:
    f_summary.write('family_id\tnumber_of_lexemes\tlexemes\n')
    for fam_fam_id, fam_fam in enumerate(H_conn_comps):
        for fam_id, fam in enumerate(fam_fam):
            representative = find_representative(nx.subgraph(G, conn_comps[fam]))
            # "F<class id>", plus "-<rank inside the class>" when the class has several families.
            family_title = 'F' + str(fam_fam_id).rjust(5, '0')
            if len(fam_fam) > 1:
                family_title += '-' + str(fam_id).rjust(len(str(len(fam_fam)-1)), '0')
            f_summary.write(family_title + '\t'\
                           + str(len(conn_comps[fam])) + '\t' + str(conn_comps[fam]) + '\n')
            # The file name does not depend on the lexeme: create the file once
            # per family instead of truncating it again for every lexeme.
            filename = family_title + ' ' + representative + '.txt'
            with codecs.open(join(output_folder, filename), 'w+', encoding='utf-8') as f_out:
                f_out.write(header + '\tfichier_origine' + '\n\n')
            for lexeme in conn_comps[fam]:
                lexeme_dict[lexeme] = filename

In [11]:
# Append every pair line to the file of the family that contains its first
# lexeme, adding the origin code of the input file as a last column.
for input_file in input_files:
    # The code depends only on the file name: look it up once per file.
    origin_code = input_file_code.get(input_file.split('-')[0])
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num < 2:
                continue
            if origin_code is None:
                # Explicit error instead of the "None + str" TypeError the write would raise.
                raise KeyError('no origin code registered for input file ' + repr(input_file))
            elements = line.replace(' ','').split('\t')
            lexeme1 = elements[graph_1] + '_' + elements[cat_1]
            if elements[complexite] not in ['simple', 'complexe', 'motiv-form', 'motiv-sem', 'accidentel']:
                print('warning ', input_file)
            output_filename = lexeme_dict[lexeme1]
            # Opened per line on purpose: keeping one handle per family open could
            # exceed the open-file limit. The context manager closes it on error too.
            with codecs.open(join(output_folder, output_filename), 'a+', encoding='utf-8') as f_out:
                f_out.write(line.strip('\n') + '\t' + origin_code + '\n')

# get all categories, lexemes, orientation, etc.

In [17]:
# Survey of the raw pair files: every category value that occurs, and every
# written form that is not purely alphabetic.
header = ''
list_of_categories = set()
lexemes_with_nonalpha = set()
input_dir = 'pairs'
input_files = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
for input_file in input_files:
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num == 0:
                header = line.replace('\n', '')
                continue
            if line_num < 2:
                continue
            elements = line.replace('\n', '').replace(' ', '').split('\t')
            # Show which second lexemes carry the 'More' category.
            if elements[cat_2] == 'More':
                print(elements[graph_2])
            list_of_categories.update((elements[cat_1], elements[cat_2]))
            for form_column in (graph_1, graph_2):
                if not elements[form_column].isalpha():
                    lexemes_with_nonalpha.add(elements[form_column])
print('categories: ' + str(list_of_categories))

centans
milleans
fauxjumeau
caractèrefemelle
caractèrefemelle
centans
caractèremâle
caractèremâle
caractèremâle
êtrevivant
fauxjumeau
categories: {'Npm', 'Nm', 'Nf', 'Adj', 'Nfp', 'Nx', 'Pro', 'Npf', 'V', 'Nmp', 'Npx', 'Num', 'More', 'Adv'}


# Verification of double arcs

In [None]:
def reverse_direction(input_orientation):
    """Swap 'de2as' and 'as2de'; any other value is returned unchanged."""
    if input_orientation == 'de2as':
        return 'as2de'
    if input_orientation == 'as2de':
        return 'de2as'
    return input_orientation


def line_writer(input_line):
    """Format one pair line as a row of the duplicate report.

    The line is stripped of newlines and spaces, 'des2as'/'as2des' are
    normalised to 'de2as'/'as2de', and the fields graph_1, cat_1, graph_2,
    cat_2, orientation, complexite, cstr_1, cstr_2 and fichier_origine are
    returned tab-separated, followed by a newline.

    Raises IndexError when the line has fewer fields than the highest column
    index used.
    """
    # Read the argument itself: the previous body used the notebook-level
    # variable `line` and only worked when the caller happened to pass that.
    elements = (input_line.replace('\n', '').replace(' ', '')
                .replace('des2as', 'de2as').replace('as2des', 'as2de').split('\t'))
    report_columns = (graph_1, cat_1, graph_2, cat_2, orientation, complexite,
                      cstr_1, cstr_2, fichier_origine)
    return '\t'.join(elements[column] for column in report_columns) + '\n'


# Report pairs that occur twice inside a family file with different
# complexity / orientation / construction information.
# NOTE: this cell rebinds the notebook-level names input_dir, input_files and G.
input_dir = 'families'
input_files = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
counter = 0
with codecs.open('paires_doublons.txt', 'w+', encoding='utf-8') as f_duplicates:
    f_duplicates.write('graph_1\tcat_1\tgraph_2\tcat_2\torientation\tcomplexite\tcstr_1\tcstr_2\tfichier_origine\n')
    for input_file in input_files:
        # One undirected graph per family file: an edge stores the first line seen for a pair.
        G = nx.Graph()
        with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f):
                if line_num < 2:
                    continue
                elements = line.replace('\n','').replace(' ','').replace('des2as', 'de2as').replace('as2des', 'as2de').split('\t')
                v1 = elements[graph_1] + '_' + elements[cat_1]
                v2 = elements[graph_2] + '_' + elements[cat_2]
                # Normalise the label to the (smaller id, larger id) order so that
                # the same relation written in either direction compares equal.
                reverse = v1 > v2
                if reverse:
                    edge_label = elements[complexite] + '_' + reverse_direction(elements[orientation]) + '_'\
                    + elements[cstr_2] + '-' + elements[cstr_1]
                else:
                    edge_label = elements[complexite] + '_' + elements[orientation] + '_' \
                    + elements[cstr_1] + '-' + elements[cstr_2]
                if G.has_edge(v1, v2) and G[v1][v2]['label'] != edge_label and not G[v1][v2]['checked']:
                    f_duplicates.write(G[v1][v2]['complete_line'])
                    # The extra newline leaves a blank line after each reported couple of rows.
                    f_duplicates.write(line_writer(line) + '\n')
                    G[v1][v2]['checked'] = True  # report a conflicting pair only once
                elif not G.has_edge(v1, v2):
                    G.add_edge(v1, v2, label=edge_label, complete_line=line_writer(line), checked=False)
        counter += 1
        # counter already counts the file just processed (was counter + 1, ending above 100%).
        printProgressBar(counter, len(input_files), prefix='Progress:', suffix='complete', length=50, decimals=2)

# Fingerprints
The category is used when creating nodes (to avoid merging two lexemes that share the same written form but belong to different categories), but it is not used when testing for isomorphism.

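Pairs whose orientation is "indirect" are ignored, so some lexemes may end up isolated. (This note originally listed "NA" pairs as ignored too, but the code below adds them as edges in both directions; TODO: confirm which is intended.)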

In [None]:
# Build the fingerprint graph: same node ids as before but with empty node
# labels, so that only the edge labels take part in the comparison.
header = ''
G = nx.DiGraph()
for input_file in input_files:
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num == 0:
                header = line.replace('\n', '')
                continue
            if line_num < 2:
                continue
            elements = line.replace('\n', '').replace(' ', '').split('\t')
            first_node = elements[graph_1] + '_' + elements[cat_1]
            second_node = elements[graph_2] + '_' + elements[cat_2]
            G.add_node(first_node, label='')
            G.add_node(second_node, label='')
            if elements[orientation] in ('as2de', 'as2des'):
                edge_type = elements[cstr_1] + '-' + elements[cstr_2]
                G.add_edge(first_node, second_node, label=edge_type)
            elif elements[orientation] in ('de2as', 'des2as'):
                edge_type = elements[cstr_2] + '-' + elements[cstr_1]
                G.add_edge(second_node, first_node, label=edge_type)
            elif elements[orientation] == 'NA':
                # No direction given: link the two lexemes both ways.
                edge_type = elements[cstr_1] + '-' + elements[cstr_2]
                G.add_edge(first_node, second_node, label=edge_type)
                edge_type = elements[cstr_2] + '-' + elements[cstr_1]
                G.add_edge(second_node, first_node, label=edge_type)
            # Any other orientation (indirect): the nodes are kept, no edge is added.

In [None]:
# Each weakly connected component of the fingerprint graph is counted as one family.
conn_comps = [component for component in nx.weakly_connected_components(G)]
number_of_families = len(conn_comps)
number_of_families

In [None]:
# Group families whose fingerprint graphs are isomorphic. H has one node per
# group leader and an edge from the leader to every family isomorphic to it.
# Single-node (isolated) families are never compared and stay on their own.
checked = set()  # families already attached to a leader (set: O(1) membership tests)
H = nx.Graph() # graph of graphs
for fam1 in range(0, number_of_families):
    if fam1 not in checked:
        H.add_node(fam1)
        G1 = nx.subgraph(G, conn_comps[fam1])
        if len(G1) > 1:  # not isolated
            for fam2 in range(fam1 + 1, number_of_families):
                if fam2 in checked:
                    continue
                G2 = nx.subgraph(G, conn_comps[fam2])
                if len(G2) == 1:  # isolated
                    continue
                if nx.is_isomorphic(G1, G2, node_match=lambda v1,v2: v1['label'] == v2['label'],\
                                    edge_match=lambda e1,e2: e1['label'] == e2['label']):
                    checked.add(fam2)
                    H.add_edge(fam1, fam2)
    # Reported for every family, including skipped ones, so the bar reaches 100%.
    printProgressBar(fam1 + 1, number_of_families, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)

In [None]:
# Cache the fingerprint grouping so the expensive isomorphism cell need not be re-run.
# The context manager closes the file even if pickling fails.
with open('isomorphy_fingerprint.p', 'wb') as isomorphy_fingerprint:
    pickle.dump(H, isomorphy_fingerprint)

In [None]:
# Reload the cached fingerprint grouping. pickle.load can execute arbitrary
# code, so only load a file that this notebook wrote itself.
with open('isomorphy_fingerprint.p', 'rb') as pickle_file:
    H = pickle.load(pickle_file)
# Isomorphism classes are the connected components of H, smallest class first.
H_conn_comps = sorted(nx.connected_components(H), key=len)
print(len(H_conn_comps))

In [None]:
# Create one header-only file per fingerprint family, write the summary table,
# and record in lexeme_dict which family file every lexeme belongs to.
output_folder = 'families_fingerprint'
makedirs(output_folder, exist_ok=True)  # the folder may not exist on a fresh run
lexeme_dict = dict()
with codecs.open('summary_of_families_from_fingerprint.txt', 'w+', encoding='utf-8') as f_summary:
    f_summary.write('family_id\tnumber_of_lexemes\tlexemes\n')
    for fam_fam_id, fam_fam in enumerate(H_conn_comps):
        for fam_id, fam in enumerate(fam_fam):
            representative = find_representative(nx.subgraph(G, conn_comps[fam]))
            # "F<class id>", plus "-<rank inside the class>" when the class has several families.
            family_title = 'F' + str(fam_fam_id).rjust(5, '0')
            if len(fam_fam) > 1:
                family_title += '-' + str(fam_id).rjust(len(str(len(fam_fam)-1)), '0')
            f_summary.write(family_title + '\t'\
                           + str(len(conn_comps[fam])) + '\t' + str(conn_comps[fam]) + '\n')
            # The file name does not depend on the lexeme: create the file once
            # per family instead of truncating it again for every lexeme.
            filename = family_title + ' ' + representative + '.txt'
            with codecs.open(join(output_folder, filename), 'w+', encoding='utf-8') as f_out:
                f_out.write(header + '\tfichier_origine' + '\n\n')
            for lexeme in conn_comps[fam]:
                lexeme_dict[lexeme] = filename

In [None]:
# Append every pair line to the file of the fingerprint family that contains
# its first lexeme, adding the origin code of the input file as a last column.
# NOTE(review): the duplicate-check cell rebinds input_dir / input_files to the
# 'families' folder, whose file names start with an 'F…' family title and match
# no input_file_code prefix; confirm which folder this section is meant to read.
for input_file in input_files:
    # Was input_files[input_file]: indexing a list with a str always raised
    # TypeError. Use the same prefix lookup as the first family-filling cell.
    origin_code = input_file_code.get(input_file.split('-')[0])
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num < 2:
                continue
            if origin_code is None:
                raise KeyError('no origin code registered for input file ' + repr(input_file))
            elements = line.replace(' ','').split('\t')
            lexeme1 = elements[graph_1] + '_' + elements[cat_1]
            if elements[complexite] not in ['simple', 'complexe', 'motiv-form', 'motiv-sem', 'accidentel']:
                print('warning ', input_file)
            output_filename = lexeme_dict[lexeme1]
            # Opened per line on purpose: keeping one handle per family open could
            # exceed the open-file limit. The context manager closes it on error too.
            with codecs.open(join(output_folder, output_filename), 'a+', encoding='utf-8') as f_out:
                f_out.write(line.strip('\n') + '\t' + origin_code + '\n')