In [1]:
import codecs
import os
import pickle
import subprocess
from difflib import SequenceMatcher
from os import listdir
from os.path import isfile, join

import networkx as nx
import pandas as pd
from networkx.algorithms import isomorphism
from networkx.drawing.nx_pydot import write_dot

from utils import printProgressBar, category_shortening

# Column numbers: 0-based positions in the tab-separated rows of the family
# files, used as `elements[...]` indices by the graph-building functions below.
graph_1 = 3  # lexeme 1: string used as node-id prefix and as the first line of the node label
graph_2 = 6  # lexeme 2: same role as graph_1
cat_1 = 8  # lexeme 1: category (node-id suffix; shortened with category_shortening for the label)
cat_2 = 10  # lexeme 2: category
cstr_1 = 14  # lexeme 1: value shown in edge labels (presumably its construction -- TODO confirm)
cstr_2 = 17  # lexeme 2: value shown in edge labels
complexite = 19  # not referenced by the code visible in this notebook
orientation = 21  # relation orientation: 'as2de', 'as2des', 'de2as', 'des2as', 'indirect' or 'NA'
fichier_origine = 43  # prefix of every edge label (name is French for "source file")

In [2]:
def generate_png(target_folder):
    """Render the DOT files found in ``target_folder`` to PNG with Graphviz.

    Every regular file whose name contains ``'.dot'`` is passed to the ``dot``
    executable. The PNG is written next to it, under the same name with every
    ``'.dot'`` replaced by ``'.png'`` (this naming is kept as-is because the
    generated PNG names are consumed outside this notebook). A progress bar is
    printed after each file.

    The command is run with an argument list instead of a shell string, so
    file names containing quotes, spaces or shell metacharacters are passed to
    ``dot`` verbatim and cannot be interpreted by a shell.

    Args:
        target_folder: Folder that contains the DOT files and receives the PNGs.

    Raises:
        FileNotFoundError: If the Graphviz ``dot`` executable is not on PATH.
    """
    dot_files = [f for f in listdir(target_folder) if isfile(join(target_folder, f)) and '.dot' in f]
    total = len(dot_files)
    for counter, dot_file in enumerate(dot_files, start=1):
        source_path = join(target_folder, dot_file)
        png_path = join(target_folder, dot_file.replace('.dot', '.png'))
        # check=False: a file that Graphviz cannot render does not stop the batch.
        subprocess.run(['dot', '-Tpng', source_path, '-o', png_path], check=False)
        printProgressBar(counter, total, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)

# 1. Graphs with red edges for false derivations

In [None]:
# Input folder with the family files, and the two output folders of this section.
family_folder, spurious_folder, model_folder = 'D-families', 'D-web-spurious', 'D-web-reference'

df = pd.read_excel('D_false_deriv.xlsx')

# Keep the rows whose spurious node frequency is below 1, then, among those,
# the rows whose parent has more than 2 nodes; save the selection for later use.
selected_rows = (
    df.loc[df['spurious_node_freq'] < 1]
      .loc[lambda rows: rows['parent_node_count'] > 2]
)
selected_rows.to_excel('D_false_deriv_filtered.xlsx', index=False)
selected_rows.shape

In [None]:
def create_graph(output_folder, in_dot_file_name, out_dot_file_name, spurious=''):
    """Build the graph of one family and write it to a DOT file.

    The edges are read from the tab-separated family file found in the global
    ``family_folder`` (``in_dot_file_name`` with ``'.dot'`` replaced by
    ``'.txt'``); its first two lines are skipped. Node frequencies shown in the
    labels come from the pickled graph stored in ``'D-graph-binary'`` under the
    first whitespace-separated token of ``in_dot_file_name``.

    Args:
        output_folder: Folder in which the DOT file is written.
        in_dot_file_name: Name identifying the family (see above).
        out_dot_file_name: Name of the DOT file to write.
        spurious: Node ids separated by ``', '``. Each listed node and all of
            its incoming and outgoing edges are coloured red. Empty string
            (default) means nothing is highlighted.

    Raises:
        KeyError: If a node of the family file is missing from the pickled
            graph, or a node listed in ``spurious`` is not in the built graph.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load graph
    # binaries produced by this project.
    with open(join('D-graph-binary', in_dot_file_name.split()[0]), 'rb') as graph_file:
        G = pickle.load(graph_file)
    H = nx.DiGraph()
    with codecs.open(join(family_folder, in_dot_file_name.replace('.dot', '.txt')), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num < 2:
                continue
            elements = line.replace('\n','').replace(' ','').split('\t')
            v1 = elements[graph_1] + '_' + elements[cat_1]
            v2 = elements[graph_2] + '_' + elements[cat_2]
            # Only the first relation seen between two lexemes is drawn.
            if H.has_edge(v1, v2) or H.has_edge(v2, v1):
                continue
            H.add_node(v1, label=elements[graph_1] + '\n' + category_shortening(elements[cat_1]) + ', ' + str(G.nodes[v1]['frequency']))
            H.add_node(v2, label=elements[graph_2] + '\n' + category_shortening(elements[cat_2]) + ', ' + str(G.nodes[v2]['frequency']))
            relation = elements[orientation]
            if relation in ('as2de', 'as2des'):
                edge_type = elements[fichier_origine] + ': ' + elements[cstr_1] + '-' + elements[cstr_2]
                H.add_edge(v1, v2, label=edge_type, style='')
            elif relation in ('de2as', 'des2as'):
                edge_type = elements[fichier_origine] + ': ' + elements[cstr_2] + '-' + elements[cstr_1]
                H.add_edge(v2, v1, label=edge_type, style='')
            elif relation in ('indirect', 'NA'):
                # Undirected relations: endpoints and label parts are sorted so the
                # result does not depend on the order of the two lexemes in the row.
                sorted_lex = sorted([v1, v2])
                sorted_cstr = sorted([elements[cstr_1], elements[cstr_2]])
                edge_type = elements[fichier_origine] + ': ' + sorted_cstr[0] + '-' + sorted_cstr[1]
                line_style = 'dotted' if relation == 'indirect' else 'dashed'
                H.add_edge(sorted_lex[0], sorted_lex[1], dir='none', style=line_style, label=edge_type)
            else:
                # Unknown orientation: report it (the original referenced an
                # undefined name here, which raised NameError instead).
                print(in_dot_file_name, relation)
    if spurious != '':
        for spurious_node in spurious.split(', '):
            for in_edge in H.in_edges(spurious_node):
                H.edges[in_edge]['color'] = 'red'
                H.edges[in_edge]['fontcolor'] = 'red'
            for out_edge in H.out_edges(spurious_node):
                H.edges[out_edge]['color'] = 'red'
                H.edges[out_edge]['fontcolor'] = 'red'
            H.nodes[spurious_node]['color'] = 'red'
            H.nodes[spurious_node]['fontcolor'] = 'red'
    write_dot(H, join(output_folder, out_dot_file_name))

If the code below fails with an error like `... has no attribute 'split'`, open `D_false_deriv.xlsx`, save it again, and then re-execute the code from the beginning of this section.

In [None]:
# One DOT file per selected row: the child's graph with the spurious nodes in red.
total_rows = selected_rows.shape[0]
for counter, (index, row) in enumerate(selected_rows.iterrows(), start=1):
    parent_id = row['parent'].split()[0]
    child_id = row['child'].split()[0]
    create_graph(spurious_folder, row['child'], parent_id + '_' + child_id + '.dot', row['spurious_node'])
    printProgressBar(counter, total_rows, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)

# One plain (unhighlighted) DOT file per distinct parent.
list_of_parents = selected_rows['parent'].unique()
for counter, parent in enumerate(list_of_parents, start=1):
    create_graph(model_folder, parent, parent)
    printProgressBar(counter, len(list_of_parents), prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)

In [None]:
# Render every DOT file written to the spurious-graph folder as a PNG.
generate_png(spurious_folder)

In [None]:
# Render every DOT file written to the reference-graph folder as a PNG.
generate_png(model_folder)

# 2. Graphs with green edges for missing derivations

In [3]:
# Folders used by this section: family files (input), graphs with the missing
# lexemes in green (output), and plain reference graphs (output).
family_folder = 'D-families'
missing_folder = 'D-web-missing'
model_folder = 'D-web-reference'
# Note: rebinds `df`, which section 1 used for the false-derivation table.
df = pd.read_excel('D_missing_deriv.xlsx')
df.shape

(3941, 12)

In [None]:
# Frequency table: first CSV column is the index (looked up by get_frequency
# with keys of the form '<lexeme>_<CATEGORY>'); the 'freq' column holds the value.
frequencies = pd.read_csv('frequencies-frcowvec.csv', header=0, index_col=0)
frequencies.shape

In [None]:
# Category tags grouped by the tag used in the frequency table keys.
_FRCOWVEC_TAG_GROUPS = (
    ('NOM', ('Nm', 'Nf', 'Nmp', 'Nfp', 'Nx', 'More')),
    ('NAM', ('Npm', 'Npf', 'Npx', 'Npmp', 'Npfp')),
    ('INT', ('IJ',)),
    ('ADJ', ('Adj',)),
    ('VER', ('V',)),
    ('NUM', ('Num',)),
    ('PRO', ('Pro',)),
    ('ADV', ('Adv',)),
)

# Flat lookup: graph category tag -> frequency-table category tag.
frcowvec_categories = {tag: target for target, tags in _FRCOWVEC_TAG_GROUPS for tag in tags}

def frcowvec_cat_conversion(lexeme):
    """Return ``lexeme`` with its category tag converted for frequency lookup.

    The text before the first ``'_'`` is kept, and the text after the last
    ``'_'`` is mapped through ``frcowvec_categories`` (unknown tags are kept
    unchanged). The two parts are joined with a single ``'_'``.
    """
    pieces = lexeme.split('_')
    form, tag = pieces[0], pieces[-1]
    return form + '_' + frcowvec_categories.get(tag, tag)

def get_frequency(lexeme):
    """Look up the frequency of ``lexeme`` in the global ``frequencies`` table.

    Returns 0 when the lexeme contains ``'??'`` (an unresolved guess) or when
    its converted key is absent from the table.
    """
    if '??' in lexeme:
        return 0
    key = frcowvec_cat_conversion(lexeme)
    try:
        return frequencies.loc[key]['freq']
    except KeyError:
        return 0

def guess_missing_lexeme(in_a1, in_a2, in_b1):  # to be refined
    """Solve the analogy ``in_a1 : in_a2 = in_b1 : ?`` on the lexeme strings.

    Example of the analogies targeted:
    "micocoulier_Nm : micocoule_Nf = cotonéaster_Nm : ?".

    The part before the first ``'_'`` of each argument is wrapped in braces
    (so that word edges take part in the comparison). The longest substring
    shared by the two ``a`` forms is removed from each; what remains of
    ``a1`` is then replaced, inside ``b1``, by what remains of ``a2``.

    Returns:
        ``'<guessed form>_<category of in_a2>'``, or ``'??'`` when the
        replacement left ``b1`` unchanged although ``a1`` and ``a2`` differ.
    """
    def _fenced(lexeme):
        # '{' and '}' mark the beginning and the end of the form.
        return '{' + lexeme.split('_')[0] + '}'

    a1, a2, b1 = _fenced(in_a1), _fenced(in_a2), _fenced(in_b1)
    a2_cat = in_a2.split('_')[1]

    longest = SequenceMatcher(None, a1, a2).find_longest_match(0, len(a1), 0, len(a2))
    shared = a1[longest.a:longest.a + longest.size]
    source_rest = a1.replace(shared, '')
    target_rest = a2.replace(shared, '')
    candidate = b1.replace(source_rest, target_rest)

    if candidate != b1 or a1 == a2:
        return candidate.replace('{', '').replace('}', '') + '_' + a2_cat
    return '??'

In [None]:
def create_graph_for_missing_lexemes(parent, child, output_folder, output_filename):
    """Write a family graph, optionally completed with guessed missing lexemes.

    * ``parent == ''``: the graph of ``child`` is built from its family file
      and written as-is.
    * ``'**' in parent``: the graph of ``parent`` is built, then every node of
      the child's pickled graph that is not covered by the (last found)
      subgraph isomorphism between the child's and the parent's pickled graphs
      is added in green, with a form guessed by ``guess_missing_lexeme`` and
      with green edges to the matched neighbours. The result is written.
    * otherwise nothing is written.

    Family files are read from the global ``family_folder``; pickled graphs
    from ``'D-graph-binary'`` (file name = first whitespace-separated token of
    the family name).

    Args:
        parent: Parent family name, possibly flagged with ``' **'``, or ``''``.
        child: Child family name.
        output_folder: Folder in which the DOT file is written.
        output_filename: Name of the DOT file to write.
    """
    H = nx.DiGraph()
    to_be_generated = child if parent == '' else parent
    # NOTE(security): pickle.load can execute arbitrary code; only load graph
    # binaries produced by this project.
    with open(join('D-graph-binary', to_be_generated.split()[0]), 'rb') as graph_file:
        G = pickle.load(graph_file)
    # Replace only the '.dot' extension (as create_graph does). Replacing the
    # bare substring 'dot' would also corrupt names that merely contain "dot".
    family_file_name = to_be_generated.replace('.dot', '.txt').replace(' **', '')
    with codecs.open(join(family_folder, family_file_name), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num >= 2:
                elements = line.replace('\n','').replace(' ','').split('\t')
                v1 = elements[graph_1] + '_' + elements[cat_1]
                v2 = elements[graph_2] + '_' + elements[cat_2]
                # Only the first relation seen between two lexemes is drawn.
                if H.has_edge(v1, v2) or H.has_edge(v2, v1):
                    continue
                H.add_node(v1, label=elements[graph_1] + '\n' + category_shortening(elements[cat_1]) + ', ' + str(G.nodes[v1]['frequency']))
                H.add_node(v2, label=elements[graph_2] + '\n' + category_shortening(elements[cat_2]) + ', ' + str(G.nodes[v2]['frequency']))
                if elements[orientation] == 'as2de' or elements[orientation] == 'as2des':
                    edge_type = elements[fichier_origine] + ': ' + elements[cstr_1] + '-' + elements[cstr_2]
                    H.add_edge(v1, v2, label=edge_type, style='')
                elif elements[orientation] == 'de2as' or elements[orientation] == 'des2as':
                    edge_type = elements[fichier_origine] + ': ' + elements[cstr_2] + '-' + elements[cstr_1]
                    H.add_edge(v2, v1, label=edge_type, style='')
                elif elements[orientation] == 'indirect':
                    sorted_lex = sorted([v1, v2])
                    sorted_cstr = sorted([elements[cstr_1], elements[cstr_2]])
                    edge_type = elements[fichier_origine] + ': ' + sorted_cstr[0] + '-' + sorted_cstr[1]
                    H.add_edge(sorted_lex[0], sorted_lex[1], dir='none', style='dotted', label=edge_type)
                elif elements[orientation] == 'NA':
                    sorted_lex = sorted([v1, v2])
                    sorted_cstr = sorted([elements[cstr_1], elements[cstr_2]])
                    edge_type = elements[fichier_origine] + ': ' + sorted_cstr[0] + '-' + sorted_cstr[1]
                    H.add_edge(sorted_lex[0], sorted_lex[1], dir='none', style='dashed', label=edge_type)
    if parent == '':
        write_dot(H, join(output_folder, output_filename))
        return
    if '**' not in parent:
        # Unflagged parents produce no output file.
        return

    with open(join('D-graph-binary', parent.split()[0]), 'rb') as parent_file:
        G_parent = pickle.load(parent_file)
    with open(join('D-graph-binary', child.split()[0]), 'rb') as child_file:
        G_child = pickle.load(child_file)
    GM = isomorphism.DiGraphMatcher(G_child, G_parent, node_match=lambda v1,v2: v1['label'] == v2['label'], edge_match=lambda e1,e2: e1['label'] == e2['label'])
    node_diff = set()
    subgraph = {}  # child node -> matched node; stays empty when no match exists
    # The mapping and node difference of the LAST isomorphism found are kept.
    for subgraph in GM.subgraph_isomorphisms_iter():
        node_diff = G_child.nodes - subgraph
    for n in node_diff:
        new_lexeme = n
        # Anchor the analogy on one neighbour of n: the source of its first
        # incoming edge if it has one, else the target of its first outgoing edge.
        incoming = list(G_child.in_edges(n))
        if len(incoming) > 0:
            anchor = incoming[0][0]
        else:
            anchor = list(G_child.out_edges(n))[0][1]
        try:
            new_lexeme = guess_missing_lexeme(anchor, n, subgraph.get(anchor))
        except TypeError:
            # Best effort: keep n itself as the lexeme.
            pass
        except AttributeError:
            # The anchor has no counterpart in the mapping: skip this node.
            continue
        new_node_freq = get_frequency(new_lexeme)
        if new_lexeme == '??':
            # Make the placeholder node id unique per missing node.
            new_lexeme = new_lexeme + '_' + n
            H.add_node(new_lexeme, label='?? \n' + category_shortening(n.split('_')[1]) + ', ' + str(new_node_freq), fontcolor='green', color='green')
        else:
            H.add_node(new_lexeme, label=new_lexeme.split('_')[0] + '\n' + category_shortening(n.split('_')[1]) + ', ' + str(new_node_freq), fontcolor='green', color='green')
        for source, _, data in list(G_child.in_edges(n, data=True)):
            edge_label = data.get('label')
            matched_source = subgraph.get(source)
            if 'NA' in edge_label and matched_source is not None:
                H.add_edge(matched_source, new_lexeme, label=edge_label.split('_')[0], style='dashed', dir='none', fontcolor='green', color='green')
            elif 'indirect' in edge_label and matched_source is not None:
                H.add_edge(matched_source, new_lexeme, label=edge_label.split('_')[0], style='dotted', dir='none', fontcolor='green', color='green')
            elif matched_source is not None:
                H.add_edge(matched_source, new_lexeme, label=edge_label, fontcolor='green', color='green')
        for _, target, data in list(G_child.out_edges(n, data=True)):
            edge_label = data.get('label')
            # Undirected relations are only drawn from the incoming side.
            if 'NA' in edge_label or 'indirect' in edge_label:
                continue
            matched_target = subgraph.get(target)
            if matched_target is not None:
                H.add_edge(new_lexeme, matched_target, label=edge_label, fontcolor='green', color='green')
    write_dot(H, join(output_folder, output_filename))
    return

If the code below fails with an error like `... has no attribute 'split'`, open `D_missing_deriv.xlsx`, save it again, and then re-execute the code from the beginning of this section.

In [None]:
# One plain reference graph per distinct child family, named after the child.
list_of_child = df['child'].unique()
for counter, child in enumerate(list_of_child, start=1):
    create_graph_for_missing_lexemes('', child, model_folder, child)
    printProgressBar(counter, len(list_of_child), prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)

In [None]:
# One graph per (parent, child) row, with the missing lexemes drawn in green.
total_rows = df.shape[0]
for counter, (index, row) in enumerate(df.iterrows(), start=1):
    out_name = row['parent'].split()[0] + '_' + row['child'].split()[0] + '.dot'
    create_graph_for_missing_lexemes(row['parent'], row['child'], missing_folder, out_name)
    printProgressBar(counter, total_rows, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)

In [None]:
# Render every DOT file written to the reference-graph folder as a PNG.
generate_png(model_folder)

In [None]:
# Render every DOT file written to the missing-lexeme folder as a PNG.
generate_png(missing_folder)

# Afterwards

In the end, we'll have the folders `D-web-missing`, `D-web-reference`, and `D-web-spurious` each populated with DOT and PNG files.

Copy these PNG files to the `missing`, `reference`, and `spurious` folders, respectively, of the `Demonext-web` project.

You should also copy the files `D_false_deriv_filtered.xlsx` and `D_missing_deriv.xlsx` to the root folder of `Demonext-web`.