In [1]:
import codecs
import glob
import networkx as nx
import os
import pandas as pd
import pickle
import re
from difflib import SequenceMatcher
from os import listdir
from os.path import isfile, join
from networkx.algorithms import isomorphism
from networkx.drawing.nx_pydot import write_dot
from utils import printProgressBar

In [2]:
# Load the filtered frequency table. The first CSV row is the header and the
# first column becomes the index, so rows can be fetched by key with `.loc`.
frequencies = pd.read_csv(
    'frequencies-frcowvec-filtered.csv',
    header=0,
    index_col=0,
)
# Sanity check on the size of the table that was read.
print(frequencies.shape)

(12772221, 1)


In [3]:
# Translation table from the category suffix carried by a 'lexeme_Category'
# string to the category tag used in the keys of the `frequencies` table.
frcowvec_categories = {'Nm': 'NOM', 'Nf': 'NOM', 'Nmp': 'NOM', 'Nfp': 'NOM', 'Nx': 'NOM', 'More': 'NOM',
                       'Npm': 'NAM', 'Npf': 'NAM', 'Npx': 'NAM', 'Npmp': 'NAM', 'Npfp': 'NAM',
                       'Adj': 'ADJ', 'V': 'VER', 'Num': 'NUM', 'Pro': 'PRO', 'Adv': 'ADV'}


def get_frequency(lex_and_cat):
    """Look up the frequency of a 'lexeme_Category' string.

    The category suffix (text after the last '_') is translated through
    `frcowvec_categories`, the result is appended to the text before the first
    '_', and that key is looked up in the 'freq' column of the module-level
    `frequencies` table.

    Args:
        lex_and_cat: string such as 'nouer_V'.

    Returns:
        The value stored in `frequencies` for the translated key, or 0 when the
        string has no '_', when its category has no translation, or when the
        translated key is absent from the table.
    """
    if '_' not in lex_and_cat:
        return 0
    parts = lex_and_cat.split('_')
    new_cat = frcowvec_categories.get(parts[-1])
    if new_cat is None:
        # Unknown category tag: no key can be built, so treat it like any
        # other unattested form instead of failing on `str + None`.
        return 0
    new_lex_and_cat = parts[0] + '_' + new_cat
    try:
        return frequencies.loc[new_lex_and_cat, 'freq']
    except KeyError:
        return 0

In [4]:
def generate_lexeme(a1, a2, b1):
    """Solve the analogy a1 : a2 :: b1 : ? on the lexeme part of the strings.

    Each argument is a 'lexeme_Category' string. The lexemes are wrapped in
    '^' / '$' boundary markers, the longest substring shared by a1 and a2 is
    taken as their common stem, and whatever surrounds that stem in a1 (its
    prefix and suffix) is swapped for what surrounds it in a2. The same swap
    is then applied to b1.

    Args:
        a1: model source form, e.g. 'nouer_V'.
        a2: model target form, e.g. 'dénouage_Nm'; its category is reused for
            the result.
        b1: form to which the a1 -> a2 transformation is applied.

    Returns:
        The generated 'lexeme_Category' string, or '??' when b1 does not begin
        with a1's prefix and end with a1's suffix.
    """
    a1_lex = '^' + a1.split('_')[0] + '$'
    a2_lex = '^' + a2.split('_')[0] + '$'
    b1_lex = '^' + b1.split('_')[0] + '$'
    a2_cat = a2.split('_')[1]

    match = SequenceMatcher(None, a1_lex, a2_lex, autojunk=False).find_longest_match(
        0, len(a1_lex), 0, len(a2_lex))

    # Slice around the matched occurrence itself. A textual replace of the
    # common substring would also rewrite any other occurrence of it.
    a1_prefix = a1_lex[:match.a]
    a1_suffix = a1_lex[match.a + match.size:]
    a2_prefix = a2_lex[:match.b]
    a2_suffix = a2_lex[match.b + match.size:]

    # The boundary markers are part of the affixes, so startswith/endswith
    # anchor the prefix at the beginning of b1 and the suffix at its end.
    # The length check rejects a prefix and suffix that would overlap in b1.
    if (not b1_lex.startswith(a1_prefix)
            or not b1_lex.endswith(a1_suffix)
            or len(b1_lex) < len(a1_prefix) + len(a1_suffix)):
        return '??'

    b1_stem = b1_lex[len(a1_prefix):len(b1_lex) - len(a1_suffix)]
    b2_lex = a2_prefix + b1_stem + a2_suffix
    return b2_lex.replace('^', '').replace('$', '') + '_' + a2_cat

def edge_compare(e1, e2):
    """Tell whether two edge attribute dicts carry the same base label.

    Only the part of each 'label' value that precedes the first '$' is
    compared; anything from the first '$' onwards is ignored.
    """
    base_1, _, _ = e1['label'].partition('$')
    base_2, _, _ = e2['label'].partition('$')
    return base_1 == base_2

def completion(G_candidate, G_model):
    """Complete a candidate graph by analogy with a model graph.

    The candidate is matched against sub-graphs of the model: nodes must have
    equal 'label' attributes and edges must satisfy `edge_compare`. For every
    model node left out of the match, a lexeme is generated with
    `generate_lexeme`, using the first matched (model node, candidate node)
    pair as the analogy source, and its frequency is added to the score.

    Args:
        G_candidate: directed graph to complete.
        G_model: directed graph used as the model.

    Returns:
        A tuple (subgraph, equivalence, score):
        - subgraph: dict mapping matched model nodes to candidate nodes, taken
          from the last isomorphism produced by the matcher; empty when the
          candidate matches no sub-graph of the model.
        - equivalence: dict mapping each unmatched model node to the generated
          'lexeme_Category' string (or '??').
        - score: sum of `get_frequency` over the generated strings.
    """
    GM = isomorphism.DiGraphMatcher(
        G_model,
        G_candidate,
        node_match=lambda v1, v2: v1['label'] == v2['label'],
        edge_match=edge_compare,
    )

    # Exhaust the iterator and keep the last mapping it produces.
    subgraph = None
    for subgraph in GM.subgraph_isomorphisms_iter():
        pass

    if not subgraph:
        # No isomorphism (or an empty one): nothing can serve as the analogy
        # source, so report an empty completion with a zero score.
        return ({}, {}, 0)

    node_diff = G_model.nodes - subgraph
    model_anchor = next(iter(subgraph))
    candidate_anchor = subgraph[model_anchor]

    equivalence = dict()
    score = 0  # sum of frequencies
    for d in node_diff:
        equivalence[d] = generate_lexeme(model_anchor, d, candidate_anchor)
        score += get_frequency(equivalence[d])
    return (subgraph, equivalence, score)

In [19]:
def generate_prediction_graph(ori_dict, pred_dict, candidate_number, model_number):
    """Write a DOT file showing a model graph relabelled with candidate forms.

    The first file in 'demonette-glawinette_graph' whose name starts with
    `model_number` is read line by line and rewritten to
    'predictions/<candidate_number> + <model_number>.dot':

    - lines containing '{' or '}' are copied unchanged;
    - edge lines (containing '->') lose their 'G: ...' and 'label="Xxx: '
      fragments and are coloured blue when either endpoint is in `pred_dict`;
    - node lines whose node is in `ori_dict` get the mapped lexeme as label,
      followed by its frequency;
    - node lines whose node is in `pred_dict` get the predicted lexeme as
      label, are coloured blue, and get the frequency appended unless the
      prediction is '??';
    - any other line (blank, or a node found in neither dict) is copied
      unchanged.

    Args:
        ori_dict: dict from model node name to the attested candidate form.
        pred_dict: dict from model node name to the predicted form.
        candidate_number: identifier used in the output file name.
        model_number: file-name prefix of the model DOT file; also used in the
            output file name.

    Returns:
        None. Nothing is written when no model DOT file is found.
    """
    dot_filenames = glob.glob(join('demonette-glawinette_graph', model_number + '*'))
    if not dot_filenames:  # dot file not found
        return
    dot_filename = dot_filenames[0]
    out_filename = join('predictions', candidate_number + ' + ' + model_number + '.dot')

    # Both files are managed by the same `with`, so the output is always
    # closed, even if rewriting a line raises.
    with codecs.open(dot_filename, 'r', encoding='latin-1') as f, \
            codecs.open(out_filename, 'w+', encoding='latin-1') as f_out:
        for line in f:
            if '{' in line or '}' in line:  # first and last line
                f_out.write(line)
                continue

            elements = line.split()

            if '->' in line:
                line = re.sub(r'G: [A-zÀ-ú-]*', '', line)
                line = re.sub(r'label="[A-z]*: ', 'label="', line)
                if elements[0].replace('"', '') in pred_dict or elements[2].replace('"', '') in pred_dict:
                    line = re.sub(r'\];', ', color=blue, fontcolor=blue];', line)
                f_out.write(line)
                continue

            if not elements:
                # Blank line: there is no node name to look up.
                f_out.write(line)
                continue

            lexeme_cat = elements[0].replace('"', '')
            if lexeme_cat in ori_dict:
                pred = ori_dict.get(lexeme_cat)
                line = line.replace('label="' + lexeme_cat.split('_')[0], 'label="' + pred.split('_')[0])
                line = line.replace('"]', ', ' + str(get_frequency(pred)) + '"]')
            elif lexeme_cat in pred_dict:
                pred = pred_dict.get(lexeme_cat)
                line = line.replace('label="' + lexeme_cat.split('_')[0], 'label="' + pred.split('_')[0])
                line = line.replace('];', ', color=blue, fontcolor=blue];')
                if pred != '??':
                    line = line.replace('", color', ', ' + str(get_frequency(pred)) + '", color')
            # A node known to neither dict is written back as it was read.
            f_out.write(line)

In [20]:
# Pick, among all pickled model graphs whose file name starts with
# `model_group_number`, the one whose completion of the candidate graph has
# the highest frequency score, then write the corresponding prediction graph.
binary_dir = 'demonette-glawinette_graph_binary'
candidate_number = 'F04114'
model_group_number = 'F01426'

# NOTE: pickle.load can execute arbitrary code; only load files from a
# trusted source.
with open(join(binary_dir, candidate_number), 'rb') as candidate_file:
    candidate = pickle.load(candidate_file)

families = glob.glob(join(binary_dir, model_group_number + '*'))
max_score = 0
subgraph = dict()
equivalence = dict()
chosen_model_number = model_group_number
for family in families:
    with open(family, 'rb') as model_file:
        model = pickle.load(model_file)
    new_subgraph, new_equivalence, score = completion(candidate, model)
    if score > max_score:
        max_score = score
        subgraph = new_subgraph
        equivalence = new_equivalence
        # File name without its directory, whatever the OS path separator is.
        chosen_model_number = os.path.basename(family)

generate_prediction_graph(subgraph, equivalence, candidate_number, chosen_model_number)

nouer_V [label="vibromasser\nV, 2", color=blue, fontcolor=blue];

"dénouage_Nm" [label="dévibromassage\nN, 0", color=blue, fontcolor=blue];

"dénouer_V" [label="dévibromasser\nV, 0", color=blue, fontcolor=blue];

"dénouement_Nm" [label="dévibromassement\nN, 0", color=blue, fontcolor=blue];

"dénoueur_Nm" [label="dévibromasseur\nN, 0", color=blue, fontcolor=blue];

nouement_Nm [label="vibromassement\nN, 0", color=blue, fontcolor=blue];

noueuse_Nf [label="vibromasseuse\nN, 0", color=blue, fontcolor=blue];

renouage_Nm [label="revibromassage\nN, 0", color=blue, fontcolor=blue];

renouer_V [label="revibromasser\nV, 0", color=blue, fontcolor=blue];

renouement_Nm [label="revibromassement\nN, 0", color=blue, fontcolor=blue];

"noué_Adj" [label="vibromassé\nAdj, 0", color=blue, fontcolor=blue];

"indénouable_Adj" [label="indévibromassable\nAdj, 0", color=blue, fontcolor=blue];

"dénouable_Adj" [label="dévibromassable\nAdj, 0", color=blue, fontcolor=blue];

"redénouer_V" [label="

In [12]:
# Scratch check: append a frequency (here the placeholder 99) after the
# category inside the label of one sample DOT node line.
inane = '"dénouable_Adj" [label="dévibromassable\nAdj", color=blue, fontcolor=blue];'
elements = inane.split()
lexeme_cat = elements[0].replace('"', '')
category = lexeme_cat.split('_')[1]
# The category is immediately followed by the closing quote of the label.
label_end = category + '",'
label_end_with_freq = category + ', ' + str(99) + '",'
inane = inane.replace(label_end, label_end_with_freq)
inane

'"dénouable_Adj" [label="dévibromassable\nAdj, 99", color=blue, fontcolor=blue];'