In [1]:
import codecs
import glob
import networkx as nx
import os
import pandas as pd
import pickle
import re
from difflib import SequenceMatcher
from os import listdir
from os.path import isfile, join
from networkx.algorithms import isomorphism
from networkx.drawing.nx_pydot import write_dot
from utils import printProgressBar

In [2]:
# Load the frequency table. The first CSV row is the header and the first
# column becomes the index; get_frequency() looks rows up by that index using
# '<lexeme>_<CATEGORY>' keys and reads the 'freq' column.
frequencies = pd.read_csv('frequencies-frcowvec-filtered.csv', header=0, index_col=0)
# Sanity check: print (number of rows, number of columns) of the loaded table.
print(frequencies.shape)

(12772221, 1)


In [3]:
# Maps the category tag that ends a lexeme identifier (the text after its last
# '_') to the category suffix used in the index of the `frequencies` table.
frcowvec_categories = {'Nm': 'NOM', 'Nf': 'NOM', 'Nmp': 'NOM', 'Nfp': 'NOM', 'Nx': 'NOM', 'More': 'NOM',
                       'Npm': 'NAM', 'Npf': 'NAM', 'Npx': 'NAM', 'Npmp': 'NAM', 'Npfp': 'NAM',
                       'Adj': 'ADJ', 'V': 'VER', 'Num': 'NUM', 'Pro': 'PRO', 'Adv': 'ADV'}


def get_frequency(lex_and_cat):
    """Return the frequency recorded for a ``lexeme_category`` identifier.

    The category tag (text after the last '_') is translated through
    ``frcowvec_categories`` and the key ``<text before first '_'>_<category>``
    is looked up in the ``freq`` column of the ``frequencies`` table.

    Args:
        lex_and_cat: identifier such as ``'chanteur_Nm'``.

    Returns:
        The ``freq`` value of the matching row, or 0 when the identifier has
        no '_', its category tag is not in ``frcowvec_categories``, or the
        key is absent from ``frequencies``.
    """
    if '_' not in lex_and_cat:
        return 0
    parts = lex_and_cat.split('_')
    new_cat = frcowvec_categories.get(parts[-1])
    if new_cat is None:
        # Unmapped category tag: no key can be built for it. Without this
        # guard the concatenation below raised TypeError (str + None).
        return 0
    new_lex_and_cat = parts[0] + '_' + new_cat
    try:
        return frequencies.loc[new_lex_and_cat]['freq']
    except KeyError:
        return 0

In [4]:
# Collect the third tab-separated field of every row of lexemes.csv, skipping
# the header row (the one whose first field is 'lid').
demonette_lexemes = set()
with codecs.open('lexemes.csv', 'r', encoding='utf-8') as lexeme_file:
    split_rows = (raw_row.split('\t') for raw_row in lexeme_file)
    demonette_lexemes.update(fields[2] for fields in split_rows if fields[0] != 'lid')

In [5]:
def generate_lexeme(a1, a2, b1):
    """Generate a lexeme and its part-of-speech by analogy: a1 : a2 :: b1 : ?

    The longest common substring of the two model words (each wrapped in '^'
    and '$' sentinels) is treated as their shared stem. Whatever surrounds it
    in ``a1`` is the prefix/suffix that must be stripped from ``b1``; whatever
    surrounds it in ``a2`` is then attached to the remaining stem of ``b1``.

    Args:
        a1: model identifier ``word_cat`` whose counterpart in the candidate is known.
        a2: model identifier ``word_cat`` with no counterpart in the candidate.
        b1: candidate identifier ``word_cat`` that corresponds to ``a1``.

    Returns:
        ``'<generated word>_<category of a2>'``, or ``'??'`` when ``b1`` does
        not start with the prefix and end with the suffix found on ``a1``.
    """
    a1_lex = '^' + a1.split('_')[0] + '$'
    a2_lex = '^' + a2.split('_')[0] + '$'
    b1_word = b1.split('_')[0]
    a2_cat = a2.split('_')[1]
    match = SequenceMatcher(None, a1_lex, a2_lex).find_longest_match(0, len(a1_lex), 0, len(a2_lex))
    common_end_a = match.a + match.size
    common_end_b = match.b + match.size
    # Slice around the matched position instead of str.replace(common, ...):
    # replace() rewrites *every* occurrence of the common substring, which
    # corrupts the affix pattern when that substring repeats inside a word.
    # The [1:...] / [...:-1] bounds drop the '^' and '$' sentinels.
    a1_prefix = a1_lex[1:match.a]
    a1_postfix = a1_lex[common_end_a:-1]
    a2_prefix = a2_lex[1:match.b]
    a2_postfix = a2_lex[common_end_b:-1]
    # The affixes are anchored at the word boundaries, so test them there
    # rather than anywhere inside the word, and make sure they do not overlap.
    if not (b1_word.startswith(a1_prefix) and b1_word.endswith(a1_postfix)):
        return '??'
    if len(b1_word) < len(a1_prefix) + len(a1_postfix):
        return '??'
    b1_stem = b1_word[len(a1_prefix):len(b1_word) - len(a1_postfix)]
    return a2_prefix + b1_stem + a2_postfix + '_' + a2_cat

def edge_compare(e1, e2):
    """Tell whether two edge attribute dicts carry the same label, ignoring
    everything from the first '$' onwards in each label."""
    first_key = e1['label'].partition('$')[0]
    second_key = e2['label'].partition('$')[0]
    return first_key == second_key

def completion(G_candidate, G_model):
    """Complete a candidate graph by analogy with a model graph.

    Searches for subgraphs of ``G_model`` isomorphic to ``G_candidate`` (nodes
    compared on their 'label' attribute, edges with ``edge_compare``). For
    every model node left out of the mapping, a lexeme is generated with
    ``generate_lexeme`` and scored with ``get_frequency``.

    Args:
        G_candidate: directed graph to complete.
        G_model: directed graph used as the model.

    Returns:
        A tuple ``(subgraph, equivalence, score)``:
        * ``subgraph``: the last mapping yielded by the matcher, from model
          nodes to candidate nodes;
        * ``equivalence``: dict from each unmatched model node to its
          generated lexeme;
        * ``score``: sum of the generated lexemes' frequencies, minus 1 for
          each lexeme with frequency 0 that is not in ``demonette_lexemes``.
        When no usable mapping exists, returns ``({}, {}, float('-inf'))`` so
        that callers keeping the highest score never select this model.
    """
    GM = isomorphism.DiGraphMatcher(G_model, G_candidate, node_match=lambda v1,v2: v1['label'] == v2['label'], edge_match=edge_compare)
    subgraph = None
    for subgraph in GM.subgraph_isomorphisms_iter():
        pass  # exhaust the iterator: the last mapping is the one kept
    if not subgraph:
        # No isomorphism (or an empty mapping): there is nothing to complete.
        # The loop variable used to be read unbound here (UnboundLocalError).
        return (dict(), dict(), float('-inf'))
    node_diff = G_model.nodes - subgraph
    # The first matched pair is the analogy base for every missing node.
    base_model_node = next(iter(subgraph))
    base_candidate_node = subgraph[base_model_node]
    equivalence = dict()
    score = 0  # sum of frequencies
    for d in node_diff:
        equivalence[d] = generate_lexeme(base_model_node, d, base_candidate_node)
        score_d = get_frequency(equivalence[d])
        score += score_d
        if score_d == 0 and equivalence[d].split('_')[0] not in demonette_lexemes:
            score -= 1
    return (subgraph, equivalence, score)

In [6]:
def generate_prediction_graph(ori_dict, pred_dict, candidate_number, model_number):
    """Write a DOT file showing a candidate completed after a model graph.

    The first DOT file in 'demonette-glawinette_graph' whose name starts with
    ``model_number`` is copied line by line to
    'predictions/<candidate_number> + <model_number>.dot', with node labels
    replaced by the candidate's lexemes (plus their frequency) and predicted
    nodes/edges coloured blue. Nothing is written if no such DOT file exists.

    Args:
        ori_dict: dict from model node ids to the matching candidate ids.
        pred_dict: dict from unmatched model node ids to predicted lexemes
            (``'??'`` when no prediction could be generated).
        candidate_number: identifier of the candidate, used in the output name.
        model_number: identifier (prefix) of the model DOT file.
    """
    dot_filenames = glob.glob(join('demonette-glawinette_graph', model_number + '*'))
    if not dot_filenames: # dot file not found
        return
    dot_filename = dot_filenames[0]
    out_filename = join('predictions', candidate_number + ' + ' + model_number + '.dot')
    # Both files are managed by `with` so they are closed even if a line
    # fails to process; the input is opened first so that a missing input
    # does not leave an empty output file behind.
    with codecs.open(dot_filename, 'r', encoding='latin-1') as f, \
            codecs.open(out_filename, 'w+', encoding='latin-1') as f_out:
        for line in f:
            elements = line.split()
            if '{' in line or '}' in line:  # first and last line
                f_out.write(line)
            elif not elements:
                # Blank line: copy as is (indexing elements[0] would fail).
                f_out.write(line)
            elif '->' in line:
                # Edge line: strip the 'G: ...' part and the 'xxx: ' label prefix.
                line = re.sub(r'G: [A-zÀ-ú-]*', '', line)
                line = re.sub(r'label="[A-z]*: ', 'label="', line)
                source = elements[0].replace('"', '')
                target = elements[2].replace('"', '') if len(elements) > 2 else None
                if source in pred_dict or target in pred_dict:
                    # An edge touching a predicted node is drawn in blue.
                    line = re.sub(r'\];', ', color=blue, fontcolor=blue];', line)
                f_out.write(line)
            else:
                lexeme_cat = elements[0].replace('"', '')
                if lexeme_cat in ori_dict:
                    # Node matched in the candidate: show the candidate's lexeme.
                    pred = ori_dict.get(lexeme_cat)
                    line = line.replace('label="' + lexeme_cat.split('_')[0], 'label="' + pred.split('_')[0])
                    line = line.replace('"]', ', ' + str(get_frequency(pred)) + '"]')
                elif lexeme_cat in pred_dict:
                    # Predicted node: show the generated lexeme in blue.
                    pred = pred_dict.get(lexeme_cat)
                    line = line.replace('label="' + lexeme_cat.split('_')[0], 'label="' + pred.split('_')[0])
                    line = line.replace('];', ', color=blue, fontcolor=blue];')
                    if pred != '??':
                        line = line.replace('", color', ', ' + str(get_frequency(pred)) + '", color')
                # Lines naming a node known to neither dict are copied
                # unchanged (they used to raise AttributeError on None.split).
                f_out.write(line)

# Generate predictions for one family

In [None]:
# Complete one candidate family against every pickled model of one model
# group, keep the best-scoring model and write the prediction graph for it.
binary_dir = 'demonette-glawinette_graph_binary'
candidate_number = 'F06606-01'
model_group_number = 'F01896'
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(join(binary_dir, candidate_number), 'rb') as candidate_file:
    candidate = pickle.load(candidate_file)
families = glob.glob(join(binary_dir, model_group_number + '*'))
max_score = -1
subgraph = dict()
equivalence = dict()
chosen_model_number = model_group_number
for family in families:
    with open(family, 'rb') as model_file:
        model = pickle.load(model_file)
    new_subgraph, new_equivalence, score = completion(candidate, model)
    if score > max_score:
        max_score = score
        subgraph = new_subgraph
        equivalence = new_equivalence
        # basename() works with any path separator; splitting on '\\' only
        # worked with Windows-style paths and raised IndexError elsewhere.
        chosen_model_number = os.path.basename(family)
generate_prediction_graph(subgraph, equivalence, candidate_number, chosen_model_number)

# Generate predictions automatically for all concepts

In [7]:
# Family identifiers left out of the file listings below.
ignored = [
    'F06082', 'F06086', 'F06138', 'F04553', 'F04843', 'F04879',
    'F04942', 'F04945', 'F05607', 'F05702', 'F05920', 'F05956',
    'F05989', 'F05963', 'F06003', 'F06020', 'F06027', 'F06030',
    'F06032', 'F06038', 'F06048', 'F06049', 'F06050', 'F06067',
    'F06072', 'F06085', 'F06102', 'F06127', 'F06129', 'F06139',
    'F06165', 'F06167', 'F06168', 'F06188', 'F06192', 'F06197',
]
# Regular files of the graph directory, sorted, whose first
# whitespace-separated token of the file name is not ignored.
graph_filenames = sorted(
    name for name in listdir('demonette-glawinette_graph')
    if isfile(join('demonette-glawinette_graph', name)) and name.split()[0] not in ignored
)
# Regular files of the binary directory, sorted, whose full name is not ignored.
graph_binary_filenames = sorted(
    name for name in listdir('demonette-glawinette_graph_binary')
    if isfile(join('demonette-glawinette_graph_binary', name)) and name not in ignored
)

In [8]:
# Build the concept graph L from the simplified DOT file.
L = nx.DiGraph()
with codecs.open(join('demonette-glawinette_posets', 'maxgraph_simplified.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:
            # Edge line 'a -> b': stored reversed, as an edge from b to a.
            v = line.split()
            L.add_edge(v[2], v[0])
        elif 'Attribute' in line:
            # Node line: '|'-separated fields, with items inside a field
            # separated by a literal backslash-n.
            vertex_id = line.split()[0]
            # Last field, minus the 'Object ' markers: indices into
            # graph_binary_filenames (the piece after the final '\n' is dropped).
            extent_indices = line.replace('Object ', '').split('|')[-1].split('\\n')[:-1]
            introduced_extent = [graph_binary_filenames[int(e)] for e in extent_indices]
            # Raw strings: '\)' in a plain string literal is an invalid
            # escape sequence and triggers a warning on recent Python versions.
            extent_size = re.search(r'E: (.*)\)', line).group(1)
            L.add_node(vertex_id,
                       introduced_intent=int(line.split('|')[1].replace('Attribute ', '').replace('\\n', '')),
                       extent_size=extent_size,  # kept as the matched text, not converted
                       introduced_extent=introduced_extent,
                       concept_number=re.search(r'<(.*)>', line).group(1))
vertices = list(L.nodes)

In [9]:
# Display the attributes stored on one vertex of L; this is the vertex id the
# prediction loop further down is restricted to.
L.nodes['1628289406']

{'introduced_intent': 6717,
 'extent_size': '125',
 'introduced_extent': ['F06606-0', 'F06606-1'],
 'concept_number': '6786'}

In [15]:
# For each candidate introduced by a concept, try every model introduced by
# the concept's leaf descendants in L, keep the best-scoring model and write
# the corresponding prediction graph.
binary_dir = 'demonette-glawinette_graph_binary'
for v in vertices:
    # Currently restricted to a single concept; remove this filter to
    # process every vertex of L.
    if v != '1628289406':
        continue
    descendants = nx.descendants(L, v)
    for candidate_number in L.nodes[v]['introduced_extent']:
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(join(binary_dir, candidate_number), 'rb') as candidate_file:
            candidate = pickle.load(candidate_file)
        max_score = -1000
        subgraph = dict()
        equivalence = dict()
        # Reset per candidate: otherwise the name is unbound on the first
        # candidate, or silently keeps the previous candidate's model, when
        # no model scores above max_score.
        chosen_model_number = None
        for d in descendants:
            if len(list(L.neighbors(d))) > 0:  # not leaf
                continue
            model_group_numbers = L.nodes[d]['introduced_extent']
            for family in model_group_numbers:
                with open(join(binary_dir, family), 'rb') as model_file:
                    model = pickle.load(model_file)
                new_subgraph, new_equivalence, score = completion(candidate, model)
                print(family, score)
                if score > max_score:
                    max_score = score
                    subgraph = new_subgraph
                    equivalence = new_equivalence
                    chosen_model_number = family
        if chosen_model_number is None:
            print(candidate_number, 'no model selected')
            continue
        generate_prediction_graph(subgraph, equivalence, candidate_number, chosen_model_number)

F03705 -2
F01918 -3
F03502 -2
F00477 -3
F01247 -3
F00994 17
F01854 0
F02475 -3
F00303 1
F03188 -3
F01504 -4
F00779 -17
F00414 -22
F00100 -23
F00671 -50
F00576 11568
F01899 15
F02428 16
F00804 4
F01047 2
F00162 -4
F00299 2
F03083 -7
F02231 -4
F03445 0
F03571 -17
F01366 1
F02952 -4
F02474 -4
F03924 -5
F01651 -17
F00387 -15
F01667 10
F00771 4
F01319 -8
F00331 -7
F00505 -5
F00748 0
F00444 -3
F01275 -30
F01934 -6
F00510 -37
F01532 -13
F00522 -13
F01286 0
F00179 2
F02782 -19
F01390 -5
F01100 -7
F01776 -6
F01701 0
F02090 1
F00885 5
F02632 -33
F01998 -15
F00174 -23
F00402 -10
F02313 -13
F00244 -67
F00049 17
F01475 -23
F01838 -21
F02684 -28
F01744 -35
F01544 0
F03844 -9
F02823 -24
F00684 -20
F03668 -19
F01610 -1
F01901 -2
F00620 -26
F02648 -10
F00648 -7
F02010 0
F02241 11
F00042 -4
F01822 7
F00177 -4
F00306 -30
F00407 -15
F00880 11585
F00796 -9
F01223 9
F01391 7
F02392 -7
F00791 -4
F01311 -25
F01355 -15
F03019 -11
F01964 11608
F00021 -9
F01896 -4
F00999 8
F00292 0
F03705 1
F01918 10
F03502 8
F0