In [None]:
import codecs
import networkx as nx
import os
import pandas as pd
import pickle
import re
from difflib import SequenceMatcher
from os import listdir
from os.path import isfile, join
from networkx.algorithms import isomorphism
from networkx.drawing.nx_pydot import write_dot
from utils import printProgressBar

In [None]:
# Frequency table read from CSV: the first row supplies the column names and
# the first column becomes the index (get_frequency looks rows up by that index).
frequencies = pd.read_csv(
    'frequencies-frcowvec.csv',
    index_col=0,
    header=0,
)
# Last expression of the cell: shows (number of rows, number of columns).
frequencies.shape

In [None]:
# Translation from the category tag found after the last '_' of a lexeme key
# (e.g. the 'Adj' of 'héraclitien_Adj') to the tag used in the index of the
# `frequencies` table.
frcowvec_categories = {'Nm': 'NOM', 'Nf': 'NOM', 'Nmp': 'NOM', 'Nfp': 'NOM', 'Nx': 'NOM', 'More': 'NOM',
                       'Npm': 'NAM', 'Npf': 'NAM', 'Npx': 'NAM', 'Npmp': 'NAM', 'Npfp': 'NAM',
                       'Adj': 'ADJ', 'V': 'VER', 'Num': 'NUM', 'Pro': 'PRO', 'Adv': 'ADV'}

def get_frequency(lex_and_cat):
    """Look up the frequency of a '<lexeme>_<category>' key.

    The category (text after the last '_') is translated through
    `frcowvec_categories`; the lexeme (text before the first '_') and the
    translated category are joined with '_' and looked up in the index of the
    module-level `frequencies` table.

    Parameters
    ----------
    lex_and_cat : str
        Key such as 'héraclitien_Adj'.

    Returns
    -------
    The value of the 'freq' column for the translated key, or 0 when the key
    has no '_', when its category has no translation, or when the translated
    key is absent from the table.
    """
    if '_' not in lex_and_cat:
        return 0
    old_cat = lex_and_cat.split('_')[-1]
    new_cat = frcowvec_categories.get(old_cat)
    if new_cat is None:
        # Unknown category tag: there is nothing to look up. Without this
        # guard the concatenation below raised TypeError (str + None).
        return 0
    new_lex_and_cat = lex_and_cat.split('_')[0] + '_' + new_cat
    try:
        return frequencies.loc[new_lex_and_cat]['freq']
    except KeyError:
        return 0

In [None]:
# Names listed here are left out of the two file lists built below.
ignored = []

# Regular files found directly in 'DG-graph', sorted by name. The ignore test
# is applied to the first whitespace-separated token of each file name.
graph_filenames = sorted(
    name for name in listdir('DG-graph')
    if isfile(join('DG-graph', name)) and name.split()[0] not in ignored
)

# Regular files found directly in 'DG-graph-binary', sorted by name. Here the
# ignore test is applied to the complete file name.
graph_binary_filenames = sorted(
    name for name in listdir('DG-graph-binary')
    if isfile(join('DG-graph-binary', name)) and name not in ignored
)

In [None]:
# Extent of every concept, keyed by the vertex id (first token of the node
# line). Each value is the set of entries found in the last '|'-separated
# field of the node line, split on the literal two-character sequence '\n'
# (the piece after the final separator is dropped).
full_extent_dict = dict()
with codecs.open(join('DG-posets', 'aoc-full.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if 'Attribute' in line:
            full_extent_dict[line.split()[0]] = set(line.replace('Object ', '').split('|')[-1].split('\\n')[:-1])
        elif '->' in line:
            # Reading stops at the first edge line.
            break

# Directed graph built from the simplified poset file. An edge line
# "a -> b" is stored as the edge (b, a), i.e. reversed with respect to the file.
L = nx.DiGraph()
with codecs.open(join('DG-posets', 'aoc-simplified.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:
            v = line.split()
            L.add_edge(v[2], v[0])
        elif 'Attribute' in line:
            vertex_id = line.split()[0]
            introduced_extent = line.replace('Object ', '').split('|')[-1].split('\\n')[:-1]
            # Raw string: '\)' is an invalid escape sequence in a normal string
            # literal and triggers a SyntaxWarning on recent Python versions.
            extent_size = re.search(r'E: (.*)\)', line).group(1)
            introduced_intent = int(line.split('|')[1].replace('Attribute ', '').replace('\\n', ''))
            concept_number = re.search('<(.*)>', line).group(1)
            # extent_size and concept_number are stored as strings, exactly as captured.
            L.add_node(vertex_id,
                       introduced_intent=introduced_intent,
                       extent_size=extent_size,
                       introduced_extent=introduced_extent,
                       concept_number=concept_number)
vertices = list(L.nodes)

In [None]:
def generate_lexeme(a1, a2, b1):
    """Build the form that is to b1 what a2 is to a1.

    All three arguments are '<lexeme>_<category>' keys. The lexemes are wrapped
    in '{' and '}' before being compared; the braces are removed again from the
    returned form. The longest block shared by a1 and a2 is deleted from both,
    and what remains of a1 is replaced inside b1 by what remains of a2.

    Returns the new lexeme followed by '_' and the category of a2, or 'XX' when
    the replacement leaves b1 unchanged although a1 and a2 differ.
    """
    def braced(entry):
        return '{' + entry.split('_')[0] + '}'

    source_form, target_form, base_form = braced(a1), braced(a2), braced(b1)
    target_cat = a2.split('_')[1]

    matcher = SequenceMatcher(None, source_form, target_form)
    block = matcher.find_longest_match(0, len(source_form), 0, len(target_form))
    shared = source_form[block.a:block.a + block.size]

    # What is left of each form once the shared block has been removed.
    source_rest = source_form.replace(shared, '')
    target_rest = target_form.replace(shared, '')
    derived_form = base_form.replace(source_rest, target_rest)

    if source_form != target_form and derived_form == base_form:
        return 'XX'
    bare = derived_form.replace('{', '').replace('}', '')
    return bare + '_' + target_cat

def _strip_affixes(word, prefix, suffix):
    """Return `word` without its leading `prefix` and trailing `suffix`, or None if it lacks either."""
    if len(word) < len(prefix) + len(suffix):
        # Prefix and suffix would overlap inside the word.
        return None
    if not (word.startswith(prefix) and word.endswith(suffix)):
        return None
    return word[len(prefix):len(word) - len(suffix)]


def affix_in_glawi(a1, a2, constructions=None):
    """Tell whether the pair (a1, a2) instantiates one of the known affix patterns.

    A pattern has the form '<affix1>-<affix2>' where each side contains exactly
    one 'X' standing for the stem, e.g. 'Xeur-Xeuse' for
    brandilleur / brandilleuse. The pair matches a pattern when a1 fits the
    first side, a2 fits the second side, and both yield the same stem.

    Affixes are anchored: the text before 'X' must be at the very start of the
    word and the text after 'X' at its very end. (The earlier implementation
    removed the affix text wherever it occurred and left a word untouched when
    the affix was absent, so that 'chant' / 'chanteuse' wrongly matched
    'Xeur-Xeuse'.)

    Parameters
    ----------
    a1, a2 : str
        The two bare lexemes (no category suffix).
    constructions : iterable of str, optional
        Patterns to test. Defaults to the module-level `glawi_constructions`,
        which is not defined in the visible part of this notebook and must
        exist before the function is called without this argument.

    Returns
    -------
    bool
        True as soon as one pattern matches, False otherwise. Patterns that do
        not have exactly two '-'-separated sides, or whose sides do not contain
        exactly one 'X', are skipped.
    """
    if constructions is None:
        constructions = glawi_constructions
    for c in constructions:
        sides = c.split('-')
        if len(sides) != 2:
            continue
        affix1, affix2 = sides
        # Exactly one stem placeholder per side; more than one used to raise
        # ValueError when unpacking the split result.
        if affix1.count('X') != 1 or affix2.count('X') != 1:
            continue
        prefix1, suffix1 = affix1.split('X')
        prefix2, suffix2 = affix2.split('X')
        stem1 = _strip_affixes(a1, prefix1, suffix1)
        if stem1 is None:
            continue
        if stem1 == _strip_affixes(a2, prefix2, suffix2):
            return True
    return False

def edge_compare(e1, e2):
    """Edge-match predicate: two edges agree when their 'label' values are equal up to the first '$'.

    e1 and e2 are edge attribute mappings; only the text before the first '$'
    of each 'label' is compared (the whole label when it contains no '$').
    """
    first_head, _, _ = e1['label'].partition('$')
    second_head, _, _ = e2['label'].partition('$')
    return first_head == second_head

def inner_prediction(G_parent, G_child):
    """Predict, for G_parent, the lexemes corresponding to the nodes G_child has in excess.

    G_parent is searched for inside G_child (subgraph isomorphism with equal
    node 'label' attributes and edge labels compared by `edge_compare`). The
    nodes of G_child left out of the match are the "missing" nodes; for each of
    them a counterpart lexeme is proposed for G_parent.

    Returns a string made of entries that each end with ', ':
    '<lexeme>_<cat> (<frequency>), ' for a prediction, or 'XX, ' when no form
    could be generated. The string is empty when there is no match or no extra
    node.

    Relies on `get_frequency`, `generate_lexeme`, `affix_in_glawi` and on the
    global `demonette_lexemes`, which is not defined in the visible part of
    this notebook.
    """
    # G_child is the host graph: each mapping yielded below goes from G_child nodes to G_parent nodes.
    GM = isomorphism.DiGraphMatcher(G_child, G_parent, node_match=lambda v1,v2: v1['label'] == v2['label'], edge_match=edge_compare)
    node_diff = set()
    matching_count = 0
    for subgraph in GM.subgraph_isomorphisms_iter():
        # subgraph => {'héraclitien_Adj': 'élyséen_Adj', 'Héraclite_Npx': 'Elysée_Npx', 'héraclitéen_Nm': 'élyséen_Nm'}
        matching_count += 1
        node_diff = G_child.nodes - subgraph
    # node_diff => {'héraclitéen_Adj'}
    # Every mapping is enumerated, so `subgraph` and `node_diff` hold the values of the LAST one.
    # `matching_count` is counted but not used: the early exit below is disabled.
    #     if matching_count > 1:
    #         return '??'
    ret_str = ''
    # When no mapping was found, node_diff is empty and `subgraph` is never read.
    for n in node_diff:
        # Look for a matched G_child node with the same lexeme as n (only the category differs).
        homograph = ''
        for key in subgraph:
            if key.split('_')[0] == n.split('_')[0]:
                homograph = key
                break
        if homograph != '':
            # `key` still equals `homograph` here because the loop above stopped with break.
            # Prediction: lexeme of the G_parent counterpart + category of the missing node.
            pred = subgraph.get(key).split('_')[0] + '_' + n.split('_')[1]
            ret_str += pred + ' (' + str(get_frequency(pred)) +  '), '
        else:
            # No homograph: try an analogy with each matched pair until one yields an attested form.
            freq_cow = 0
            pred = ''
            exists = False
            for key in subgraph:
                # key : n :: subgraph[key] : pred
                pred = generate_lexeme(key, n, subgraph.get(key))
                if pred == 'XX':
                    continue
                freq_cow = get_frequency(pred)
                affix_exists = affix_in_glawi(subgraph.get(key).split('_')[0], pred.split('_')[0])
                # Attested = non-zero frequency, or lexeme listed in demonette_lexemes, or known affix pair.
                if freq_cow > 0 or pred.split('_')[0] in demonette_lexemes or affix_exists:
                    exists = True
                    break
            # Without a break, `pred` is the result for the last key tried: 'XX' is reported only
            # if that last attempt failed, and a generated but unattested form adds nothing.
            if freq_cow > 0 or exists:
                ret_str += pred + ' (' + str(freq_cow) + '), '
            elif pred == 'XX':
                ret_str += 'XX, '
    return ret_str

def _load_pickled_graph(extent_id):
    """Unpickle the graph stored for `extent_id` and close the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code; the files under
    # `binary_dir` must come from a trusted source.
    path = join(binary_dir, graph_binary_filenames[int(extent_id)])
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def prediction(G_parent, C_child):
    """Predict the lexemes missing from G_parent with respect to concept C_child.

    Each graph introduced by C_child is tried in turn with `inner_prediction`;
    the first result that contains no 'XX' is returned. If none qualifies, the
    graphs introduced by the descendants of C_child in `L` are tried: the part
    of each such graph that matches the last graph loaded for C_child is
    passed to `inner_prediction` instead.

    Parameters
    ----------
    G_parent : graph
        Graph to complete.
    C_child : node of `L`
        Concept whose graphs serve as the model.

    Returns
    -------
    str
        The prediction string without its trailing ', ', or '??' when nothing
        usable was found.

    Uses the module-level names `L`, `binary_dir` and `graph_binary_filenames`.
    """
    child_extents = L.nodes[C_child]['introduced_extent']
    G_child = None
    for e in child_extents:
        G_child = _load_pickled_graph(e)
        ret_str = inner_prediction(G_parent, G_child)
        if 'XX' not in ret_str:
            return ret_str[:-2]
    if G_child is None:
        # The concept introduces no graph: there is no pattern to look for in
        # the descendants (this case used to fail on the unbound G_child).
        return '??'
    # search grandchildren
    for c in nx.descendants(L, C_child):
        for e in L.nodes[c]['introduced_extent']:
            G_grchild = _load_pickled_graph(e)
            GM = isomorphism.DiGraphMatcher(G_grchild, G_child, node_match=lambda v1,v2: v1['label'] == v2['label'], edge_match=edge_compare)
            # Keep the last mapping produced, as before; None means no match.
            mapping = None
            for mapping in GM.subgraph_isomorphisms_iter():
                pass
            if mapping is None:
                # No match in this graph. Previously the mapping of an earlier
                # graph was silently reused here (or the name was unbound).
                continue
            ret_str = inner_prediction(G_parent, G_grchild.subgraph(mapping))
            if 'XX' not in ret_str:
                return ret_str[:-2]
    return '??'

In [None]:
edge_counter = 0
L_size = L.size()
binary_dir = 'DG-graph-binary'

# Edges whose child/parent extent-size ratio is below this value are skipped.
MIN_EXT_RATIO = 0.8
# Edges whose child graph has more than this many extra nodes are skipped.
MAX_MISSING_NODES = 1

lattice_columns = ['concept_pair', 'parent_ext', 'child_ext', 'ext_ratio', 'ante_node_count',
                   'cons_node_count', 'missing_node_count', 'parent_proper_extent', 'parent', 'child', 'missing']


def _load_extent_graph(extent_id):
    """Unpickle the graph stored for `extent_id` and close the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code; the files under
    # `binary_dir` must come from a trusted source.
    with open(join(binary_dir, graph_binary_filenames[int(extent_id)]), 'rb') as handle:
        return pickle.load(handle)


def _graph_link(extent_id, marker=''):
    """Spreadsheet HYPERLINK formula pointing at the graph file of `extent_id`."""
    name = graph_filenames[int(extent_id)]
    return '=HYPERLINK("DG-graph\\' + name + '", "' + name + marker + '")'


# Rows are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append no longer exists in pandas 2.x and copied the whole frame
# on every call.
lattice_rows = []
for vertex in vertices:
    current_extent_size = int(L.nodes[vertex]['extent_size'])
    one_G_parent = _load_extent_graph(L.nodes[vertex]['introduced_extent'][0])
    current_graph_len = len(one_G_parent)
    for child in L.neighbors(vertex):
        edge_counter += 1
        printProgressBar(edge_counter, L_size, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)
        child_extent_size = int(L.nodes[child]['extent_size'])
        ext_ratio = child_extent_size / current_extent_size
        if ext_ratio < MIN_EXT_RATIO:
            continue
        one_introduced_extent = L.nodes[child]['introduced_extent'][0]
        one_G_child = _load_extent_graph(one_introduced_extent)
        child_graph_len = len(one_G_child)
        node_count_diff = child_graph_len - current_graph_len
        if node_count_diff > MAX_MISSING_NODES:
            continue

        edge_only = node_count_diff == 0
        # Values shared by every row written for this (vertex, child) edge.
        common_fields = {
            'child': _graph_link(one_introduced_extent),
            'parent_ext': current_extent_size,
            'child_ext': child_extent_size,
            'ext_ratio': ext_ratio,
            'ante_node_count': current_graph_len,
            'cons_node_count': child_graph_len,
            'missing_node_count': node_count_diff,
            'concept_pair': L.nodes[vertex]['concept_number'] + '-' + L.nodes[child]['concept_number'],
        }

        # Graphs introduced by the parent concept itself (marked ' **' in the link text).
        # Sorted so that the row order does not depend on set iteration order.
        introduced_extent = set(L.nodes[vertex]['introduced_extent'])
        for e in sorted(introduced_extent, key=int):
            if edge_only:
                missing = '--missing edge only--'
            else:
                missing = prediction(_load_extent_graph(e), child)
            lattice_rows.append(dict(common_fields,
                                     parent=_graph_link(e, ' **'),
                                     parent_proper_extent='y',
                                     missing=missing))

        # Graphs in the parent's full extent that are neither introduced by the
        # parent nor part of the child's full extent.
        extent_diff = full_extent_dict[vertex] - full_extent_dict[child] - introduced_extent
        for e in sorted(extent_diff, key=int):
            if edge_only:
                missing = '--missing edge only--'
            else:
                G_other = _load_extent_graph(e)
                GM = isomorphism.DiGraphMatcher(G_other, one_G_parent, node_match=lambda v1,v2: v1['label'] == v2['label'], edge_match=edge_compare)
                # Keep the last mapping produced, as before; None means no match.
                mapping = None
                for mapping in GM.subgraph_isomorphisms_iter():
                    pass
                if mapping is None:
                    # Previously a mapping left over from another graph was
                    # reused here (or the name was unbound).
                    missing = '??'
                else:
                    missing = prediction(G_other.subgraph(mapping), child)
            lattice_rows.append(dict(common_fields,
                                     parent=_graph_link(e),
                                     parent_proper_extent='n',
                                     missing=missing))

df_lattice = pd.DataFrame(lattice_rows, columns=lattice_columns)

In [None]:
# Save the collected rows to an Excel workbook; the DataFrame index is not written.
df_lattice.to_excel('DG-missing.xlsx', index=False)