In [None]:
import codecs
import networkx as nx
import pandas as pd
import pickle
import re
from difflib import SequenceMatcher
from networkx.algorithms import isomorphism
from os import listdir
from os.path import isfile, join
from utils import printProgressBar

In [None]:
# Load the lexeme frequency table. The first CSV row is the header and the first
# column becomes the index; get_max_frequency() looks rows up by that index and
# reads their 'freq' column.
frequencies = pd.read_csv('frequencies-frcowvec.csv', header=0, index_col=0)
# Show (rows, columns) as a quick sanity check of the load.
frequencies.shape

In [None]:
# Names of the regular files starting with 'F' in the two graph directories.
# Later cells index both lists with the same integer object index, so the two
# lists are expected to be parallel (entry i of each describes the same graph).
# NOTE(review): os.listdir() returns entries in arbitrary order, so this pairing
# (and the mapping from lattice object indices to files) depends on the
# directory listing order -- confirm it matches the order used when the lattice
# was generated before changing or sorting these lists.
graph_filenames = [f for f in listdir('D-graph') if isfile(join('D-graph', f)) and f.startswith('F')]
graph_binary_filenames  = [f for f in listdir('D-graph-binary') if isfile(join('D-graph-binary', f)) and f.startswith('F')]

# 1. Detection of missing derivations

Lattice reconstruction

In [None]:
# Full extent of every node line that mentions an attribute, keyed by the vertex
# id (first whitespace-separated token of the line). The object list is read
# from the last '|'-separated field, whose entries are separated by a literal
# backslash-n sequence.
full_extent_dict = dict()
with codecs.open(join('D-posets', 'aoc_full.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if 'Attribute' in line:
            full_extent_dict[line.split()[0]] = set(line.replace('Object ', '').split('|')[-1].split('\\n')[:-1])
        elif '->' in line:
            # Parsing stops at the first edge line; node lines are expected before it.
            break

# Rebuild the simplified poset as a directed graph.
L = nx.DiGraph()
with codecs.open(join('D-posets', 'aoc_simplified.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:
            # Edge line "a -> b": stored reversed (b -> a), so L.neighbors(b)
            # yields a; later cells treat those neighbours as children.
            v = line.split()
            L.add_edge(v[2], v[0])
        elif 'Attribute' in line:
            vertex_id = line.split()[0]
            introduced_extent = line.replace('Object ', '').split('|')[-1].split('\\n')[:-1]
            # Raw string: the original '\)' inside a normal string literal is an
            # invalid escape sequence (a warning today, an error in future Pythons).
            # The value is kept as a string; consumers convert it with int().
            extent_size = re.search(r'E: (.*)\)', line).group(1)
            L.add_node(vertex_id,
                       introduced_intent=int(line.split('|')[1].replace('Attribute ', '').replace('\\n', '')),
                       extent_size=extent_size,
                       introduced_extent=introduced_extent,
                       concept_number=re.search('<(.*)>', line).group(1))

Helper functions: category conversion, frequency lookup, missing-lexeme guessing and analogy generation

In [None]:
# Maps the category suffix carried by a lexeme label (the part after the last
# underscore) to the category tag used in the keys looked up in `frequencies`.
# Suffixes that are not listed here are kept unchanged by frcowvec_cat_conversion().
frcowvec_categories = {'Nm': 'NOM', 'Nf': 'NOM', 'Nmp': 'NOM', 'Nfp': 'NOM', 'Nx': 'NOM', 'More': 'NOM',
                       'Npm': 'NAM', 'Npf': 'NAM', 'Npx': 'NAM', 'Npmp': 'NAM', 'Npfp': 'NAM',
                       'IJ': 'INT', 'Adj': 'ADJ', 'V': 'VER', 'Num': 'NUM', 'Pro': 'PRO', 'Adv': 'ADV'}

def frcowvec_cat_conversion(lexeme):
    """Rewrite the category suffix of a 'lemma_CAT' label using frcowvec_categories.

    The label is split on its LAST underscore, so a lemma that itself contains
    underscores is preserved in full (the previous implementation kept only the
    text before the first underscore).

    Parameters
    ----------
    lexeme : str
        Label of the form 'lemma_CAT'.

    Returns
    -------
    str
        'lemma_NEWCAT', where NEWCAT is the mapped category, or the original
        category when it has no entry in frcowvec_categories.
    """
    lemma, separator, old_cat = lexeme.rpartition('_')
    if not separator:
        # No underscore at all: keep the historical result, in which the whole
        # string serves both as lemma and as category.
        lemma = old_cat
    new_cat = frcowvec_categories.get(old_cat, old_cat)
    return lemma + '_' + new_cat

def get_max_frequency(input_str):
    """Return the highest 'freq' value found for the lexemes listed in input_str.

    input_str is a ', '-separated list of 'lemma_CAT' labels. The placeholder
    'XX' is ignored, as are labels that have no row in the global `frequencies`
    table (lookups raising KeyError). Returns 0 when nothing is found.
    """
    best = 0
    for candidate in input_str.split(', '):
        if candidate != 'XX':
            try:
                value = frequencies.loc[frcowvec_cat_conversion(candidate)]['freq']
            except KeyError:
                # Unknown lexeme: contributes nothing to the maximum.
                continue
            if value > best:
                best = value
    return best

def guess_missing_lexemes(input_str):
    """Guess the missing fourth term of each analogy 'a1 : a2 = b1 : ?'.

    Example input:
    "micocoulier_Nm : micocoule_Nf = cotonéaster_Nm : ?, micocoulier_Nm : micocouleraie_Nf = cotonéaster_Nm : ?"

    For every comma-separated analogy, the forms a1, a2 and b1 (text before the
    first underscore of each term) are wrapped in braces, so that the start and
    end of a word are part of the compared strings. The longest common substring
    of a1 and a2 is deleted from both; the remainder of a1 is then replaced by
    the remainder of a2 inside b1. If that replacement changes nothing although
    a1 and a2 differ, the guess is 'XX'; otherwise it is the rewritten form
    followed by the category of a2.

    Parameters
    ----------
    input_str : str
        ', '-separated analogies; spaces are ignored. May be empty.

    Returns
    -------
    str
        ', '-separated guesses, one per analogy, or '' when input_str holds no
        analogy (previously an empty string raised IndexError).
    """
    guesses = []
    for complete_str in input_str.replace(' ', '').split(','):
        if not complete_str:
            # Empty fragment (empty input or stray comma): nothing to solve.
            continue
        sides = complete_str.split('=')
        a_terms = sides[0].split(':')
        a1 = '{' + a_terms[0].split('_')[0] + '}'
        a2 = '{' + a_terms[1].split('_')[0] + '}'
        b1 = '{' + sides[1].split(':')[0].split('_')[0] + '}'
        a2_cat = a_terms[1].split('_')[1]
        match = SequenceMatcher(None, a1, a2).find_longest_match(0, len(a1), 0, len(a2))
        common = a1[match.a:match.a + match.size]
        a1_suffix = a1.replace(common, '')
        a2_suffix = a2.replace(common, '')
        b2 = b1.replace(a1_suffix, a2_suffix)
        if b2 == b1 and a1 != a2:
            # The a1 -> a2 change could not be applied to b1.
            guesses.append('XX')
        else:
            guesses.append(b2.replace('{', '').replace('}', '') + '_' + a2_cat)
    return ', '.join(guesses)

def generate_analogy(G_parent, G_child):
    """Build analogies 'x : n = y : ?' for the child nodes missing from the parent.

    G_parent is matched against subgraphs of G_child (nodes and edges must agree
    on their 'label' attribute). The child nodes left out of the LAST mapping
    produced by the matcher are the "extra" nodes. For each extra node n, x is a
    child node linked to n (the source of its first incoming edge, else the
    target of its first outgoing edge) and y is the parent node x is mapped to.
    Extra nodes whose anchor x is itself unmapped are skipped.

    Returns the analogies joined by ', ' ('' when there are none).
    """
    matcher = isomorphism.DiGraphMatcher(
        G_child, G_parent,
        node_match=lambda v1, v2: v1['label'] == v2['label'],
        edge_match=lambda e1, e2: e1['label'] == e2['label'])
    unmatched = set()
    # Every mapping is visited; only the last one (and its difference) is kept.
    for subgraph in matcher.subgraph_isomorphisms_iter():
        unmatched = G_child.nodes - subgraph
    parts = []
    for n in unmatched:
        incoming = list(G_child.in_edges(n))
        if len(incoming) > 0:
            anchor = incoming[0][0]
        else:
            anchor = list(G_child.out_edges(n))[0][1]
        try:
            parts.append(anchor + ' : ' + n + ' = ' + subgraph.get(anchor) + ' : ?')
        except TypeError:
            # subgraph.get() returned None: the anchor has no counterpart in the parent.
            pass
    return ', '.join(parts)

def generate_analogy2(G_parent, G_sibling, G_child):
    """Build analogies 'x : n = s : ?' projecting the child's extra nodes onto a sibling.

    G_parent is matched against subgraphs of both G_child and G_sibling (nodes
    and edges must agree on their 'label' attribute); the LAST mapping yielded by
    each matcher is used. For each child node n outside the child mapping, x is a
    child node linked to n (the source of its first incoming edge, else the
    target of its first outgoing edge) and s is the sibling node mapped to the
    same parent node as x.

    Extra nodes for which no such sibling node exists are skipped. Previously
    that case reused the sibling found for the preceding node, or raised
    UnboundLocalError/NameError when there was none or when the sibling had no
    mapping at all.

    Returns the analogies joined by ', ' ('' when there are none).
    """
    def _same_label(x, y):
        return x['label'] == y['label']

    GMpc = isomorphism.DiGraphMatcher(G_child, G_parent, node_match=_same_label, edge_match=_same_label)
    GMps = isomorphism.DiGraphMatcher(G_sibling, G_parent, node_match=_same_label, edge_match=_same_label)

    node_diff = set()
    child_mapping = {}
    # NOTE(review): all mappings are enumerated and only the last is kept, which
    # is the historical behaviour but can be slow on graphs with many embeddings.
    for child_mapping in GMpc.subgraph_isomorphisms_iter():
        node_diff = G_child.nodes - child_mapping
    sibling_mapping = {}
    for sibling_mapping in GMps.subgraph_isomorphisms_iter():
        pass

    # Invert sibling -> parent into parent -> sibling. If several sibling nodes
    # shared a parent node, the last one in iteration order wins, as before.
    parent_to_sibling = {parent_node: sibling_node
                         for sibling_node, parent_node in sibling_mapping.items()}

    parts = []
    for n in node_diff:
        incoming = list(G_child.in_edges(n))
        if len(incoming) > 0:
            anchor = incoming[0][0]
        else:
            anchor = list(G_child.out_edges(n))[0][1]
        anchor_in_parent = child_mapping.get(anchor)
        if anchor_in_parent is None:
            # The anchor is itself an extra node: nothing to project.
            continue
        anchor_sibling = parent_to_sibling.get(anchor_in_parent)
        if anchor_sibling is None:
            continue
        parts.append(anchor + ' : ' + n + ' = ' + anchor_sibling + ' : ?')
    return ', '.join(parts)

Compare every pair of neighboring concepts

In [None]:
vertices = list(L.nodes)
edge_counter = 0
L_size = L.size()
binary_dir = 'D-graph-binary'
# Concept pairs whose child/parent extent-size ratio is below this are skipped.
min_ext_ratio = 0.8
lattice_columns = ['concept_pair', 'parent_ext', 'child_ext', 'ext_ratio', 'ante_node_count',
                   'cons_node_count', 'missing_node_count', 'parent', 'child', 'analogy']
# Rows are collected in a list and converted once at the end: DataFrame.append
# copied the whole frame on every call and no longer exists in pandas >= 2.0.
lattice_rows = []


def _load_graph(object_index):
    """Unpickle the graph stored for object_index and close the file afterwards.

    NOTE: pickle can execute arbitrary code; only load files you produced yourself.
    """
    with open(join(binary_dir, graph_binary_filenames[int(object_index)]), 'rb') as graph_file:
        return pickle.load(graph_file)


def _graph_hyperlink(object_index, marker=''):
    """Spreadsheet HYPERLINK formula pointing at the graph file of object_index."""
    name = graph_filenames[int(object_index)]
    return '=HYPERLINK("' + join('D-graph', name) + '", "' + name + marker + '")'


for vertex in vertices:
    current_extent_size = int(L.nodes[vertex]['extent_size'])
    # The node count of the parent concept is taken from its first introduced object.
    G_parent = _load_graph(L.nodes[vertex]['introduced_extent'][0])
    current_graph_len = len(G_parent)
    for child in L.neighbors(vertex):
        edge_counter += 1
        printProgressBar(edge_counter, L_size, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)
        child_extent_size = int(L.nodes[child]['extent_size'])
        ext_ratio = child_extent_size / current_extent_size
        if ext_ratio < min_ext_ratio:
            continue
        one_introduced_extent = L.nodes[child]['introduced_extent'][0]
        one_G_child = _load_graph(one_introduced_extent)
        child_graph_len = len(one_G_child)
        node_count_diff = child_graph_len - current_graph_len
        # Fields shared by every row produced for this (parent, child) pair.
        shared_fields = {
            'child': _graph_hyperlink(one_introduced_extent),
            'parent_ext': current_extent_size,
            'child_ext': child_extent_size,
            'ext_ratio': ext_ratio,
            'ante_node_count': current_graph_len,
            'cons_node_count': child_graph_len,
            'missing_node_count': node_count_diff,
            'concept_pair': L.nodes[vertex]['concept_number'] + '-' + L.nodes[child]['concept_number'],
        }

        # Objects introduced by the parent concept itself (flagged with ' **').
        introduced_extent = set(L.nodes[vertex]['introduced_extent'])
        for e in introduced_extent:
            G_parent = _load_graph(e)
            lattice_rows.append({**shared_fields,
                                 'parent': _graph_hyperlink(e, ' **'),
                                 'analogy': generate_analogy(G_parent, one_G_child)})

        # Remaining objects of the parent's full extent that are absent from the
        # child's. G_parent is deliberately the last graph loaded above, as in
        # the original flow.
        extent_diff = full_extent_dict[vertex] - full_extent_dict[child] - introduced_extent
        for e in extent_diff:
            G_sibling = _load_graph(e)
            lattice_rows.append({**shared_fields,
                                 'parent': _graph_hyperlink(e),
                                 'analogy': generate_analogy2(G_parent, G_sibling, one_G_child)})

df_lattice = pd.DataFrame(lattice_rows, columns=lattice_columns)
df_lattice['missing'] = df_lattice['analogy'].apply(guess_missing_lexemes)
df_lattice['max_frequency'] = df_lattice['missing'].apply(get_max_frequency)

In [None]:
# Write the missing-derivation candidates to an Excel workbook, without the
# DataFrame index; the 'parent' and 'child' cells hold =HYPERLINK(...) formulas.
df_lattice.to_excel('D_missing_deriv.xlsx', index=False)

# 2. Detection of false derivations

Lattice reconstruction

In [None]:
# Rebuild the simplified poset, this time annotating each attribute-bearing node
# with the size of its introduced extent and the node count of one of its graphs.
L = nx.DiGraph()
binary_dir = 'D-graph-binary'
with codecs.open(join('D-posets', 'aoc_simplified.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:
            # Edge line "a -> b": stored reversed (b -> a), so L.neighbors(b)
            # yields a; later cells treat those neighbours as children.
            v = line.split()
            L.add_edge(v[2], v[0])
        elif 'Attribute' in line:
            vertex_id = line.split()[0]
            introduced_extent = line.replace('Object ', '').split('|')[-1].split('\\n')[:-1]
            introduced_extent_size = line.count('Object')
            # Load the graph of the first introduced object only to record its
            # node count. The context manager closes the file, which the bare
            # open() call never did.
            # NOTE: pickle can execute arbitrary code; only load trusted files.
            with open(join(binary_dir, graph_binary_filenames[int(introduced_extent[0])]), 'rb') as graph_file:
                G = pickle.load(graph_file)
            L.add_node(vertex_id,
                       introduced_extent=introduced_extent,
                       introduced_extent_size=introduced_extent_size,
                       concept_number=re.search('<(.*)>', line).group(1),
                       node_count=len(G))

Compare every pair of neighboring concepts

In [None]:
vertices = list(L.nodes)
edge_counter = 0
L_size = L.size()
# A pair is examined only if the child graph has at most this many more nodes
# than the parent graph ...
max_node_diff = 2
# ... and the child introduces at most this fraction of the objects the parent introduces.
max_intr_ext_ratio = 0.3
false_deriv_columns = ['concept_pair', 'parent_introduced_extent', 'child_introduced_extent',
                       'introduced_extent_ratio',
                       'parent_node_count', 'child_node_count', 'node_count_diff',
                       'parent', 'child',
                       'spurious_node', 'spurious_node_freq']
# Rows are collected in a list and converted once at the end: DataFrame.append
# copied the whole frame on every call and no longer exists in pandas >= 2.0.
false_deriv_rows = []


def _load_graph(object_index):
    """Unpickle the graph stored for object_index and close the file afterwards.

    NOTE: pickle can execute arbitrary code; only load files you produced yourself.
    """
    with open(join(binary_dir, graph_binary_filenames[int(object_index)]), 'rb') as graph_file:
        return pickle.load(graph_file)


def _graph_hyperlink(object_index):
    """Spreadsheet HYPERLINK formula pointing at the graph file of object_index."""
    name = graph_filenames[int(object_index)]
    return '=HYPERLINK("' + join('D-graph', name) + '", "' + name + '")'


for vertex in vertices:
    current_extent_size = int(L.nodes[vertex]['introduced_extent_size'])
    current_graph_len = L.nodes[vertex]['node_count']
    one_parent_extent = L.nodes[vertex]['introduced_extent'][0]
    G_parent = _load_graph(one_parent_extent)
    for child in L.neighbors(vertex):
        edge_counter += 1
        printProgressBar(edge_counter, L_size, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)
        node_diff = L.nodes[child]['node_count'] - current_graph_len
        intr_ext_ratio = int(L.nodes[child]['introduced_extent_size']) / current_extent_size
        if node_diff > max_node_diff or intr_ext_ratio > max_intr_ext_ratio:
            continue
        concept_pair = L.nodes[vertex]['concept_number'] + '-' + L.nodes[child]['concept_number']
        for e in L.nodes[child]['introduced_extent']:
            G_child = _load_graph(e)
            GM = isomorphism.DiGraphMatcher(G_child, G_parent,
                                            node_match=lambda v1, v2: v1['label'] == v2['label'],
                                            edge_match=lambda e1, e2: e1['label'] == e2['label'])
            spurious_nodes = []
            # Frequency of the least frequent spurious node. It stays None when
            # the parent cannot be embedded in the child or nothing is left
            # over; the original then looked up G_child.nodes[''] and raised
            # KeyError.
            spurious_node_freq = None
            for subgraph in GM.subgraph_isomorphisms_iter():
                for candidate in G_child.nodes - subgraph:
                    spurious_nodes.append(candidate)
                    candidate_freq = G_child.nodes[candidate]['frequency']
                    if spurious_node_freq is None or candidate_freq < spurious_node_freq:
                        spurious_node_freq = candidate_freq
                break  # only the first embedding is used, to avoid double detection
            false_deriv_rows.append({
                'parent': _graph_hyperlink(one_parent_extent),
                'child': _graph_hyperlink(e),
                'parent_introduced_extent': current_extent_size,
                'child_introduced_extent': L.nodes[child]['introduced_extent_size'],
                'introduced_extent_ratio': intr_ext_ratio,
                'parent_node_count': current_graph_len,
                'child_node_count': L.nodes[child]['node_count'],
                'node_count_diff': node_diff,
                'spurious_node': ', '.join(spurious_nodes),
                'spurious_node_freq': spurious_node_freq,
                'concept_pair': concept_pair,
            })

df_lattice = pd.DataFrame(false_deriv_rows, columns=false_deriv_columns)

In [None]:
# Write the false-derivation candidates to an Excel workbook, without the
# DataFrame index; the 'parent' and 'child' cells hold =HYPERLINK(...) formulas.
df_lattice.to_excel('D_false_deriv.xlsx', index=False)

In [None]:
# Preview the first rows of the false-derivation table.
df_lattice.head()