In [None]:
import ast
import codecs
import glob
import networkx as nx
import numpy as np
import os
import pandas as pd
import pickle
import re
from collections import Counter
from difflib import SequenceMatcher
from numpy import dot
from numpy.linalg import norm
from os import listdir
from os.path import isfile, join
from networkx.algorithms import isomorphism
from networkx.drawing.nx_pydot import write_dot
from utils import printProgressBar

# 1. Excel export of predicted missing lexemes (`DG-missing.xlsx`)

In [None]:
# Load the frequency table; the first CSV column becomes the row index, which
# get_frequency() later queries by label to read the 'freq' column.
frequencies = pd.read_csv('frequencies-frcowvec.csv', header=0, index_col=0)
# Show (rows, columns) as the cell output.
frequencies.shape

In [None]:
# Translation table from the category suffix carried by graph nodes to the
# category suffix used in the index of the `frequencies` table.
frcowvec_categories = {
    'Nm': 'NOM', 'Nf': 'NOM', 'Nmp': 'NOM', 'Nfp': 'NOM', 'Nx': 'NOM', 'More': 'NOM',
    'Npm': 'NAM', 'Npf': 'NAM', 'Npx': 'NAM', 'Npmp': 'NAM', 'Npfp': 'NAM',
    'IJ': 'INT',
    'Adj': 'ADJ',
    'V': 'VER',
    'Num': 'NUM',
    'Pro': 'PRO',
    'Adv': 'ADV',
}


def get_frequency(lex_and_cat):
    """Look up the frequency of a `lexeme_category` label.

    The text before the first underscore is taken as the lexeme and the text
    after the last underscore as the category; the category is translated
    through `frcowvec_categories` (unknown categories are kept unchanged).

    Returns the 'freq' value stored in `frequencies` for the rebuilt label, or
    0 when the label has no underscore or is absent from the table.
    """
    if '_' not in lex_and_cat:
        return 0
    lexeme = lex_and_cat.partition('_')[0]
    category = lex_and_cat.rpartition('_')[2]
    table_key = lexeme + '_' + frcowvec_categories.get(category, category)
    try:
        return frequencies.loc[table_key]['freq']
    except KeyError:
        return 0

In [None]:
# Names that must be left out of both listings (currently none).
ignored = []

# Regular files of 'DG-graph', in sorted order; a file is dropped when the
# first whitespace-separated token of its name is listed in `ignored`.
graph_filenames = sorted(
    name for name in listdir('DG-graph')
    if isfile(join('DG-graph', name)) and name.split()[0] not in ignored
)

# Regular files of 'DG-graph-binary', in sorted order; here the whole file
# name is compared against `ignored`.
graph_binary_filenames = sorted(
    name for name in listdir('DG-graph-binary')
    if isfile(join('DG-graph-binary', name)) and name not in ignored
)

In [None]:
# Map every vertex id of the full poset to the set of object ids listed in the
# last '|'-separated field of its line. Reading stops at the first edge line.
full_extent_dict = dict()
with codecs.open(join('DG-posets', 'aoc-full.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if 'Attribute' in line:
            full_extent_dict[line.split()[0]] = set(line.replace('Object ', '').split('|')[-1].split('\\n')[:-1])
        elif '->' in line:
            break

# Build the directed graph of the simplified poset. A DOT edge "a -> b" is
# stored reversed, as (b, a).
L = nx.DiGraph()
with codecs.open(join('DG-posets', 'aoc-simplified.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:
            v = line.split()
            L.add_edge(v[2], v[0])
        elif 'Attribute' in line:
            vertex_id = line.split()[0]
            # Object ids introduced by this concept (last '|' field, trailing empty item dropped).
            introduced_extent = line.replace('Object ', '').split('|')[-1].split('\\n')[:-1]
            # Raw string: '\)' in a plain string literal is an invalid escape sequence.
            extent_size = re.search(r'E: (.*)\)', line).group(1)
            introduced_intent = int(line.split('|')[1].replace('Attribute ', '').replace('\\n', ''))
            concept_number = re.search('<(.*)>', line).group(1)
            L.add_node(vertex_id,
                       introduced_intent=introduced_intent,
                       extent_size=extent_size,
                       introduced_extent=introduced_extent,
                       concept_number=concept_number)
vertices = list(L.nodes)

In [None]:
def generate_lexeme(a1, a2, b1):
    """Build the label that is to `b1` what `a2` is to `a1`.

    Each argument is a `lexeme_category` label. The lexemes are wrapped in
    braces so that word boundaries take part in the comparison, the longest
    common block of `a1` and `a2` is removed from both, and the remainder of
    `a1` is replaced by the remainder of `a2` inside `b1`.

    Returns `newlexeme_category` using the category of `a2`, or 'XX' when the
    substitution leaves `b1` unchanged although `a1` and `a2` differ.
    """
    source, target, base = ('{' + label.split('_')[0] + '}' for label in (a1, a2, b1))
    target_category = a2.split('_')[1]

    block = SequenceMatcher(None, source, target).find_longest_match(0, len(source), 0, len(target))
    shared = source[block.a:block.a + block.size]

    # What remains of each form once the shared block is taken out.
    source_rest = source.replace(shared, '')
    target_rest = target.replace(shared, '')

    derived = base.replace(source_rest, target_rest)
    if derived == base and source != target:
        return 'XX'
    return derived.replace('{', '').replace('}', '') + '_' + target_category

def _strip_affix_pattern(word, pattern):
    """Return the stem of `word` under a single-'X' pattern, or None if `word` does not fit it."""
    prefix, suffix = pattern.split('X')
    if len(word) < len(prefix) + len(suffix):
        return None  # prefix and suffix would overlap
    if not (word.startswith(prefix) and word.endswith(suffix)):
        return None
    return word[len(prefix):len(word) - len(suffix)]


def affix_in_glawi(a1, a2, constructions=None):
    """Check whether the affix alternation between two lexemes is a known construction.

    E.g. brandilleur-brandilleuse instantiates 'Xeur-Xeuse'.

    Parameters:
        a1, a2: the two lexemes (without category suffix).
        constructions: iterable of 'patternA-patternB' strings where 'X' marks
            the stem. Defaults to the module-level `glawi_constructions`, which
            must have been loaded before the call.

    Returns True when some construction has `a1` fitting its first pattern and
    `a2` fitting its second pattern with the same stem, False otherwise.
    Constructions that do not have exactly two patterns, or whose patterns do
    not contain exactly one 'X', are ignored.

    Unlike a plain `str.replace`, the affixes must actually sit at the start
    and end of the words, so unrelated identical words no longer match every
    construction.
    """
    if constructions is None:
        constructions = glawi_constructions
    for c in constructions:
        parts = c.split('-')
        if len(parts) != 2:
            continue
        affix1, affix2 = parts
        if affix1.count('X') != 1 or affix2.count('X') != 1:
            continue
        stem1 = _strip_affix_pattern(a1, affix1)
        if stem1 is None:
            continue
        if stem1 == _strip_affix_pattern(a2, affix2):
            return True
    return False

def edge_compare(e1, e2):
    """Tell whether two edge attribute dicts carry the same label up to the first '$'."""
    head1, head2 = (attrs['label'].split('$')[0] for attrs in (e1, e2))
    return head1 == head2

def inner_prediction(G_parent, G_child):
    """Propose, for a parent graph, counterparts of the child nodes it lacks.

    `G_parent` is searched as a subgraph of `G_child` (nodes must have equal
    'label' attributes, edges are compared with `edge_compare`). The nodes of
    `G_child` left out of the LAST mapping found are the "missing" nodes; for
    each of them a counterpart label is predicted for the parent family.

    Returns a string made of 'label (frequency), ' items and/or 'XX, ' markers
    (note the trailing ', '), or '' when no mapping exists or no node is left
    out.

    NOTE(review): `demonette_lexemes` is not defined anywhere in the visible
    part of the notebook, and `glawi_constructions` (used by affix_in_glawi)
    is only loaded in a later cell - confirm both exist before this runs.
    """
    GM = isomorphism.DiGraphMatcher(G_child, G_parent, node_match=lambda v1,v2: v1['label'] == v2['label'], edge_match=edge_compare)
    node_diff = set()
    matching_count = 0
    for subgraph in GM.subgraph_isomorphisms_iter():
        # subgraph => {'héraclitien_Adj': 'élyséen_Adj', 'Héraclite_Npx': 'Elysée_Npx', 'héraclitéen_Nm': 'élyséen_Nm'}
        matching_count += 1
        # Child nodes that are not keys of this mapping; overwritten at each
        # iteration, so only the last mapping counts after the loop.
        node_diff = G_child.nodes - subgraph
    # node_diff => {'héraclitéen_Adj'}
#         if matching_count > 1:
#             return '??'
    ret_str = ''
    # `subgraph` below is the last mapping yielded above; when there was none,
    # node_diff is empty and this loop body never runs.
    for n in node_diff:
        # Look for a mapped child node sharing the lexeme of the missing node.
        homograph = ''
        for key in subgraph:
            if key.split('_')[0] == n.split('_')[0]:
                homograph = key
                break
        if homograph != '':
            # `key` still equals `homograph` because of the break: reuse the
            # lexeme of its parent counterpart with the category of n.
            pred = subgraph.get(key).split('_')[0] + '_' + n.split('_')[1]
            ret_str += pred + ' (' + str(get_frequency(pred)) +  '), '
        else:
            freq_cow = 0
            pred = ''
            exists = False
            # Try each mapped pair as the analogy base and stop at the first
            # candidate that is attested by one of the three checks below.
            for key in subgraph:
                pred = generate_lexeme(key, n, subgraph.get(key))
                if pred == 'XX':
                    continue
                freq_cow = get_frequency(pred)
                affix_exists = affix_in_glawi(subgraph.get(key).split('_')[0], pred.split('_')[0])
                if freq_cow > 0 or pred.split('_')[0] in demonette_lexemes or affix_exists:
                    exists = True
                    break
            # After the loop, `pred` and `freq_cow` hold the values left by the
            # last iterations; an unattested, non-'XX' candidate adds nothing.
            if freq_cow > 0 or exists:
                ret_str += pred + ' (' + str(freq_cow) + '), '
            elif pred == 'XX':
                ret_str += 'XX, '
    return ret_str

def _load_pickled_graph(path):
    """Unpickle one graph file and close its handle immediately."""
    # NOTE: pickle executes arbitrary code on load; only use on files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def prediction(G_parent, C_child):
    """Predict the lexemes missing from `G_parent` with respect to concept `C_child`.

    Parameters:
        G_parent: graph of a family belonging to the parent concept.
        C_child: vertex id of the child concept in the lattice `L`.

    The graphs of the objects introduced by the child are tried first; the
    first result of `inner_prediction` without an 'XX' marker is returned,
    stripped of its trailing ', '. Otherwise the objects introduced by the
    descendants of the child are tried, each restricted to the last embedding
    of the last child graph loaded.

    Returns the prediction string, or '??' when nothing usable is found. This
    includes the cases where the child introduces no object or a descendant
    graph does not embed the child graph; both previously raised a NameError
    or silently reused a stale mapping.
    """
    G_child = None
    for e in L.nodes[C_child]['introduced_extent']:
        G_child = _load_pickled_graph(join(binary_dir, graph_binary_filenames[int(e)]))
        ret_str = inner_prediction(G_parent, G_child)
        if 'XX' not in ret_str:
            return ret_str[:-2]
    if G_child is None:
        # No reference child graph to embed into the descendants.
        return '??'
    # search grandchildren
    for c in nx.descendants(L, C_child):
        for e in L.nodes[c]['introduced_extent']:
            G_grchild = _load_pickled_graph(join(binary_dir, graph_binary_filenames[int(e)]))
            GM = isomorphism.DiGraphMatcher(G_grchild, G_child, node_match=lambda v1, v2: v1['label'] == v2['label'], edge_match=edge_compare)
            last_mapping = None
            for last_mapping in GM.subgraph_isomorphisms_iter():
                pass  # keep only the last embedding, as before
            if last_mapping is None:
                continue
            ret_str = inner_prediction(G_parent, G_grchild.subgraph(last_mapping))
            if 'XX' not in ret_str:
                return ret_str[:-2]
    return '??'

In [None]:
edge_counter = 0
L_size = L.size()
binary_dir = 'DG-graph-binary'
# A child concept is examined only if it keeps at least this share of the parent's extent...
MIN_EXT_RATIO = 0.8
# ...and if its sample graph has at most this many more nodes than the parent's sample graph.
MAX_NODE_COUNT_DIFF = 1
LATTICE_COLUMNS = ['concept_pair', 'parent_ext', 'child_ext', 'ext_ratio', 'ante_node_count',
                   'cons_node_count', 'missing_node_count', 'parent_proper_extent', 'parent', 'child', 'missing']


def _load_family_graph(extent_index):
    """Unpickle the graph whose position in `graph_binary_filenames` is `extent_index`."""
    # NOTE: pickle executes arbitrary code on load; only use on files produced by this project.
    with open(join(binary_dir, graph_binary_filenames[int(extent_index)]), 'rb') as handle:
        return pickle.load(handle)


def _graph_hyperlink(extent_index, label_suffix=''):
    """Build the Excel HYPERLINK formula pointing at the graph file of one family."""
    name = graph_filenames[int(extent_index)]
    return '=HYPERLINK("DG-graph\\' + name + '", "' + name + label_suffix + '")'


def _last_embedding(G_host, G_pattern):
    """Return the last embedding of `G_pattern` found in `G_host`, or None if there is none."""
    GM = isomorphism.DiGraphMatcher(G_host, G_pattern, node_match=lambda v1, v2: v1['label'] == v2['label'], edge_match=edge_compare)
    mapping = None
    for mapping in GM.subgraph_isomorphisms_iter():
        pass
    return mapping


# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame at each call.
lattice_rows = []
for vertex in vertices:
    current_extent_size = int(L.nodes[vertex]['extent_size'])
    # One graph introduced by the parent concept serves as its representative.
    one_G_parent = _load_family_graph(L.nodes[vertex]['introduced_extent'][0])
    current_graph_len = len(one_G_parent)
    for child in L.neighbors(vertex):
        edge_counter += 1
        printProgressBar(edge_counter, L_size, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)
        child_extent_size = int(L.nodes[child]['extent_size'])
        ext_ratio = child_extent_size / current_extent_size
        if ext_ratio < MIN_EXT_RATIO:
            continue
        one_introduced_extent = L.nodes[child]['introduced_extent'][0]
        child_graph_len = len(_load_family_graph(one_introduced_extent))
        node_count_diff = child_graph_len - current_graph_len
        if node_count_diff > MAX_NODE_COUNT_DIFF:
            continue
        edge_only = node_count_diff == 0

        # Values shared by every row produced for this (parent, child) edge.
        common = {
            'concept_pair': L.nodes[vertex]['concept_number'] + '-' + L.nodes[child]['concept_number'],
            'parent_ext': current_extent_size,
            'child_ext': child_extent_size,
            'ext_ratio': ext_ratio,
            'ante_node_count': current_graph_len,
            'cons_node_count': child_graph_len,
            'missing_node_count': node_count_diff,
            'child': _graph_hyperlink(one_introduced_extent),
        }

        # Families introduced by the parent concept itself.
        introduced_extent = set(L.nodes[vertex]['introduced_extent'])
        for e in introduced_extent:
            if edge_only:
                missing = '--missing edge only--'
            else:
                missing = prediction(_load_family_graph(e), child)
            lattice_rows.append(dict(common,
                                     parent=_graph_hyperlink(e, ' **'),
                                     parent_proper_extent='y',
                                     missing=missing))

        # Families of the parent's full extent that are neither in the child nor introduced by the parent.
        extent_diff = full_extent_dict[vertex] - full_extent_dict[child] - introduced_extent
        for e in extent_diff:
            if edge_only:
                missing = '--missing edge only--'
            else:
                G_family = _load_family_graph(e)
                embedding = _last_embedding(G_family, one_G_parent)
                # Without an embedding there is no sub-family to predict from;
                # the old code reused a stale mapping or raised a NameError here.
                missing = '??' if embedding is None else prediction(G_family.subgraph(embedding), child)
            lattice_rows.append(dict(common,
                                     parent=_graph_hyperlink(e),
                                     parent_proper_extent='n',
                                     missing=missing))

df_lattice = pd.DataFrame(lattice_rows, columns=LATTICE_COLUMNS)

In [None]:
# Write the collected rows to an Excel workbook, without the DataFrame index column.
df_lattice.to_excel('DG-missing.xlsx', index=False)

# 2. Proposing new lexemes using Glawinette's patterns

In [None]:
# Directory holding one pickled graph per family.
binary_dir = 'DG-graph-binary'
# Regular files only, in sorted order.
input_files = sorted(name for name in listdir(binary_dir) if isfile(join(binary_dir, name)))
print(len(input_files), 'families')

In [None]:
# One construction per line of the file, with the line feed removed.
with codecs.open('glawi-constructions.txt', 'r', encoding='utf-8') as f:
    glawi_constructions = [line.strip('\n') for line in f]

# Third tab-separated field of every row, skipping the header row whose first field is 'lid'.
lexemes_in_demTable = set()
with codecs.open('lexemes.csv', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split('\t')
        if fields[0] != 'lid':
            lexemes_in_demTable.add(fields[2])
print(len(lexemes_in_demTable), 'lexemes in Demonette\'s table of lexemes')

In [None]:
def match(pattern, word):  # pattern = 'preXisation', word = 'precognisation' => True
    """Tell whether `word` fits an affix pattern in which 'X' stands for the stem.

    The text before the first 'X' must start the word and the text after the
    last 'X' must end it. The stem may be empty, but prefix and suffix are not
    allowed to overlap: 'aXa' does not match 'a'.

    Returns True for the bare pattern 'X'; False for a pattern without 'X'.
    """
    if pattern == 'X':
        return True
    first_x = pattern.find('X')
    if first_x == -1:
        return False
    prefix = pattern[:first_x]
    suffix = pattern[pattern.rfind('X') + 1:]
    # Xtractif & actif: the word is too short to hold both affixes.
    if len(word) < len(prefix) + len(suffix):
        return False
    return word.startswith(prefix) and word.endswith(suffix)

## 2.1. Generation of lexemes and their frequency

In [None]:
# Highest frequency seen for each lexeme, whatever its category.
cow_dict = dict()
with codecs.open('frequencies-frcowvec.csv', 'r', encoding='utf-8') as f:
    for line in f:
        if '_PUN' in line:
            continue
        fields = line.replace('"', '').strip('\n').split(',')
        raw_freq = fields[-1]
        if raw_freq == 'freq':  # header row
            continue
        # The label may itself contain commas: everything but the last field belongs to it.
        label = ','.join(fields[:-1])
        # Drop the category, i.e. whatever follows the last underscore.
        lexeme = '_'.join(label.split('_')[:-1])
        freq = int(raw_freq)
        if lexeme not in cow_dict or cow_dict[lexeme] < freq:
            cow_dict[lexeme] = freq
print(len(cow_dict))

In [None]:
def generate_lexemes_and_freq(nodes):
    """Propose new lexemes for a family, together with their frequency.

    Parameters:
        nodes: iterable of `lexeme_category` labels of one family.

    For each lexeme of the family, the longest first pattern of
    `glawi_constructions` that matches it is selected; every construction
    starting with that pattern then yields a candidate by substituting the
    stem into its second pattern. A candidate is kept when it is new to the
    family, present in `lexemes_in_demTable` and present in `cow_dict`.

    Returns a list of (lexeme, frequency) tuples without duplicates.
    Constructions that are not of the form 'A-B', or whose selected first
    pattern does not contain exactly one 'X', are skipped instead of raising
    ValueError (same tolerance as affix_in_glawi).
    """
    existing_lexemes = {n.split('_')[0] for n in nodes}
    return_list = list()
    generated = set()
    for lexeme in existing_lexemes:
        # Longest matching left-hand pattern; the first one wins on ties.
        best_const_length = -1
        best_const = ''
        for c in glawi_constructions:
            const1 = c.split('-')[0]
            if len(const1) > best_const_length and match(const1, lexeme):
                best_const_length = len(const1)
                best_const = const1
        for c in glawi_constructions:
            parts = c.split('-')
            if len(parts) != 2:
                continue
            const1, const2 = parts
            if const1 != best_const or const1.count('X') != 1:
                continue
            prefix, postfix = const1.split('X')
            stem = lexeme.replace(prefix, '', 1)
            if postfix:  # if not empty
                stem = ''.join(stem.rsplit(postfix, 1))
            new_lexeme = const2.replace('X', stem)
            if new_lexeme in generated or new_lexeme in existing_lexemes:
                continue
            if new_lexeme in lexemes_in_demTable and new_lexeme in cow_dict:
                return_list.append((new_lexeme, cow_dict[new_lexeme]))
                generated.add(new_lexeme)
    return return_list

In [None]:
# Write one line per family: file name, tab, list of (lexeme, frequency)
# sorted by decreasing frequency.
proposed_lexemes = set()
# Context managers make sure the output file is flushed and closed even if a
# family fails, and that every pickle handle is released right after loading.
with codecs.open('DG_propositions_and_freq.txt', 'w', encoding='utf-8') as output_file:
    for input_file in input_files:
        print(input_file, end='\r')
        # NOTE: pickle executes arbitrary code on load; only use on files produced by this project.
        with open(join(binary_dir, input_file), 'rb') as graph_file:
            G = pickle.load(graph_file)
        prop_and_freq = generate_lexemes_and_freq(G.nodes())
        prop_and_freq.sort(key=lambda x: x[1], reverse=True)
        output_file.write(input_file + '\t' + str(prop_and_freq) + '\n')

## 2.2. Generation of lexemes and their cosine similarity

In [None]:
# Lexemes that have a vector: first token of each line, cut at its first underscore.
lexemes_in_bow = set()
with codecs.open('lemma-A-pos-bow.txt', 'r', encoding='utf-8') as f:
    for line in f:
        lexeme = line.split()[0].split('_')[0]
        lexemes_in_bow.add(lexeme)
print(len(lexemes_in_bow), 'lexemes have distribution vectors')

# Lexemes that occur as a node in at least one family graph.
lexemes_in_demonette_families = set()
for file in input_files:
    # Close each pickle handle right after loading instead of leaving it to the garbage collector.
    # NOTE: pickle executes arbitrary code on load; only use on files produced by this project.
    with open(join(binary_dir, file), 'rb') as graph_file:
        G = pickle.load(graph_file)
    for n in G.nodes():
        lexemes_in_demonette_families.add(n.split('_')[0])
print(len(lexemes_in_demonette_families), 'lexemes in Demonette')

In [None]:
def generate_lexemes(nodes):
    """Propose new lexemes for a family.

    Parameters:
        nodes: iterable of `lexeme_category` labels of one family.

    For each lexeme of the family, the longest first pattern of
    `glawi_constructions` that matches it is selected; every construction
    starting with that pattern then yields a candidate by substituting the
    stem into its second pattern. A candidate is kept when it is new to the
    family and present in both `lexemes_in_bow` and `lexemes_in_demTable`.

    Returns the set of proposed lexemes. Constructions that are not of the
    form 'A-B', or whose selected first pattern does not contain exactly one
    'X', are skipped instead of raising ValueError (same tolerance as
    affix_in_glawi).
    """
    existing_lexemes = {n.split('_')[0] for n in nodes}
    return_set = set()
    for lexeme in existing_lexemes:
        # Longest matching left-hand pattern; the first one wins on ties.
        best_const_length = -1
        best_const = ''
        for c in glawi_constructions:
            const1 = c.split('-')[0]
            if len(const1) > best_const_length and match(const1, lexeme):
                best_const_length = len(const1)
                best_const = const1
        for c in glawi_constructions:
            parts = c.split('-')
            if len(parts) != 2:
                continue
            const1, const2 = parts
            if const1 != best_const or const1.count('X') != 1:
                continue
            prefix, postfix = const1.split('X')
            stem = lexeme.replace(prefix, '', 1)
            if postfix:  # if not empty
                stem = ''.join(stem.rsplit(postfix, 1))
            new_lexeme = const2.replace('X', stem)
            if new_lexeme in return_set or new_lexeme in existing_lexemes:
                continue
            if new_lexeme in lexemes_in_bow and new_lexeme in lexemes_in_demTable:
                return_set.add(new_lexeme)
    return return_set

### Generation of lexemes

In [None]:
# Write one line per family: file name, tab, repr of the set of proposed
# lexemes. All propositions are also accumulated in `proposed_lexemes`.
proposed_lexemes = set()
# Context managers make sure the output file is flushed and closed even if a
# family fails, and that every pickle handle is released right after loading.
with codecs.open('DG_propositions.txt', 'w', encoding='utf-8') as output_file:
    for input_file in input_files:
        print(input_file, end='\r')
        # NOTE: pickle executes arbitrary code on load; only use on files produced by this project.
        with open(join(binary_dir, input_file), 'rb') as graph_file:
            G = pickle.load(graph_file)
        propositions = generate_lexemes(G.nodes())
        proposed_lexemes.update(propositions)
        output_file.write(input_file + '\t' + str(propositions) + '\n')

In [None]:
# Number of distinct lexemes proposed over all families.
print(len(proposed_lexemes), 'new lexemes')

### Calculation of max and avg of cosine similarity for each family

In [None]:
# Keep the vectors of the lexemes that belong to a family; vectors of
# proposed lexemes are not loaded at this stage.
vector_dict = dict()
counter = 0
with codecs.open('lemma-A-pos-bow.txt', 'r', encoding='utf-8') as f:
    for line in f:
        tokens = line.split()
        lexeme = tokens[0].split('_')[0]
        if lexeme in lexemes_in_demonette_families:
            # Everything after the first token is a vector component.
            vector_dict[lexeme] = np.array([float(component) for component in tokens[1:]])
        print(counter, end='\r')
        counter += 1
print(counter, 'vectors in bow')
print(len(vector_dict), 'vectors kept')

In [None]:
# For every family, write the maximum and average cosine similarity over all
# pairs of nodes with different lexemes that both have a vector.
with codecs.open('DG_family_cosine.txt', 'w', encoding='utf-8') as output_file:
    output_file.write('familyID\tmax_cosine\tavg_cosine\tlexemes\n')
    for file in input_files:
        # NOTE: pickle executes arbitrary code on load; only use on files produced by this project.
        with open(join(binary_dir, file), 'rb') as graph_file:
            G = pickle.load(graph_file)
        nodes = list(G.nodes())
        max_cos = -3  # below any possible cosine
        comparison_counter = 0
        total_cosine = 0
        for n1 in range(len(nodes) - 1):
            lex1 = nodes[n1].split('_')[0]
            # The inner range must run up to len(nodes): stopping at
            # len(nodes) - 1 left the last node out of every comparison.
            for n2 in range(n1 + 1, len(nodes)):
                lex2 = nodes[n2].split('_')[0]
                if lex1 == lex2:
                    continue
                if lex1 not in vector_dict or lex2 not in vector_dict:
                    continue  # no vector for one of the two lexemes
                vec1 = vector_dict[lex1]
                vec2 = vector_dict[lex2]
                cosine_similarity = dot(vec1, vec2)/(norm(vec1)*norm(vec2))
                if cosine_similarity > max_cos:
                    max_cos = cosine_similarity
                total_cosine += cosine_similarity
                comparison_counter += 1
        if comparison_counter == 0:
            output_file.write(file + '\t?\t0\t' + str(nodes) + '\n')
        else:
            output_file.write(file + '\t' + str(round(max_cos, 2)) + '\t' + str(round(total_cosine/comparison_counter, 2))
                              + '\t' + str(nodes) + '\n')
        print(file, end='\r')

### Calculation of cosine similarity for each new lexeme

In [None]:
# Rebuild the set of proposed lexemes from the file written earlier, so this
# part can run without regenerating the propositions.
proposed_lexemes = set()
with codecs.open('DG_propositions.txt', 'r', encoding='utf-8') as f:
    for line in f:
        elements = line.split('\t')
        serialized = elements[1].strip()
        # The second field is the repr of a set of strings. literal_eval only
        # accepts literals (unlike eval); the empty set is spelled 'set()',
        # which older Python versions do not accept as a literal.
        propositions = set() if serialized == 'set()' else ast.literal_eval(serialized)
        proposed_lexemes.update(propositions)
        print(elements[0], end='\r')

In [None]:
# Reload the vectors, this time keeping both the lexemes of the families and
# the proposed lexemes.
vector_dict = dict()
counter = 0
with codecs.open('lemma-A-pos-bow.txt', 'r', encoding='utf-8') as f:
    for line in f:
        tokens = line.split()
        lexeme = tokens[0].split('_')[0]
        if lexeme in lexemes_in_demonette_families or lexeme in proposed_lexemes:
            # Everything after the first token is a vector component.
            vector_dict[lexeme] = np.array([float(component) for component in tokens[1:]])
        print(counter, end='\r')
        counter += 1
print(counter, 'vectors in bow')
print(len(vector_dict), 'vectors kept')

In [None]:
# For every proposed lexeme, compute the maximum and average cosine similarity
# with the nodes of its family that have a vector, and write one line per
# family: file name, tab, list of (lexeme, max, avg) sorted by decreasing max.
with codecs.open('DG_propositions_and_cosine.txt', 'w', encoding='utf-8') as output_file, \
        codecs.open('DG_propositions.txt', 'r', encoding='utf-8') as f:
    for line in f:
        elements = line.split('\t')
        output_file.write(elements[0] + '\t')
        serialized = elements[1].strip()
        # The second field is the repr of a set of strings. literal_eval only
        # accepts literals (unlike eval); the empty set is spelled 'set()',
        # which older Python versions do not accept as a literal.
        propositions = set() if serialized == 'set()' else ast.literal_eval(serialized)
        prop_and_cos = list()
        if propositions:
            # Load the family graph once per family, not once per proposition.
            # NOTE: pickle executes arbitrary code on load; only use on files produced by this project.
            with open(join(binary_dir, elements[0]), 'rb') as graph_file:
                G = pickle.load(graph_file)
            family_lexemes = [n.split('_')[0] for n in G.nodes()]
        for p in propositions:
            total_cos = 0
            count_cos = 0
            max_cos = -2  # below any possible cosine
            vec1 = vector_dict[p]
            for family_lexeme in family_lexemes:
                if family_lexeme not in vector_dict:
                    continue  # no vector for this node
                vec2 = vector_dict[family_lexeme]
                cosine_similarity = dot(vec1, vec2)/(norm(vec1)*norm(vec2))
                if cosine_similarity > max_cos:
                    max_cos = cosine_similarity
                total_cos += cosine_similarity
                count_cos += 1
            if count_cos == 0:
                prop_and_cos.append((p, 0, 0))
            else:
                prop_and_cos.append((p, round(max_cos, 2), round(total_cos/count_cos, 2)))
        if len(prop_and_cos) > 0:
            prop_and_cos.sort(key=lambda x: x[1], reverse=True)
        output_file.write(str(prop_and_cos) + '\n')
        print(elements[0], end='\r')