In [None]:
import codecs
import glob
import networkx as nx
import numpy as np
import os
import pandas as pd
import pickle
import re
from difflib import SequenceMatcher
from numpy import dot
from numpy.linalg import norm
from os import listdir
from os.path import isfile, join
from networkx.algorithms import isomorphism
from networkx.drawing.nx_pydot import write_dot
from utils import printProgressBar

In [None]:
# Directory scanned for the pickled family graphs; every regular file in it
# is counted as one family.
binary_dir = 'DG-graph-binary'
input_files = sorted(
    entry for entry in listdir(binary_dir) if isfile(join(binary_dir, entry))
)
print(len(input_files), 'families')

In [None]:
# One construction per line of the file; only the trailing newline is removed.
# Later cells split each entry on '-' into a source and a target pattern.
glawi_constructions = []
with codecs.open('glawi-constructions.txt', 'r', encoding='utf-8') as f:
    glawi_constructions.extend(line.strip('\n') for line in f)

In [None]:
def match(pattern, word):
    """Tell whether ``word`` fits the construction ``pattern``.

    ``pattern`` contains the placeholder ``X`` standing for the stem, e.g.
    ``match('preXisation', 'precognisation')`` is True. The text before the
    first ``X`` must be a prefix of ``word`` and the text after the last ``X``
    must be a suffix of it. ``X`` may stand for an empty stem.

    Args:
        pattern: construction string such as ``'Xer'`` or ``'preXisation'``.
        word: lexeme to test against the pattern.

    Returns:
        True if ``word`` matches, False otherwise. A pattern without any
        ``X`` never matches.
    """
    if 'X' not in pattern:
        return False
    prefix = pattern.partition('X')[0]
    suffix = pattern.rpartition('X')[2]
    # The prefix and suffix must not overlap inside the word: without this
    # check 'abXba' would accept 'aba', and stripping both affixes from such
    # a word yields a meaningless stem.
    if len(word) < len(prefix) + len(suffix):
        return False
    return word.startswith(prefix) and word.endswith(suffix)

# generate propositions and their frequency

In [None]:
# Maps each lexeme to the highest frequency seen for it in the CSV.
# The last comma-separated field of a row is the frequency; everything before
# it is re-joined with ',' and the part after its last '_' is dropped
# (presumably the category tag -- confirm against the file format).
cow_dict = {}
with codecs.open('frequencies-frcowvec.csv', 'r', encoding='utf-8') as f:
    for line in f:
        # Rows mentioning '_PUN' are skipped entirely.
        if '_PUN' in line:
            continue
        fields = line.replace('"', '').strip('\n').split(',')
        raw_freq = fields[-1]
        if raw_freq == 'freq':  # header row
            continue
        freq = int(raw_freq)
        tagged_lexeme = ','.join(fields[:-1])
        lexeme = '_'.join(tagged_lexeme.split('_')[:-1])
        # Keep the maximum when the same lexeme appears on several rows.
        if lexeme not in cow_dict or cow_dict[lexeme] < freq:
            cow_dict[lexeme] = freq
print(len(cow_dict))

In [None]:
# Disabled alternative: collect the lexemes of the filtered frequency file.
# lexemes_in_cow = set()
# with codecs.open('frequencies-frcowvec-filtered.csv', 'r', encoding='utf-8') as f:
#     for line in f:
#         elements = line.strip('\n').split(',')
#         if elements[1] == 'freq':
#             continue
#         lexemes_in_cow.add(elements[0].split('_')[0])
# print(len(lexemes_in_cow), 'lexemes in cow')

# Third tab-separated column of every non-header row of lexemes.csv.
# NOTE(review): the line is not stripped before splitting, so if the file has
# exactly three columns the stored values end with '\n' -- confirm.
lexemes_in_demTable = set()
with codecs.open('lexemes.csv', 'r', encoding='utf-8') as f:
    for line in f:
        columns = line.split('\t')
        if columns[0] != 'lid':  # 'lid' marks the header row
            lexemes_in_demTable.add(columns[2])
print(len(lexemes_in_demTable), 'lexemes in Demonette\'s table of lexemes')

In [None]:
def generate_lexemes_and_freq(nodes):
    """Propose new lexemes for a family, each paired with its frequency.

    For every lexeme of the family (the part of a node name before the first
    '_'), the longest matching source pattern of ``glawi_constructions`` is
    selected (the first one wins on ties). Every construction sharing that
    source pattern is then applied to the lexeme's stem. A candidate is kept
    when it is not already in the family, has not been proposed yet, and is
    known to ``lexemes_in_demTable`` or ``cow_dict``.

    Args:
        nodes: iterable of node names of one family.

    Returns:
        List of ``(lexeme, frequency)`` tuples; the frequency comes from
        ``cow_dict`` and defaults to 0.
    """
    family_lexemes = {node.split('_')[0] for node in nodes}
    proposals = []
    already_proposed = set()
    for lexeme in family_lexemes:
        # First pass: find the longest source pattern that fits the lexeme.
        longest = ''
        longest_len = -1
        for construction in glawi_constructions:
            source_side = construction.split('-')[0]
            if len(source_side) > longest_len and match(source_side, lexeme):
                longest, longest_len = source_side, len(source_side)
        # Second pass: apply every construction that has that source pattern.
        for construction in glawi_constructions:
            source_side, target_side = construction.split('-')
            if source_side != longest:
                continue
            prefix, suffix = source_side.split('X')
            # Strip the affixes of the source pattern to recover the stem.
            stem = lexeme.replace(prefix, '', 1)
            if suffix:
                stem = ''.join(stem.rsplit(suffix, 1))
            candidate = target_side.replace('X', stem)
            unseen = (candidate not in already_proposed
                      and candidate not in family_lexemes)
            if unseen and (candidate in lexemes_in_demTable
                           or candidate in cow_dict):
                proposals.append((candidate, cow_dict.get(candidate, 0)))
                already_proposed.add(candidate)
    return proposals

In [None]:
# Write, for every family file, its proposed lexemes sorted by decreasing
# frequency: one line per family, '<file name>\t<list of (lexeme, freq)>'.
# NOTE(review): pickle.load can execute arbitrary code; only run this on
# family files from a trusted source.
proposed_lexemes = set()
with codecs.open('DG_propositions_and_freq.txt', 'w', encoding='utf-8') as output_file:
    for input_file in input_files:
        print(input_file, end='\r')
        # Close each graph file right after loading instead of leaking the handle.
        with open(join(binary_dir, input_file), 'rb') as graph_file:
            G = pickle.load(graph_file)
        prop_and_freq = generate_lexemes_and_freq(G.nodes())
        prop_and_freq.sort(key=lambda x: x[1], reverse=True)
        output_file.write(input_file + '\t' + str(prop_and_freq) + '\n')

# generate propositions and their cosine similarity

In [None]:
# Lexemes that have a distribution vector: the first whitespace-separated
# token of each line, cut at its first '_'.
lexemes_in_bow = set()
with codecs.open('lemma-A-pos-bow.txt', 'r', encoding='utf-8') as f:
    for line in f:
        lexeme = line.split()[0].split('_')[0]
        lexemes_in_bow.add(lexeme)
print(len(lexemes_in_bow), 'lexemes have distribution vectors')

# Lexemes present in any family graph (node name cut at its first '_').
# NOTE(review): pickle.load can execute arbitrary code; only run this on
# family files from a trusted source.
lexemes_in_demonette_families = set()
for file in input_files:
    # Close each graph file right after loading instead of leaking the handle.
    with open(join(binary_dir, file), 'rb') as graph_file:
        G = pickle.load(graph_file)
    for n in G.nodes():
        lexemes_in_demonette_families.add(n.split('_')[0])
print(len(lexemes_in_demonette_families), 'lexemes in Demonette')

In [None]:
def generate_lexemes(nodes):
    """Propose new lexemes for a family, restricted to those with a vector.

    For every lexeme of the family (the part of a node name before the first
    '_'), the longest matching source pattern of ``glawi_constructions`` is
    chosen (the first one wins on ties). Each construction with that source
    pattern is applied to the lexeme's stem. A candidate is kept when it is
    not already in the family and belongs to ``lexemes_in_bow``.

    Args:
        nodes: iterable of node names of one family.

    Returns:
        Set of proposed lexemes.
    """
    family_lexemes = {node.split('_')[0] for node in nodes}
    proposals = set()
    for lexeme in family_lexemes:
        # Select the longest source pattern the lexeme fits.
        chosen_source = ''
        chosen_length = -1
        for rule in glawi_constructions:
            source = rule.split('-')[0]
            if len(source) > chosen_length and match(source, lexeme):
                chosen_source = source
                chosen_length = len(source)
        # Derive one candidate per construction using that source pattern.
        for rule in glawi_constructions:
            source, target = rule.split('-')
            if source != chosen_source:
                continue
            prefix, suffix = source.split('X')
            stem = lexeme.replace(prefix, '', 1)
            if suffix:
                stem = ''.join(stem.rsplit(suffix, 1))
            candidate = target.replace('X', stem)
            if candidate in proposals or candidate in family_lexemes:
                continue
            if candidate in lexemes_in_bow:
                proposals.add(candidate)
    return proposals

## generate propositions

In [None]:
# Write the proposed lexemes of every family, one line per family as
# '<file name>\t<set of lexemes>', and collect all of them in proposed_lexemes.
# NOTE(review): pickle.load can execute arbitrary code; only run this on
# family files from a trusted source.
proposed_lexemes = set()
with codecs.open('DG_propositions.txt', 'w', encoding='utf-8') as output_file:
    for input_file in input_files:
        print(input_file, end='\r')
        # Close each graph file right after loading instead of leaking the handle.
        with open(join(binary_dir, input_file), 'rb') as graph_file:
            G = pickle.load(graph_file)
        propositions = generate_lexemes(G.nodes())
        proposed_lexemes.update(propositions)
        output_file.write(input_file + '\t' + str(propositions) + '\n')

In [None]:
# Size of the proposed_lexemes set, i.e. how many distinct lexemes are proposed.
print(len(proposed_lexemes))

## max and average of cosine similarity in each family

In [None]:
# Load the vectors of the lexemes that occur in a Demonette family.
# Each line is '<token> <float> <float> ...'; the key is the token cut at its
# first '_'. A lexeme seen on several lines keeps the last vector read.
vector_dict = {}
counter = 0
with codecs.open('lemma-A-pos-bow.txt', 'r', encoding='utf-8') as f:
    for line in f:
        tokens = line.split()
        lexeme = tokens[0].split('_')[0]
        if lexeme in lexemes_in_demonette_families:  # or lexeme in proposed_lexemes:
            vector_dict[lexeme] = np.array([float(value) for value in tokens[1:]])
        print(counter, end='\r')  # progress: index of the line just processed
        counter += 1
print(counter, 'vectors in bow')
print(len(vector_dict), 'vectors kept')

In [None]:
# For every family, compute the maximum and average cosine similarity over
# all pairs of nodes with different lexemes that both have a vector, and
# write one tab-separated row per family.
# NOTE(review): pickle.load can execute arbitrary code; only run this on
# family files from a trusted source.
with codecs.open('DG_family_cosine.txt', 'w', encoding='utf-8') as output_file:
    output_file.write('familyID\tmax_cosine\tavg_cosine\tlexemes\n')
    for file in input_files:
        # Close each graph file right after loading instead of leaking the handle.
        with open(join(binary_dir, file), 'rb') as graph_file:
            G = pickle.load(graph_file)
        max_cos = -3  # sentinel below the smallest possible cosine (-1)
        comparison_counter = 0
        total_cosine = 0
        nodes = list(G.nodes())
        for n1 in range(len(nodes) - 1):
            lex1 = nodes[n1].split('_')[0]
            # The inner bound is len(nodes), not len(nodes) - 1: the last node
            # must be paired too, otherwise two-node families are never compared.
            for n2 in range(n1 + 1, len(nodes)):
                lex2 = nodes[n2].split('_')[0]
                if lex1 == lex2:
                    continue
                # Pairs where either lexeme has no vector are skipped.
                if lex1 not in vector_dict or lex2 not in vector_dict:
                    continue
                vec1 = vector_dict[lex1]
                vec2 = vector_dict[lex2]
                cosine_similarity = dot(vec1, vec2) / (norm(vec1) * norm(vec2))
                if cosine_similarity > max_cos:
                    max_cos = cosine_similarity
                total_cosine += cosine_similarity
                comparison_counter += 1
        if comparison_counter == 0:
            # No comparable pair: '?' for the maximum and 0 for the average.
            output_file.write(file + '\t?\t0\t' + str(nodes) + '\n')
        else:
            output_file.write(file + '\t' + str(round(max_cos, 2)) + '\t'
                              + str(round(total_cosine / comparison_counter, 2))
                              + '\t' + str(nodes) + '\n')
        print(file, end='\r')

## calculate cosine for each proposition

In [None]:
# Rebuild proposed_lexemes from DG_propositions.txt, whose lines are
# '<family file>\t<printed collection of lexemes>'.
# NOTE(review): eval() executes whatever the file contains; if the file can be
# edited by anyone else, ast.literal_eval would be the safer parser.
proposed_lexemes = set()
with codecs.open('DG_propositions.txt', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split('\t')
        family_propositions = eval(fields[1])
        proposed_lexemes.update(family_propositions)
        print(fields[0], end='\r')

In [None]:
# Load the vectors of every lexeme that is either in a Demonette family or
# among the proposed lexemes. Each line is '<token> <float> <float> ...';
# the key is the token cut at its first '_'.
vector_dict = {}
counter = 0
with codecs.open('lemma-A-pos-bow.txt', 'r', encoding='utf-8') as f:
    for line in f:
        tokens = line.split()
        lexeme = tokens[0].split('_')[0]
        wanted = (lexeme in lexemes_in_demonette_families
                  or lexeme in proposed_lexemes)
        if wanted:
            vector_dict[lexeme] = np.array([float(value) for value in tokens[1:]])
        print(counter, end='\r')  # progress: index of the line just processed
        counter += 1
print(counter, 'vectors in bow')
print(len(vector_dict), 'vectors kept')

In [None]:
# For every family listed in DG_propositions.txt, score each proposed lexeme
# by its maximum and average cosine similarity to the family's nodes that
# have a vector, then write '<family file>\t<list of (lexeme, max, avg)>'
# sorted by decreasing maximum.
# NOTE(review): eval() and pickle.load both execute arbitrary content; only
# run this on files produced by this notebook / from a trusted source.
with codecs.open('DG_propositions_and_cosine.txt', 'w', encoding='utf-8') as output_file:
    with codecs.open('DG_propositions.txt', 'r', encoding='utf-8') as f:
        for line in f:
            elements = line.split('\t')
            output_file.write(elements[0] + '\t')
            propositions = eval(elements[1])

            # Load the family graph once per family (not once per proposition)
            # and gather the vectors, with their norms, of its nodes.
            family_vectors = []
            if propositions:
                with open(join(binary_dir, elements[0]), 'rb') as graph_file:
                    G = pickle.load(graph_file)
                for n in G.nodes():
                    vec2 = vector_dict.get(n.split('_')[0])
                    if vec2 is not None:  # nodes without a vector are skipped
                        family_vectors.append((vec2, norm(vec2)))

            prop_and_cos = list()
            for p in propositions:
                total_cos = 0
                count_cos = 0
                max_cos = -2  # sentinel below the smallest possible cosine (-1)
                vec1 = vector_dict[p]
                norm1 = norm(vec1)
                for vec2, norm2 in family_vectors:
                    cosine_similarity = dot(vec1, vec2) / (norm1 * norm2)
                    if cosine_similarity > max_cos:
                        max_cos = cosine_similarity
                    total_cos += cosine_similarity
                    count_cos += 1
                if count_cos == 0:
                    prop_and_cos.append((p, 0, 0))
                else:
                    prop_and_cos.append((p, round(max_cos, 2), round(total_cos / count_cos, 2)))
            prop_and_cos.sort(key=lambda x: x[1], reverse=True)
            output_file.write(str(prop_and_cos) + '\n')
            print(elements[0], end='\r')