In [None]:
import codecs
import glob
import os
import pickle
import re
import subprocess
from os import listdir
from os.path import isfile, join

import networkx as nx
import pandas as pd
from networkx.algorithms import isomorphism
from networkx.drawing.nx_pydot import write_dot

from utils import printProgressBar

# Zero-based column indices used to read the tab-separated glawinette rows.
lemma1, lemma2 = 0, 1
cat1, cat2 = 2, 3
origine_morpho, origine_def = 4, 5
BAP1, BAP2, BAPsize = 6, 7, 8
FAP1, FAP2, FAPsize = 9, 10, 11
radical, FAPtype = 12, 13

# Zero-based column indices used to read the tab-separated demonette rows.
graph_1, graph_2 = 3, 6
cat_1, cat_2 = 8, 10
cstr_1, cstr_2 = 14, 17
complexite = 19
orientation = 21
fichier_origine = 43

In [None]:
# Output directories written to by the later sections of this notebook.
folders_needed = ['DG-graph-binary', 'DG-contexts', 'DG-posets', 'DG-subposets']
for f in folders_needed:
    # exist_ok avoids the check-then-create race and raises immediately if the
    # path exists but is not a directory (instead of failing later on write).
    os.makedirs(f, exist_ok=True)

In [None]:
def FAPconverter(input_fap):
    """Turn a regex-like pattern string into a compact template.

    Every literal ``(.+)`` becomes ``X`` and every ``$`` and ``^`` character
    is removed; the rest of the string is returned unchanged.
    """
    converted = input_fap.replace('(.+)', 'X')
    for anchor in ('$', '^'):
        converted = converted.replace(anchor, '')
    return converted

def category_shortening(cat):
    """Collapse a fine-grained category tag into a coarse node label.

    Tags starting with ``Np`` map to ``'Np'`` (proper noun), other tags
    starting with ``N`` map to ``'N'`` (noun), except ``'Num'`` which is kept.
    Any other tag, including the empty string, is returned unchanged.

    Prefix tests are used instead of positional indexing so that an empty
    string or a bare ``'N'`` no longer raises ``IndexError``.
    """
    if cat == 'Num' or not cat.startswith('N'):
        return cat
    return 'Np' if cat.startswith('Np') else 'N'

header = ''
# Maps an ordered (lemma, lemma) pair to "<FAP of first>-<FAP of second>";
# every row is registered in both directions.
glawi_dict = {}
with codecs.open('glawinette-series.csv', 'r', encoding='utf-8') as series_file:
    for row_index, raw_row in enumerate(series_file):
        if row_index < 1:
            continue  # the first row is skipped (header line)
        fields = raw_row.replace('\n', '').replace(' ', '').split('\t')
        first_lemma, second_lemma = fields[lemma1], fields[lemma2]
        first_fap = FAPconverter(fields[FAP1])
        second_fap = FAPconverter(fields[FAP2])
        glawi_dict[(first_lemma, second_lemma)] = first_fap + '-' + second_fap
        glawi_dict[(second_lemma, first_lemma)] = second_fap + '-' + first_fap
print(len(glawi_dict))

# 1. Creation of binary file for each graph

In each binary file, every node also stores its lexeme's frequency, looked up in the COW frequency table (`frequencies-frcowvec.csv`).

In [None]:
# Category tag used in the family files -> category tag used in the frequency table.
frcowvec_categories = {'Nm': 'NOM', 'Nf': 'NOM', 'Nmp': 'NOM', 'Nfp': 'NOM', 'Nx': 'NOM', 'More': 'NOM',
                       'Npm': 'NAM', 'Npf': 'NAM', 'Npx': 'NAM', 'Npmp': 'NAM', 'Npfp': 'NAM',
                       'IJ': 'INT', 'Adj': 'ADJ', 'V': 'VER', 'Num': 'NUM', 'Pro': 'PRO', 'Adv': 'ADV'}

def frcowvec_cat_conversion(lexeme):
    """Rewrite the category suffix of a ``<lemma>_<category>`` key.

    The category is the text after the last underscore; it is replaced by its
    entry in ``frcowvec_categories`` when there is one and kept as is otherwise.

    Only the last underscore is treated as the separator, so a lemma that
    itself contains underscores is preserved in full (splitting on every
    underscore used to keep only its first fragment).  A string without any
    underscore is used both as lemma and as category.
    """
    parts = lexeme.rsplit('_', 1)
    old_cat = parts[-1]
    new_cat = frcowvec_categories.get(old_cat, old_cat)
    return parts[0] + '_' + str(new_cat)

# Frequency table indexed by its first CSV column. The graph-building cell looks
# lexemes up in this index (after frcowvec_cat_conversion) and reads the 'freq' column.
frequencies = pd.read_csv('frequencies-frcowvec.csv', header=0, index_col=0)
frequencies.shape

In [None]:
input_dir = 'DG-families'
input_files = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
if '.gitignore' in input_files:
    input_files.remove('.gitignore')
output_dir = 'DG-graph-binary'


def _lexeme_frequency(vertex):
    """Return the 'freq' value stored for ``vertex`` in ``frequencies``, or 0 if it is absent."""
    try:
        return frequencies.loc[frcowvec_cat_conversion(vertex)]['freq']
    except KeyError:
        return 0


def _edge_label(src_lemma, dst_lemma, src_cstr, dst_cstr, suffix=''):
    """Build the label of the edge src -> dst.

    The label is ``<src_cstr>-<dst_cstr><suffix>``; when the ordered lemma pair
    is known to ``glawi_dict`` its value is appended after a ``$`` separator.
    """
    label = src_cstr + '-' + dst_cstr + suffix
    fap_pair = glawi_dict.get((src_lemma, dst_lemma))
    if fap_pair is not None:
        label += '$' + fap_pair
    return label


# One directed graph per family file, pickled under the family id
# (the first whitespace-separated token of the file name).
for input_file in input_files:
    fam_id = input_file.split()[0]
    H = nx.DiGraph()
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            # The first two lines are skipped, as are blank lines (which would
            # otherwise fail on the column lookups below).
            if line_num < 2 or not line.strip():
                continue
            elements = line.replace('\n','').replace(' ','').split('\t')
            lemma_a, lemma_b = elements[graph_1], elements[graph_2]
            va = lemma_a + '_' + elements[cat_1]
            vb = lemma_b + '_' + elements[cat_2]
            # Only the first relation seen between two lexemes is kept.
            if H.has_edge(va, vb) or H.has_edge(vb, va):
                continue
            freq_a = _lexeme_frequency(va)
            freq_b = _lexeme_frequency(vb)
            H.add_node(va, label=category_shortening(elements[cat_1]), frequency=freq_a)
            H.add_node(vb, label=category_shortening(elements[cat_2]), frequency=freq_b)
            cstr_a, cstr_b = elements[cstr_1], elements[cstr_2]
            direction = elements[orientation]
            if direction in ('as2de', 'as2des'):
                H.add_edge(va, vb, label=_edge_label(lemma_a, lemma_b, cstr_a, cstr_b))
            elif direction in ('de2as', 'des2as'):
                H.add_edge(vb, va, label=_edge_label(lemma_b, lemma_a, cstr_b, cstr_a))
            else:
                # Any other orientation yields an edge in both directions, with
                # the orientation value appended to the label.
                suffix = '_' + direction
                H.add_edge(va, vb, label=_edge_label(lemma_a, lemma_b, cstr_a, cstr_b, suffix))
                H.add_edge(vb, va, label=_edge_label(lemma_b, lemma_a, cstr_b, cstr_a, suffix))
    # The context manager closes the file even if pickling fails.
    with open(join(output_dir, fam_id), 'wb') as graph_file:
        pickle.dump(H, graph_file)
    print(fam_id, end='\r')

# 2. Creation of formal context

In [None]:
# Sorted names of the pickled graphs, minus any entry listed in `ignored`.
input_dir = 'DG-graph-binary'
input_files = sorted(
    name for name in listdir(input_dir)
    if isfile(join(input_dir, name)) and name != '.gitignore'
)
ignored = []
for skipped in ignored:
    if skipped in input_files:
        input_files.remove(skipped)
print(len(ignored), 'ignored')
print(len(input_files), 'families')

In [None]:
# Number of nodes of every pickled graph, keyed by file name. It is used by the
# context-building cell to skip supergraph candidates that are too small.
# NOTE: pickle.load executes code embedded in the file; only load the graphs
# produced by section 1 of this notebook.
node_count_dict = dict()
for graph in input_files:
    # The context manager closes each handle (a bare open() would leak one per file).
    with open(join(input_dir, graph), 'rb') as graph_file:
        G2 = pickle.load(graph_file)
    node_count_dict[graph] = len(G2)

In [None]:
def edge_compare(e1, e2):
    """Tell whether two edge attribute dicts carry matching labels.

    When only ``e1``'s label contains a ``$``, the part of it before the first
    ``$`` is compared with ``e2``'s label; in every other case the two labels
    must be strictly equal.
    """
    first_label, second_label = e1['label'], e2['label']
    if '$' in first_label and '$' not in second_label:
        first_label = first_label.split('$')[0]
    return first_label == second_label

context_folder = 'DG-contexts'
# Number of leading groups to skip, presumably because an earlier run already
# wrote their chunk files. Set to 0 to compute every group.
# NOTE(review): the merge cell reads 'DG-context-1.0.csv', which this loop only
# writes when resume_from is below chunk_size.
resume_from = 2000
# A chunk file is written every `chunk_size` groups.
chunk_size = 1000

subgroup_prev = ''
context = pd.DataFrame()
counter = -1
input_files_count = len(input_files)
# One column per group (named after the group id with 'F' replaced by 'G'),
# one row per graph file: 1 when the group's first graph is subgraph-isomorphic
# to that file's graph (or belongs to the same group), 0 otherwise.
for subgraph in input_files:
    subgroup_id = subgraph.split('-')[0]
    if subgroup_id == subgroup_prev:
        continue
    # Updated before the resume test so that skipped groups are counted once
    # each, exactly like processed ones (the counter used to advance once per
    # *file* while skipping).
    subgroup_prev = subgroup_id
    counter += 1
    if counter < resume_from:
        continue
    if counter % chunk_size == 0 and counter > resume_from:
        # Flush the columns gathered so far and start a new chunk.
        context.index = input_files
        context.to_csv(join(context_folder, 'DG-context-' + str(counter/chunk_size) + '.csv'))
        context = pd.DataFrame()
    with open(join(input_dir, subgraph), 'rb') as graph_file:
        G2 = pickle.load(graph_file)
    G2_node_count = node_count_dict.get(subgraph)
    supergroup_prev = ''
    membership = [0] * input_files_count
    for counter2 in range(input_files_count-1, -1, -1):
        if membership[counter2] == 1:
            continue
        supergraph = input_files[counter2]
        supergroup_id = supergraph.split('-')[0]
        if supergroup_id == subgroup_id:
            membership[counter2] = 1
            continue
        if supergroup_id == supergroup_prev:
            # Same group as the file tested just before: reuse its result.
            membership[counter2] = membership[counter2+1]
            continue
        print(subgraph + ' ' + supergraph + '        ', end='\r')
        G1_node_count = node_count_dict.get(supergraph)
        # A graph with fewer nodes cannot contain G2, so it is not even loaded.
        if G1_node_count >= G2_node_count:
            with open(join(input_dir, supergraph), 'rb') as graph_file:
                G1 = pickle.load(graph_file)
            GM = isomorphism.DiGraphMatcher(G1, G2, node_match=lambda v1,v2: v1['label'] == v2['label'], edge_match=edge_compare)
            if GM.subgraph_is_isomorphic():
                membership[counter2] = 1
        supergroup_prev = supergroup_id
    membership = pd.Series(membership, name=subgroup_id.replace('F', 'G'))
    context = pd.concat([context, membership], axis=1)
if context.shape[1] == 0:
    # Every group was skipped; assigning the index to a frame without columns would fail.
    print('no group processed (resume_from=' + str(resume_from) + '); nothing written')
else:
    context.index = input_files
    context.to_csv(join(context_folder, 'DG-context-last.csv'))

In [None]:
context_files = [name for name in listdir(context_folder)
                 if isfile(join(context_folder, name))]
if '.gitignore' in context_files:
    context_files.remove('.gitignore')
input_files.sort()
context_files.sort()


def _without_ignored(frame):
    """Return ``frame`` without the rows whose index label is listed in ``ignored``."""
    for family in ignored:
        if family in frame.index:
            frame = frame.drop(index=family)
    return frame


# NOTE(review): the first chunk is read by name and the first sorted entry is
# skipped, so this assumes 'DG-context-1.0.csv' sorts first in the folder.
context_join = _without_ignored(
    pd.read_csv(join(context_folder, 'DG-context-1.0.csv'), header=0, index_col=0))
for chunk_name in context_files[1:]:
    chunk = _without_ignored(
        pd.read_csv(join(context_folder, chunk_name), header=0, index_col=0))
    # Chunks hold different columns, so they are joined side by side.
    context_join = pd.concat([context_join, chunk], axis=1)
print('context shape:', context_join.shape)

In [None]:
# Persist the merged context; sections 3 and 4 reload it from 'DG-context.csv'.
context_join.to_csv('DG-context.csv')

# 3. Calculation of AOC-poset

In [None]:
# Re-export the context with neither column header nor row index; this bare
# file is the one handed to AOCPosetBuilder through its -i option.
context_without_header = 'context_without_header.csv'
context_with_header = pd.read_csv('DG-context.csv', header=0, index_col=0)
context_with_header.to_csv(context_without_header, header=False, index=False)

In [None]:
def _build_aoc_poset(dot_path, output_format):
    """Run AOCPosetBuilder on the header-less context and write ``dot_path``.

    The arguments are the same as in the former shell command lines, passed as
    a list so that no shell parsing is involved.  ``check=True`` raises
    ``subprocess.CalledProcessError`` when the tool exits with a non-zero
    status, instead of silently continuing with a missing or stale dot file.
    """
    subprocess.run(
        ['java', '-jar', 'AOCPosetBuilder.jar',
         '-i', context_without_header,
         '-a', 'HERMES',
         '-d', dot_path,
         '-f', output_format,
         '-z'],
        check=True,
    )


_build_aoc_poset('DG-posets/aoc-simplified.dot', 'SIMPLIFIED')
#os.system('dot -Tpdf posets/families_simplified.dot -o posets/families_simplified.pdf')
_build_aoc_poset('DG-posets/aoc-full.dot', 'FULL')
#os.system('java -jar AOCPosetBuilder.jar -i ' + context_without_header + ' -a HERMES -d posets/families_minimal.dot -f MINIMAL')

# 4. Calculation of subposets (parents and children of each concept)

In [None]:
context_with_header = pd.read_csv('DG-context.csv', header=0, index_col=0)
col_names = context_with_header.columns
family_ids = context_with_header.index

families = [name for name in listdir('DG-families')
            if isfile(join('DG-families', name)) and name != '.gitignore']
# Family id -> representative word. Both come from the file name: the id is its
# first whitespace-separated token, the word is the second token cut at the
# first underscore ('.txt' is removed beforehand).
families_dict = {}
for family_file in families:
    tokens = family_file.replace('.txt', '').split()
    families_dict[tokens[0]] = tokens[1].split('_')[0]

In [None]:
full_dot = 'aoc-full.dot'
simplified_dot = 'aoc-simplified.dot'
directory = 'DG-posets'
out_directory = 'DG-subposets'

# The simplified poset is read once and reused for every family, instead of
# re-opening the file twice per family.
with codecs.open(join(directory, simplified_dot), 'r', encoding='utf-8') as f:
    dot_lines = list(f)

# L1 follows the edge direction written in the dot file, L2 the reverse one.
L1 = nx.DiGraph()
L2 = nx.DiGraph()
for line in dot_lines:
    if '->' in line:  # a line showing edges between concepts
        elements = line.split()
        L1.add_edge(elements[0], elements[2])
        L2.add_edge(elements[2], elements[0])
    elif 'shape' in line:  # a line describing a concept
        L1.add_node(line.split()[0])
        L2.add_node(line.split()[0])


def _relabel_concept_line(line, highlight):
    """Rewrite one concept line of the dot file with readable names.

    The line is split on ``|``: the first part is kept, ``Attribute <n>``
    entries of the second part become ``col_names[n]`` and ``Object <n>``
    entries of the third part become the representative word of
    ``family_ids[n]``.  The ``(I...)|`` span is replaced by the concept id
    found between ``<`` and ``>``, any orange/lightblue fill colour is removed,
    and the concept is filled in orange when ``highlight`` is true.
    """
    parts = line.split('|')
    concept_id = re.search('<(.*)>', line).group(1)
    rewritten = parts[0] + '|'
    if 'Attribute' in parts[1]:  # otherwise: empty intent
        for attribute in parts[1].split('\\n'):
            if attribute == '':
                continue
            rewritten += col_names[int(attribute.split()[1])] + '\\n'
    rewritten += '|'
    if 'Object' in parts[2]:  # otherwise: empty extent
        for obj in parts[2].split('\\n'):
            if obj == '' or '}' in obj:
                continue
            rewritten += families_dict[family_ids[int(obj.split()[1])]] + '\\n'
    rewritten += '}"];\n'
    # Raw pattern (no invalid-escape warning); the callable keeps the
    # replacement text literal.
    rewritten = re.sub(r'\(I.*\)\|', lambda match: concept_id + '|', rewritten)
    rewritten = rewritten.replace(',fillcolor=orange', '').replace(',fillcolor=lightblue', '')
    if highlight:
        rewritten = rewritten.replace('style=filled', 'style=filled,fillcolor=orange')
    return rewritten


group_prev = ''
counter = 0
# Sorted so that files of one group are adjacent, which the group_prev test
# relies on (listdir gives no ordering guarantee).
for file_name in sorted(families):
    family_id = file_name.split()[0]
    group_id = family_id.split('-')[0]
    if group_id == group_prev:
        continue
    group_prev = group_id

    # find vertex that contains the intended family and its parents+children
    try:
        object_id = family_ids.get_loc(family_id)
    except KeyError:
        #  ignored families (e.g. too much Npx)
        continue
    marker = 'Object ' + str(object_id) + '\\n'
    vertex = next((line.split()[0] for line in dot_lines if marker in line), None)
    if vertex is None:
        # Without this guard the vertex of the previous family would be reused
        # (or a NameError raised for the very first family).
        print('no concept found for', family_id)
        continue
    parents = nx.descendants(L1, vertex)
    children = nx.descendants(L2, vertex)
    selected_vertices = parents.union(children)
    selected_vertices.add(vertex)

    # find all vertex connected to the intended family, and write to dot
    out_file_name = 'poset_' + family_id + '_simplified' + '.dot'
    # Written as UTF-8, matching the encoding used to read the input; the
    # context manager closes the file even if a line fails to convert.
    with codecs.open(join(out_directory, out_file_name), 'w', encoding='utf-8') as out_file:
        for line in dot_lines:
            # strip() so that the closing brace is recognised with its newline.
            if 'graph' in line or 'rankdir' in line or line.strip() == '}':
                out_file.write(line)
                continue
            if '->' in line:
                elements = line.split()
                if elements[0] in selected_vertices and elements[2] in selected_vertices:
                    out_file.write(line)
                continue
            fields = line.split()
            if not fields:  # blank line
                continue
            vertex_id = fields[0]
            if vertex_id in selected_vertices:
                out_file.write(_relabel_concept_line(line, vertex_id == vertex))
    counter += 1
    print(group_id, end='\r')