In [None]:
import codecs
import networkx as nx
import os
import pandas as pd
import pickle
import re
from networkx.algorithms import isomorphism
from os import listdir
from os.path import isfile, join
from utils import printProgressBar

# Context from gSpan + maxgraph

In [None]:
# Build a binary context (rows = family labels, columns = gSpan graphs) from
# the "members" files written by gSpan, then append the columns of the context
# previously saved to context_from_maxgraph.csv.
input_dir = 'gSpan_s20_l3'
# NOTE(review): pickle.load can execute arbitrary code; only load
# label_family.p when it comes from a trusted source.
with open('label_family.p', 'rb') as label_file:
    family_labels = pickle.load(label_file)
file_names = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
# Only files whose name contains 'members' are read; filtering up front lets
# the progress bar count exactly the files that are processed.
member_file_names = [f for f in file_names if 'members' in f]
membership_by_graph = {}
counter = 0
for file_name in member_file_names:
    graph_number = 'G' + file_name.split('.')[0]
    membership = pd.Series([0] * len(family_labels))
    membership.index = family_labels
    with codecs.open(join(input_dir, file_name), 'r', encoding='utf-8') as f:
        for line in f:
            label = line.strip('\n').strip('\r')
            if not label:
                # A blank line carries no label; skip it instead of adding
                # an empty-string row to the membership vector.
                continue
            membership[label] = 1
    membership_by_graph[graph_number] = membership
    counter += 1
    printProgressBar(counter, len(member_file_names), prefix = 'Progress:', suffix = 'Complete', length = 50)
# Assemble all columns in one go (avoids growing the frame column by column).
# Passing index=family_labels aligns every column on the known labels, so a
# label that is not in family_labels is dropped rather than breaking the frame.
context_from_gSpan = pd.DataFrame(membership_by_graph, index=family_labels)
context_from_maxgraph = pd.read_csv('context_from_maxgraph.csv', header=0, index_col=0)
# Column names have every 'G' replaced by 'F' before the two contexts are joined.
context_from_maxgraph = context_from_maxgraph.rename(columns=lambda s: s.replace('G', 'F'))
context = pd.concat([context_from_gSpan, context_from_maxgraph], axis=1)
context.shape

In [None]:
# Count, per row, how many columns are set. Any existing 'total_graph' column
# is excluded from the sum so that re-running this cell does not add the
# previous total to itself.
context['total_graph'] = context.drop(columns=['total_graph'], errors='ignore').sum(axis=1)
# Display only the rows that belong to at least one column.
context.loc[context['total_graph'] > 0]

In [None]:
# Remove the helper column again; errors='ignore' keeps the cell re-runnable
# when the column has already been dropped.
context.drop(columns=['total_graph'], inplace=True, errors='ignore')

In [None]:
# Persist the current context, keeping both the column header and the row index.
context_file_name = 'context_from_gSpan.csv'
context.to_csv(path_or_buf=context_file_name, index=True, header=True)

# Context from families' maximal graphs

In [None]:
# Build a binary context with one row per graph file and one column per group
# (the file-name prefix before the first '-'). For each group, the first file
# of the group (in sorted order) is tested as a label-preserving subgraph of
# the first file of every other group; files of the same group as the tested
# subgraph are always marked 1.
input_dir = 'graph_binary'
input_files = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
input_files.sort()


def _labels_match(attrs1, attrs2):
    """Return True when two node (or edge) attribute dicts carry the same 'label'."""
    return attrs1['label'] == attrs2['label']


def _load_graph(file_name):
    """Unpickle one graph from input_dir and close the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
    with open(join(input_dir, file_name), 'rb') as graph_file:
        return pickle.load(graph_file)


subgroup_prev = ''
membership_columns = []
counter = 1
input_files_count = len(input_files)
for subgraph in input_files:
    printProgressBar(counter, input_files_count, prefix = 'Progress:', suffix = 'Complete', length = 50)
    counter += 1
    subgroup_id = subgraph.split('-')[0]
    if subgroup_id == subgroup_prev:
        # Only the first file of each group is used as the candidate subgraph.
        continue
    membership = list()
    G2 = _load_graph(subgraph)
    G2_node_count = len(G2)
    supergroup_prev = ''
    for supergraph in input_files:
        supergroup_id = supergraph.split('-')[0]
        if supergroup_id == subgroup_id:
            membership.append(1)
            continue
        if supergroup_id == supergroup_prev:
            # Same group as the previously tested file: reuse its result.
            membership.append(membership[-1])
            continue
        G1 = _load_graph(supergraph)
        if len(G1) < G2_node_count:
            # A graph with fewer nodes cannot contain G2; skip the matcher.
            membership.append(0)
        else:
            GM = isomorphism.DiGraphMatcher(G1, G2, node_match=_labels_match, edge_match=_labels_match)
            membership.append(1 if GM.subgraph_is_isomorphic() else 0)
        supergroup_prev = supergroup_id
    subgroup_prev = subgroup_id
    membership_columns.append(pd.Series(membership, name=subgroup_id.replace('F', 'G')))
# Concatenate once instead of re-copying the growing frame on every iteration.
context = pd.concat(membership_columns, axis=1) if membership_columns else pd.DataFrame()
context.index = input_files

In [None]:
# Write the context (with header row and index column) to disk.
context.to_csv(path_or_buf='context_from_maxgraph.csv')

In [None]:
# Show the (rows, columns) dimensions of the context.
context.shape

In [None]:
# Display the context frame itself for visual inspection.
context

# Run AOC-poset

In [None]:
# Re-export the saved context as a bare matrix: the first CSV column is read
# as the index and the first row as the header, and neither is written back.
context_without_header = 'context_without_header.csv'
context_with_header = pd.read_csv('context_from_maxgraph.csv', index_col=0, header=0)
context_with_header.to_csv(path_or_buf=context_without_header, index=False, header=False)

In [None]:
# Run AOCPosetBuilder once per output format on the header-less context and
# keep every exit status, so a failed run is reported instead of being
# silently ignored.
os.makedirs('posets', exist_ok=True)  # the -d targets below live in posets/
poset_outputs = [
    ('SIMPLIFIED', 'posets/families_simplified_maxgraph.dot'),
    ('FULL', 'posets/families_full_maxgraph.dot'),
]
exit_statuses = {}
for poset_format, dot_path in poset_outputs:
    command = 'java -jar AOCPosetBuilder.jar -i ' + context_without_header + ' -a HERMES -d ' + dot_path + ' -f ' + poset_format + ' -z'
    exit_statuses[dot_path] = os.system(command)
    if exit_statuses[dot_path] != 0:
        print('AOCPosetBuilder failed for ' + dot_path + ' (exit status ' + str(exit_statuses[dot_path]) + ')')
# Disabled alternatives kept for reference:
#os.system('dot -Tpdf posets/families_simplified.dot -o posets/families_simplified.pdf')
#os.system('java -jar AOCPosetBuilder.jar -i ' + context_without_header + ' -a HERMES -d posets/families_minimal.dot -f MINIMAL')
exit_statuses

Adding extent & intent size to the minimal version

In [None]:
# Derive posets/families_minimal.dot from the simplified poset: structural
# lines (graph header, rankdir, edges, closing brace) are copied verbatim;
# every other line is cut down to its first '|'-separated field, with the
# concept id (the text between '<' and '>') inserted and 'I'/'E' expanded to
# 'Graphs'/'Families'.
# NOTE(review): this reads families_simplified.dot, while the AOCPosetBuilder
# cell writes families_simplified_maxgraph.dot - confirm the input file exists.
file_simp = 'posets/families_simplified.dot'
# The input is opened first so that a missing input does not truncate the output.
with codecs.open(file_simp) as simplified, codecs.open('posets/families_minimal.dot', 'w') as file_min:
    for line in simplified:
        # strip() so that a closing brace followed by a newline is recognised.
        is_structural = 'graph' in line or 'rankdir' in line or '->' in line or line.strip() == '}'
        concept_match = None if is_structural else re.search('<(.*)>', line)
        if concept_match is None:
            # Structural lines, and lines without a '<id>' (e.g. blank lines),
            # are passed through unchanged.
            file_min.write(line)
            continue
        concept_id = concept_match.group(1)
        line_elements = line.split('|')
        line_elements[0] = line_elements[0].replace('> (', '> ' + concept_id + '|(').replace('I', 'Graphs').replace('E', 'Families')
        # Keep only the first field and close the node label right after it.
        file_min.write(line_elements[0] + '}\"];\n')
os.system('dot -Tpdf posets/families_minimal.dot -o posets/families_minimal.pdf')

# AOC-poset properties

In [None]:
# Load the simplified poset from its .dot file into a directed graph and print
# summary statistics. An edge "a -> b" is stored as (a, b); following the
# printed labels, out-degree is reported as the number of parents and
# in-degree as the number of children.
L = nx.DiGraph()
with codecs.open(join('posets', 'families_simplified_maxgraph.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if 'Attribute' in line:
            # Node declaration: the first token is the node id.
            L.add_node(line.split()[0])
        elif '->' in line:
            elements = line.split()
            L.add_edge(elements[0], elements[2])

in_degrees = [L.in_degree(node) for node in L]
out_degrees = [L.out_degree(node) for node in L]
# Minima, sums and averages only consider nodes that have at least one
# child / parent.
child_counts = [degree for degree in in_degrees if degree > 0]
parent_counts = [degree for degree in out_degrees if degree > 0]

count_in = len(child_counts)
sum_in = sum(child_counts)
count_out = len(parent_counts)
sum_out = sum(parent_counts)
max_child = max(child_counts, default=0)
max_parent = max(parent_counts, default=0)
# default=0 replaces the former sentinel of 1000, which was reported as the
# minimum whenever the graph had no edges (or only degrees of 1000 and above).
min_child = min(child_counts, default=0)
min_parent = min(parent_counts, default=0)
no_edge = sum(1 for n_in, n_out in zip(in_degrees, out_degrees) if n_in == 0 and n_out == 0)
top = sum(1 for n_in, n_out in zip(in_degrees, out_degrees) if n_out == 0 and n_in > 0)

# Guard the averages: with no edges there is nothing to divide by.
average_parents = str(sum_out / count_out) if count_out else 'n/a'
average_children = str(sum_in / count_in) if count_in else 'n/a'

print('number of concepts: ' + str(len(L.nodes)))
print('concept without parent/child: ' + str(no_edge))
print('top concepts:', top)
print('max_parent:', max_parent)
print('max_child:', max_child)
print('min_parent', min_parent)
print('min_child', min_child)
print('average number of parents: ' + average_parents)
print('average number of children: ' + average_children)
print('number of levels:' + str(len(nx.dag_longest_path(L))))