In [1]:
import codecs
import networkx as nx
import pandas as pd
import pickle
import re
from os import listdir
from os.path import isfile, join
from utils import printProgressBar



# Using AOC-posets from sub fingerprint

In [None]:
# Load the formal context table. Its column labels and row labels are kept
# because later cells index into them with the numeric ids read from the
# poset .dot files.
context_with_header = pd.read_csv('context_from_gSpan.csv', header=0, index_col=0)
col_names = context_with_header.columns
family_ids = context_with_header.index

# os.listdir() returns names in arbitrary order. The loop below keeps only the
# first file of each group by comparing with the previously seen group, which
# is only correct when files of one group are adjacent -- hence the sort.
families = sorted(f for f in listdir('families_fingerprint') if isfile(join('families_fingerprint', f)))
group_dict = dict()  # contains a dot filename for a given family group
prev_group = ''
for f in families:
    # The group id is the part of the first whitespace-separated token that
    # precedes the first '-'.
    elements = f.replace('.txt', '').split()
    group_id = elements[0].split('-')[0]
    if group_id == prev_group:
        continue
    prev_group = group_id
    group_dict[group_id] = f.replace('.txt', '.dot')

In [None]:
# Rebuild the poset from its Graphviz file.
#   L           : directed graph; a line "a -> b" is stored as the edge (b, a)
#   intent_dict : vertex id -> set of attribute labels ('Attribute ' is
#                 rewritten to the prefix 'G')
#   extent_dict : vertex id -> set of row labels of the context, looked up in
#                 family_ids by the integer that follows 'Object '
L = nx.DiGraph()
intent_dict = dict()
extent_dict = dict()
with codecs.open(join('posets', 'families_full_s10_l5_fingerprint.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:
            v = line.split()
            L.add_edge(v[2], v[0])
        elif 'shape' in line:
            vertex_id = line.split()[0]
            # Node label fields are '|'-separated: field 1 holds the
            # attributes, field 2 the objects. Entries inside a field are
            # separated by a literal backslash-n; the last piece after the
            # final separator is dropped.
            fields = line.split('|')
            intent_dict[vertex_id] = set(fields[1].replace('Attribute ', 'G').split('\\n')[:-1])
            object_list = fields[2].replace('Object ', '').split('\\n')[:-1]
            extent_dict[vertex_id] = {family_ids[int(obj)] for obj in object_list}
            # Raw string: '\)' in a normal string literal is an invalid escape
            # sequence and triggers a warning on recent Python versions.
            extent_size = re.search(r'E: (.*)\)', line).group(1)
            L.add_node(vertex_id, extent_size=int(extent_size))

In [None]:
# Build one row per edge (vertex -> successor) of L, comparing the intent and
# extent of the two concepts, and export the table to Excel.
#
# Rows are collected in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
lattice_columns = ['parent_concept', 'child_concept', 'parent_int', 'child-parent_int',
                   'parent_ext_size', 'child_ext_size', 'ext_diff', 'parent-child_ext', 'child_ext']
records = []
vertices = list(L.nodes)
edge_counter = 0
L_size = L.size()  # number of edges, used as the progress-bar total
for vertex in vertices:
    current_extent_size = int(L.nodes[vertex]['extent_size'])
    for child in L.neighbors(vertex):
        child_extent = extent_dict[child]
        records.append({
            'parent_concept': '',  # left blank in the exported sheet
            'child_concept': '',   # left blank in the exported sheet
            'parent_int': intent_dict[vertex],
            # attributes the child has in addition to the parent's
            'child-parent_int': intent_dict[child] - intent_dict[vertex],
            'parent_ext_size': current_extent_size,
            'child_ext_size': len(child_extent),
            # relative shrink of the extent from parent to child
            'ext_diff': (current_extent_size - len(child_extent)) / current_extent_size,
            # objects of the parent that are not in the child
            'parent-child_ext': extent_dict[vertex] - child_extent,
            'child_ext': child_extent
        })
        edge_counter += 1
        printProgressBar(edge_counter, L_size, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)
df_lattice = pd.DataFrame(records, columns=lattice_columns)
df_lattice.to_excel('concept_comparison_gSpan.xlsx', index=False)
df_lattice.shape

# Using AOC-posets from max graph

In [2]:
# Load the formal context table. Its column labels and row labels are kept
# because later cells index into them with the numeric ids read from the
# poset .dot files.
context_with_header = pd.read_csv('context_from_maxgraph.csv', header=0, index_col=0)
col_names = context_with_header.columns
family_ids = context_with_header.index

# os.listdir() returns names in arbitrary order. The loop below keeps only the
# first file of each group by comparing with the previously seen group, which
# is only correct when files of one group are adjacent -- hence the sort.
families = sorted(f for f in listdir('families') if isfile(join('families', f)))
group_dict = dict()  # contains a dot filename for a given family group
prev_group = ''
for f in families:
    # The group id is the part of the first whitespace-separated token that
    # precedes the first '-'.
    elements = f.replace('.txt', '').split()
    group_id = elements[0].split('-')[0]
    if group_id == prev_group:
        continue
    prev_group = group_id
    group_dict[group_id] = f.replace('.txt', '.dot')

In [3]:
# Map each group label (with 'F' rewritten to 'G') to len() of the first
# pickled graph found for that group.
# presumably the pickles are networkx graphs, so len(G) is the number of
# nodes -- TODO confirm against the code that wrote them.
node_count = dict()
input_dir = 'graph_binary'
# Sorted because os.listdir() order is arbitrary and the "same group as the
# previous file" check below needs files of one group to be adjacent.
graphs = sorted(g for g in listdir(input_dir) if isfile(join(input_dir, g)))
prev_group = ''
for graph in graphs:
    group = graph.split('-')[0]
    if group == prev_group:
        continue
    prev_group = group
    # NOTE(security): pickle.load can execute arbitrary code; only use it on
    # files produced by this project.
    # The context manager closes the file handle, which the bare
    # pickle.load(open(...)) form left open.
    with open(join(input_dir, graph), 'rb') as graph_file:
        G = pickle.load(graph_file)
    node_count[group.replace('F', 'G')] = len(G)

In [4]:
# Rebuild the simplified poset from its Graphviz file. A line "a -> b" is
# stored as the edge (b, a). Each 'Attribute' line becomes a node carrying:
#   proper_intent : the context column label selected by the integer that
#                   follows 'Attribute ' in the node label
#   extent_size   : the text captured after 'E: ' (kept as a string here)
L = nx.DiGraph()
df_lattice = pd.DataFrame(columns=['parent', 'child', 'parent_ext', 'child_ext', 'parent_len', 'child_len'])
with codecs.open(join('posets', 'families_simplified_maxgraph.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:
            v = line.split()
            L.add_edge(v[2], v[0])
        elif 'Attribute' in line:
            vertex_id = line.split()[0]
            # Field 1 of the '|'-separated label is 'Attribute <n>' followed
            # by a literal backslash-n; strip both to get the column index.
            attribute_index = int(line.split('|')[1].replace('Attribute ', '').replace('\\n', ''))
            proper_intent = col_names[attribute_index]
            # Raw string: '\)' in a normal string literal is an invalid escape
            # sequence and triggers a warning on recent Python versions.
            extent_size = re.search(r'E: (.*)\)', line).group(1)
            L.add_node(vertex_id, proper_intent=proper_intent, extent_size=extent_size)

In [5]:
# Build one row per edge (vertex -> successor) of L with the extent sizes and
# graph lengths of both ends, then export the table to Excel.
#
# Rows are collected in a list and the frame is built once: DataFrame.append
# was removed in pandas 2.0 and copied the frame on every call. Rebuilding
# df_lattice here also means re-running this cell no longer duplicates rows.
records = []
vertices = list(L.nodes)
edge_counter = 0
L_size = L.size()  # number of edges, used as the progress-bar total
for vertex in vertices:
    current_extent_size = int(L.nodes[vertex]['extent_size'])
    current_graph_len = node_count[L.nodes[vertex]['proper_intent']]
    for child in L.neighbors(vertex):
        # group_dict is keyed by the 'F…' form of the label, node_count by the
        # 'G…' form stored in proper_intent.
        family_group_of_parent = L.nodes[vertex]['proper_intent'].replace('G', 'F')
        family_group_of_child = L.nodes[child]['proper_intent'].replace('G', 'F')
        parent_file = group_dict[family_group_of_parent]
        child_file = group_dict[family_group_of_child]
        records.append({
            # Excel HYPERLINK formulas pointing into graph_visualization\
            'parent': '=HYPERLINK("graph_visualization\\' + parent_file + '", "' + parent_file + '")',
            'child': '=HYPERLINK("graph_visualization\\' + child_file + '", "' + child_file + '")',
            'parent_ext': current_extent_size,
            'child_ext': int(L.nodes[child]['extent_size']),
            'parent_len': current_graph_len,
            'child_len': node_count[L.nodes[child]['proper_intent']]
        })
        edge_counter += 1
        printProgressBar(edge_counter, L_size, prefix = 'Progress:', suffix = 'complete', length = 50, decimals = 2)
df_lattice = pd.DataFrame(records, columns=['parent', 'child', 'parent_ext', 'child_ext', 'parent_len', 'child_len'])
df_lattice['child_parent_ext_ratio'] = df_lattice['child_ext'] / df_lattice['parent_ext']
df_lattice['graph_len_diff'] = df_lattice['child_len'] - df_lattice['parent_len']
df_lattice.to_excel('concept_comparison_maxgraph.xlsx', index=False)

Progress: |██████████████████████████████████████████████████| 100.00% complete




# Using AOC-posets
## looking at two concepts having one-node difference

In [None]:
# Map each group label (with 'F' rewritten to 'G') to len() of the first
# pickled graph found for that group.
# presumably the pickles are networkx graphs, so len(G) is the number of
# nodes -- TODO confirm against the code that wrote them.
node_count = dict()
input_dir = 'fingerprint_binary'
# Sorted because os.listdir() order is arbitrary and the "same group as the
# previous file" check below needs files of one group to be adjacent.
graphs = sorted(g for g in listdir(input_dir) if isfile(join(input_dir, g)))
prev_group = ''
for graph in graphs:
    group = graph.split('-')[0]
    if group == prev_group:
        continue
    prev_group = group
    # NOTE(security): pickle.load can execute arbitrary code; only use it on
    # files produced by this project.
    # The context manager closes the file handle, which the bare
    # pickle.load(open(...)) form left open.
    with open(join(input_dir, graph), 'rb') as graph_file:
        G = pickle.load(graph_file)
    node_count[group.replace('F', 'G')] = len(G)

In [None]:
# Context table for the fingerprint poset; column and row labels are kept
# for lookups by numeric id.
context_with_header = pd.read_csv('context_from_fingerprint.csv', header=0, index_col=0)
col_names, family_ids = context_with_header.columns, context_with_header.index

fingerprint_dir = 'families_fingerprint'
families = [name for name in listdir(fingerprint_dir) if isfile(join(fingerprint_dir, name))]
# contains a word for a given family: first token of the file name -> the
# second token cut at its first '_'
families_dict = {
    tokens[0]: tokens[1].split('_')[0]
    for tokens in (name.replace('.txt', '').split() for name in families)
}

# Rebuild the simplified poset; a line "a -> b" is stored as the edge (b, a)
# and every 'Attribute' line becomes a node annotated with its column label
# and the matching entry of node_count.
L = nx.DiGraph()
poset_path = join('posets', 'families_simplified_fingerprint.dot')
with codecs.open(poset_path, 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:
            tokens = line.split()
            L.add_edge(tokens[2], tokens[0])
            continue
        if 'Attribute' not in line:
            continue
        vertex_id = line.split()[0]
        attribute_index = int(line.split('|')[1].replace('Attribute ', '').replace('\\n', ''))
        proper_intent = col_names[attribute_index]
        L.add_node(vertex_id, proper_intent=proper_intent, graph_size=node_count[proper_intent])

In [None]:
# Print every edge of L whose successor graph is exactly one unit larger than
# the source graph and larger than 6, as "<intent> <intent> <size>-><size>".
vertices = list(L.nodes)
for vertex in vertices:
    parent_size = int(L.nodes[vertex]['graph_size'])
    for child in L.neighbors(vertex):
        child_size = int(L.nodes[child]['graph_size'])
        grows_by_one = (child_size - parent_size) == 1
        if not grows_by_one or child_size <= 6:
            continue
        size_change = '{}->{}'.format(parent_size, child_size)
        print(L.nodes[vertex]['proper_intent'], L.nodes[child]['proper_intent'], size_change)

# Using apriori principle

In [None]:
import codecs
import pandas as pd
import os
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
from os import listdir
from os.path import isfile, join
from utils import list2lists

# column number
# Zero-based positions of the tab-separated fields in a family file, assigned
# in file order through a single unpacking of range(44).
(
    rid, fid,                                      # 0-1
    lid_1, graph_1, ori_graph_1,                   # 2-4
    lid_2, graph_2, ori_graph_2,                   # 5-7
    cat_1, ori_cat_1, cat_2, ori_cat_2,            # 8-11
    ori_cple,                                      # 12
    type_cstr_1, cstr_1, ori_cstr_1,               # 13-15
    type_cstr_2, cstr_2, ori_cstr_2,               # 16-18
    complexite, ori_complexite,                    # 19-20
    orientation, ori_orientation,                  # 21-22
    semty_1, ori_semty_1, semty_2, ori_semty_2,    # 23-26
    sous_semty_1, sous_semty_2,                    # 27-28
    ori_sous_semty_1, ori_sous_semty_2,            # 29-30
    semtyrss_1, semtyrss_2,                        # 31-32
    ori_semtyrss_1, ori_semtyrss_2,                # 33-34
    rel_sem_n1, rel_sem_n2, ori_relsem,            # 35-37
    def_conc, ori_def_conc,                        # 38-39
    def_abs, ori_def_abs,                          # 40-41
    commentaires, fichier_origine,                 # 42-43
) = range(44)

In [None]:
#  options1: Vertex: category, cstr. Edge: orientation, complexite
# Build one transaction per family file (first file of each '-'-prefixed
# group only) and one-hot encode the transactions into `df`.
input_dir = 'families'
# Sorted because os.listdir() order is arbitrary and the fam_prec check below
# only skips repeated groups when their files are adjacent.
input_files = sorted(f for f in listdir(input_dir) if isfile(join(input_dir, f)))

forward_orientations = ('as2de', 'as2des')
backward_orientations = ('de2as', 'des2as')

transactions = list()
index_for_df = list()
fam_prec = ''
for input_file in input_files:
    if '-' in input_file and input_file.split('-')[0] == fam_prec:
        continue
    fam_prec = input_file.split('-')[0]
    transaction = list()
    pairs = set()  # full items already seen in this file
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num < 2:
                # the first two lines of a family file are not data rows
                continue
            elements = line.replace('\n','').replace(' ','').split('\t')
            direction = elements[orientation]
            if direction in forward_orientations:
                item = elements[graph_1] + '-' + elements[graph_2] + ':' + elements[cat_1] + ' -> ' + \
                    elements[complexite] + '_' + elements[cstr_1] + '-' + elements[cstr_2] + ' -> ' + elements[cat_2]
            elif direction in backward_orientations:
                item = elements[graph_2] + '-' + elements[graph_1] + ':' + elements[cat_2] + ' -> ' + \
                    elements[complexite] + '_' + elements[cstr_2] + '-' + elements[cstr_1] + ' -> ' + elements[cat_1]
            else:
                # 'indirect', 'NA' and any other value are skipped. Without
                # this branch an unexpected value reused the previous row's
                # item (or raised NameError on the very first data row).
                continue
            if item not in pairs:
                pairs.add(item)
                # only the part after the graph pair goes into the transaction
                transaction.append(item.split(':')[1])
    # list2lists (project helper) may split one transaction into several;
    # in that case each part gets an index suffixed with its position.
    list_tr = list2lists(transaction)
    if len(list_tr) == 1:
        transactions.append(list_tr[0])
        index_for_df.append(input_file.split()[0])
    else:
        for tr_id, tr in enumerate(list_tr):
            transactions.append(tr)
            index_for_df.append(input_file.split()[0] + '_' + str(tr_id))

te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
df.index = index_for_df
df

In [None]:
#  options2: Vertex: categorie, semty, cstr. Edges: rel_sem
# Build one transaction per family file and one-hot encode them into `df`.
# Column positions use the named constants of the column-number cell
# (semty_1=23, semty_2=25, rel_sem_n2=36, orientation=21, cat_1=8, cstr_1=14,
# cat_2=10, cstr_2=17) instead of bare numbers.
input_dir = 'families'
input_files = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]

forward_orientations = ('as2de', 'as2des')
backward_orientations = ('de2as', 'des2as')

transactions = list()
for input_file in input_files:
    transaction = list()
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num < 2:
                # the first two lines of a family file are not data rows
                continue
            line_elements = line.replace('\n','').replace(' ','').split('\t')
            if line_elements[semty_1] == '' or line_elements[semty_2] == '' or line_elements[rel_sem_n2] == '':
                continue

            direction = line_elements[orientation]
            if direction not in forward_orientations and direction not in backward_orientations:
                continue

            # A field may hold several '|'-separated alternatives; split()
            # returns a one-element list when there is no '|'.
            lex1 = line_elements[semty_1].split('|')
            lex2 = line_elements[semty_2].split('|')
            relation = line_elements[rel_sem_n2]
            for l1 in lex1:
                for l2 in lex2:
                    vertex_1 = line_elements[cat_1] + '_' + l1 + '_' + line_elements[cstr_1]
                    vertex_2 = line_elements[cat_2] + '_' + l2 + '_' + line_elements[cstr_2]
                    if direction in forward_orientations:
                        source, target = vertex_1, vertex_2
                    else:
                        source, target = vertex_2, vertex_1
                    # Append inside the loops so every (l1, l2) combination is
                    # recorded; the append used to sit after the loops and
                    # kept only the last combination.
                    transaction.append(source + ' -> ' + relation + ' -> ' + target)
    transactions.append(transaction)
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
df.index = input_files
df

In [None]:
# Mine itemsets present in at least 1% of the rows of df, then annotate each
# with its item count and with support scaled by the number of rows.
row_total = df.shape[0]
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].map(len)
frequent_itemsets['# of families'] = frequent_itemsets['support'] * row_total
frequent_itemsets

In [None]:
# Derive association rules with confidence >= 0.8 and discard the metric
# columns that are not inspected afterwards.
dropped_metrics = ['antecedent support', 'consequent support', 'lift', 'leverage', 'conviction']
rules = (
    association_rules(frequent_itemsets, metric='confidence', min_threshold=0.8)
    .drop(columns=dropped_metrics)
)
rules

In [None]:
def _rows_matching(conditions):
    """AND a list of per-row conditions together, starting from all-True."""
    mask = pd.Series([True] * df.shape[0], index=df.index)
    for condition in conditions:
        mask &= condition
    return mask


def _family_summary(mask):
    """Format the selected index labels as '<count> families: id id ... '."""
    selected = df.loc[mask].index
    ids = ''.join(label.split(' ')[0] + ' ' for label in selected)
    return str(len(selected)) + ' families: ' + ids


# For every rule, list the rows of df that contain the antecedents together
# with all consequents, and the rows that contain the antecedents but none of
# the consequents.
families_ant_con = list()
families_ant = list()
for chosen_rule_id in rules.index:
    set_of_antecedents = rules.loc[chosen_rule_id, 'antecedents']
    set_of_consequents = rules.loc[chosen_rule_id, 'consequents']
    has_antecedents = [df[ante] for ante in set_of_antecedents]
    lacks_consequents = [~df[cons] for cons in set_of_consequents]
    has_consequents = [df[cons] for cons in set_of_consequents]

    families_ant.append(_family_summary(_rows_matching(has_antecedents + lacks_consequents)))
    families_ant_con.append(_family_summary(_rows_matching(has_antecedents + has_consequents)))
rules['families with ante & cons'] = families_ant_con
rules['families with ante only'] = families_ant
rules

In [None]:
# Write the `rules` table (with its index) to an Excel workbook.
# Optional: uncomment the next line to order rules by descending confidence
# before exporting.
#rules.sort_values(by=['confidence'], ascending=False, inplace=True)
rules.to_excel('rules_double.xlsx')