# Using AOC-posets

In [1]:
import codecs
import networkx as nx
import pandas as pd
import re
from os import listdir
from os.path import isfile, join



In [2]:
context_with_header = pd.read_csv('context_from_fingerprint.csv', header=0, index_col=0)
col_names = context_with_header.columns
family_ids = context_with_header.index

families = [f for f in listdir('families_fingerprint') if isfile(join('families_fingerprint', f))]
families_dict = dict()  # contains a word for a given family
for f in families:
    elements = f.replace('.txt', '').split()
    families_dict[elements[0]] = elements[1].split('_')[0]

In [3]:
L = nx.DiGraph()
with codecs.open(join('posets', 'families_simplified_fingerprint.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:
            v = line.split()
            L.add_edge(v[2], v[0])
        elif 'Attribute' in line:
            vertex_id = line.split()[0]
            proper_intent = col_names[int(line.split('|')[1].replace('Attribute ', '').replace('\\n', ''))]
            extent_size = re.search('E: (.*)\)', line).group(1)
            L.add_node(vertex_id, proper_intent=proper_intent, extent_size=extent_size)

In [4]:
vertices = list(L.nodes)
for vertex in vertices:
    #if vertex != '1472682156':
        #continue
    current_extent_size = int(L.nodes[vertex]['extent_size'])
    children = L.neighbors(vertex)
    for child in children:
        child_extent_size = int(L.nodes[child]['extent_size'])
        if float(child_extent_size / current_extent_size) > 0.9:
            print(L.nodes[vertex]['proper_intent'], L.nodes[child]['proper_intent'],\
                  str(current_extent_size) + '->' + str(child_extent_size))

G00497 G03303 13->12
G01816 G03427 13->12
G01585 G02245 17->16
G01802 G03576 21->20
G02514 G03682 22->21
G02258 G03714 84->83
G01800 G03657 134->128
G01040 G03756 142->137
G03621 G03805 279->259
G03615 G03791 302->283
G03677 G03797 344->321
G03654 G03789 461->433
G03624 G03793 641->589
G03688 G03775 1074->996


# Using apriori principle

In [1]:
import codecs
import pandas as pd
import os
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
from os import listdir
from os.path import isfile, join
from utils import list2lists

# column number
rid = 0
fid = 1
lid_1 = 2
graph_1 = 3
ori_graph_1 = 4
lid_2 = 5
graph_2 = 6
ori_graph_2 = 7
cat_1 = 8
ori_cat_1 = 9
cat_2 = 10
ori_cat_2 = 11
ori_cple = 12
type_cstr_1 = 13
cstr_1 = 14
ori_cstr_1 = 15
type_cstr_2 = 16
cstr_2 = 17
ori_cstr_2 = 18
complexite = 19
ori_complexite = 20
orientation = 21
ori_orientation = 22
semty_1 = 23
ori_semty_1 = 24
semty_2 = 25
ori_semty_2 = 26
sous_semty_1 = 27
sous_semty_2 = 28
ori_sous_semty_1 = 29
ori_sous_semty_2 = 30
semtyrss_1 = 31
semtyrss_2 = 32
ori_semtyrss_1 = 33
ori_semtyrss_2 = 34
rel_sem_n1 = 35
rel_sem_n2 = 36
ori_relsem = 37
def_conc = 38
ori_def_conc = 39
def_abs = 40
ori_def_abs = 41
commentaires = 42
fichier_origine = 43



In [2]:
#  options1: Vertex: category, cstr. Edge: orientation, complexite
input_dir = 'families'
input_files = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]

transactions = list()
index_for_df = list()
fam_prec = ''
for input_file in input_files:
    transaction = list()
    pairs = list()
    if '-' in input_file and input_file.split('-')[0] == fam_prec:
        continue
    fam_prec = input_file.split('-')[0]
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num >= 2:
                elements = line.replace('\n','').replace(' ','').split('\t')
                
                lex = list()
                #lex.append(elements[cat_1] + '_' + elements[cstr_1])
                #lex.append(elements[cat_2] + '_' + elements[cstr_2])
                if elements[orientation] == 'indirect' or elements[orientation] == 'NA':
                    continue
                    '''lex.sort()
                    item = lex[0] + ' - ' + elements[complexite] + '_' + elements[orientation] + ' - ' + lex[1]
                    transaction.append(item)'''
                if elements[orientation] == 'as2de' or elements[orientation] == 'as2des':
                    item = elements[graph_1] + '-' + elements[graph_2] + ':' + elements[cat_1] + ' -> ' + \
                    elements[complexite] + '_' + elements[cstr_1] + '-' + elements[cstr_2] + ' -> ' + elements[cat_2]
                elif elements[orientation] == 'de2as' or elements[orientation] == 'des2as':
                    item = elements[graph_2] + '-' + elements[graph_1] + ':' + elements[cat_2] + ' -> ' + \
                    elements[complexite] + '_' + elements[cstr_2] + '-' + elements[cstr_1] + ' -> ' + elements[cat_1]
                if item not in pairs:
                    pairs.append(item)
                    transaction.append(item.split(':')[1])
                lex.clear()
    list_tr = list2lists(transaction)
    if len(list_tr) == 1:
        transactions.append(list_tr[0])
        index_for_df.append(input_file.split()[0])
    else:
        for tr_id, tr in enumerate(list_tr):
            transactions.append(tr)
            index_for_df.append(input_file.split()[0] + '_' + str(tr_id))
    #index_for_df.append(input_file.split()[0])
    #transactions.append(transaction)
    
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
df.index = index_for_df
df

Unnamed: 0,Adj -> motiv-form_X-Xistique -> Adj,Adj -> motiv-sem_X-Xariser -> V,Adj -> motiv-sem_X-Xique -> Adj,Adj -> motiv-sem_X-Xétique -> Adj,Adj -> motiv-sem_X-antiXaire -> Nm,Adj -> motiv-sem_X-dé1Xifier -> V,Adj -> motiv-sem_X-dé1Xiser -> V,Adj -> motiv-sem_X-orthoXaire -> Adj,Adj -> simple_X-X -> Nf,Adj -> simple_X-X -> Nm,...,V -> simple_X-enX -> V,V -> simple_X-hypoX -> V,V -> simple_X-interX -> V,V -> simple_X-paraX -> V,V -> simple_X-préX -> V,V -> simple_X-reX -> V,V -> simple_X-surX -> V,V -> simple_X-synX -> V,V -> simple_X-transX -> V,V -> simple_X-éX -> V
F00000_0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
F00000_1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
F00001_0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
F00001_1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
F00002_0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F03915-000,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
F03916-000,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
F03917-000,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
F03918-000,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [25]:
#  options1: Vertex: categorie, semty, cstr. Edges: rel_sem
input_dir = 'families'
input_files = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]

transactions = list()
for input_file in input_files:
    transaction = list()
    #if input_file != 'F03649 contraceptif_Adj.txt':
        #continue
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num >= 2:
                line_elements = line.replace('\n','').replace(' ','').split('\t')
                if line_elements[23] == '' or line_elements[25] == '' or line_elements[36] == '':
                    continue
                
                lex1 = list()
                lex2 = list()
                if '|' in line_elements[23]:
                    lex1 = line_elements[23].split('|')
                else:
                    lex1.append(line_elements[23])
                if '|' in line_elements[25]:
                    lex2 = line_elements[25].split('|')
                else:
                    lex2.append(line_elements[25])
                if line_elements[21] == 'as2de' or line_elements[21] == 'as2des':
                    for l1 in lex1:
                        for l2 in lex2:
                            item = line_elements[8] + '_' + l1 + '_' + line_elements[14] \
                            + ' -> ' + line_elements[36] + ' -> ' + line_elements[10] \
                            + '_' + l2 + '_' + line_elements[17]
                    transaction.append(item)
                elif line_elements[21] == 'de2as' or line_elements[21] == 'des2as':
                    for l1 in lex1:
                        for l2 in lex2:
                            item = line_elements[10] + '_' + l2 + '_' + line_elements[17] \
                            + ' -> ' + line_elements[36] + ' -> ' + line_elements[8] \
                            + '_' + l1 + '_' + line_elements[14]
                    transaction.append(item)
                '''else:
                    for l1 in lex1:
                        for l2 in lex2:
                            l12 = sorted([line_elements[8] + '_' + l1 + '_' + line_elements[14], \
                            line_elements[10] + '_' + l2 + '_' + line_elements[17]])
                            item = l12[0] + ' - ' + line_elements[36] + ' - ' + l12[1]
                    transaction.append(item)'''
                lex1.clear()
                lex1.clear()
    transactions.append(transaction)
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)
df.index = input_files
df

Unnamed: 0,Adj_Modifier_X -> Ap -> Adj_Relational_Xique,Adj_Modifier_X -> Ap -> Adj_Relational_Xétique,Adj_Modifier_X -> causatif -> V_Situation_X,Nf_Abstract_entity_X -> synonymie -> Adj_Modifier_Xal,Nf_Abstract_entity_X -> synonymie -> Adj_Modifier_Xoïdal,Nf_Act_X -> synonymie -> Adj_Modifier_Xal,Nf_Animal_X -> synonymie -> Adj_Modifier_Xal,Nf_Artifact_X -> synonymie -> Adj_Modifier_Xal,Nf_Body_X -> synonymie -> Adj_Modifier_Xal,Nf_Cognition_X -> synonymie -> Adj_Modifier_Xal,...,V_Situation_X -> synonymie -> Nfp_Situation_Xaille,V_Situation_X -> synonymie -> Nm_Situation_X,V_Situation_X -> synonymie -> Nm_Situation_Xage,V_Situation_X -> synonymie -> Nm_Situation_Xet,V_Situation_X -> synonymie -> Nm_Situation_Xing,V_Situation_X -> synonymie -> Nm_Situation_Xion,V_Situation_X -> synonymie -> Nm_Situation_Xis,V_Situation_X -> synonymie -> Nm_Situation_Xisme,V_Situation_X -> synonymie -> Nm_Situation_Xment,V_Situation_X -> synonymie -> Nm_Situation_Xon
F00000 abaissable_Adj.txt,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,True,False
F00001 abandon_Nm.txt,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
F00002 abat_Nm.txt,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,True,False
F00003 abcès_Nm.txt,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
F00004 abîme_Nm.txt,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F13174 volubilaire_Nf.txt,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
F13175 volvaire_Nf.txt,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
F13176 blessure_Nf.txt,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
F13177 Wikiversité_Nf.txt,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [3]:
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(lambda x: len(x))
frequent_itemsets['# of families'] = frequent_itemsets['support'].apply(lambda x: x*df.shape[0])
frequent_itemsets

Unnamed: 0,support,itemsets,length,# of families
0,0.038630,(Adj -> simple_X-Xité -> Nf),1,212.0
1,0.063958,(Adj -> simple_X-inX -> Adj),1,351.0
2,0.039541,(Nf -> simple_X-X -> V),1,217.0
3,0.051203,(Nf -> simple_X-Xaire -> Adj),1,281.0
4,0.026786,(Nf -> simple_X-Xaire -> Nm),1,147.0
...,...,...,...,...
420,0.011480,"(Adj -> simple_X-inX -> Adj, V -> simple_X-Xag...",5,63.0
421,0.011115,"(Nf -> simple_X-Xette -> Nf, V -> simple_X-Xag...",5,61.0
422,0.019315,"(V -> simple_X-Xage -> Nm, V -> simple_X-Xable...",5,106.0
423,0.015488,"(V -> simple_X-Xette -> Nf, V -> simple_X-Xage...",5,85.0


In [4]:
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.8)
#rules['antecedent length'] = rules['antecedents'].apply(lambda x: len(x))
#rules['consequent length'] = rules['consequents'].apply(lambda x: len(x))
rules.drop(columns=['antecedent support', 'consequent support', 'lift', 'leverage', 'conviction'], inplace=True)
rules

Unnamed: 0,antecedents,consequents,support,confidence
0,(Adj -> simple_X-Xité -> Nf),(V -> simple_X-Xable -> Adj),0.031159,0.806604
1,(Adj -> simple_X-inX -> Adj),(V -> simple_X-Xable -> Adj),0.062318,0.974359
2,(Nf -> simple_X-Xinette -> Nf),(Nf -> simple_X-Xette -> Nf),0.012937,0.825581
3,(Nf -> simple_X-Xier -> Adj),(Nf -> simple_X-Xier -> Nm),0.051749,0.962712
4,(V -> simple_X-Xif -> Adj),(Nf -> simple_X-Xif -> Adj),0.019497,0.906780
...,...,...,...,...
225,"(Adj -> simple_X-inX -> Adj, V -> simple_X-Xab...",(V -> simple_X-Xage -> Nm),0.011480,0.807692
226,"(Adj -> simple_X-inX -> Adj, V -> simple_X-Xag...","(V -> simple_X-Xeur -> Nm, V -> simple_X-Xable...",0.011480,1.000000
227,"(Adj -> simple_X-inX -> Adj, V -> simple_X-Xab...","(V -> simple_X-Xeur -> Nm, V -> simple_X-Xage ...",0.011480,0.807692
228,"(Adj -> simple_X-inX -> Adj, V -> simple_X-Xeu...","(V -> simple_X-Xage -> Nm, V -> simple_X-Xable...",0.011480,0.807692


In [5]:
families_ant_con = list()
families_ant = list()
for chosen_rule_id in rules.index:
    set_of_antecedents = rules.loc[chosen_rule_id, 'antecedents']
    set_of_consequents = rules.loc[chosen_rule_id, 'consequents']
    conditions_ant = list()
    conditions_ant_con = list()
    for ante in set_of_antecedents:
        conditions_ant.append(df[ante])
        conditions_ant_con.append(df[ante])
    for cons in set_of_consequents:
        conditions_ant.append(~df[cons])
        conditions_ant_con.append(df[cons])

    joined_ant = pd.Series([True] * df.shape[0], index=df.index)
    for c in conditions_ant:
        joined_ant &= c
    indices = df.loc[joined_ant].index
    ret_str = str(len(indices)) + ' families: '
    for ind in indices:
        ret_str += ind.split(' ')[0] + ' '
    families_ant.append(ret_str)

    joined_ant_con = pd.Series([True] * df.shape[0], index=df.index)
    for c in conditions_ant_con:
        joined_ant_con &= c
    indices = df.loc[joined_ant_con].index
    ret_str = str(len(indices)) + ' families: '
    for ind in indices:
        ret_str += ind.split(' ')[0] + ' '
    families_ant_con.append(ret_str)
rules['families with ante & cons'] = families_ant_con
rules['families with ante only'] = families_ant
rules

Unnamed: 0,antecedents,consequents,support,confidence,families with ante & cons,families with ante only
0,(Adj -> simple_X-Xité -> Nf),(V -> simple_X-Xable -> Adj),0.031159,0.806604,171 families: F00035_0 F00069_0 F00071_0 F0007...,41 families: F00069_1 F00240_1 F00402_1 F00411...
1,(Adj -> simple_X-inX -> Adj),(V -> simple_X-Xable -> Adj),0.062318,0.974359,342 families: F00004_0 F00014_0 F00015_0 F0002...,9 families: F01801 F01861_1 F01942_0 F01966_1 ...
2,(Nf -> simple_X-Xinette -> Nf),(Nf -> simple_X-Xette -> Nf),0.012937,0.825581,71 families: F00164 F00173 F00179_0 F00188_0 F...,15 families: F00193 F00346 F00453 F00455_0 F00...
3,(Nf -> simple_X-Xier -> Adj),(Nf -> simple_X-Xier -> Nm),0.051749,0.962712,284 families: F00000_0 F00004_0 F00015_0 F0001...,11 families: F00313 F00781 F00811 F01054 F0149...
4,(V -> simple_X-Xif -> Adj),(Nf -> simple_X-Xif -> Adj),0.019497,0.906780,107 families: F00026_0 F00091_0 F00101 F00103 ...,11 families: F00071_0 F00381_2 F00381_3 F00508...
...,...,...,...,...,...,...
225,"(Adj -> simple_X-inX -> Adj, V -> simple_X-Xab...",(V -> simple_X-Xage -> Nm),0.011480,0.807692,63 families: F00004_0 F00014_0 F00071_0 F00096...,15 families: F00102_0 F00302_0 F00501_0 F00506...
226,"(Adj -> simple_X-inX -> Adj, V -> simple_X-Xag...","(V -> simple_X-Xeur -> Nm, V -> simple_X-Xable...",0.011480,1.000000,63 families: F00004_0 F00014_0 F00071_0 F00096...,0 families:
227,"(Adj -> simple_X-inX -> Adj, V -> simple_X-Xab...","(V -> simple_X-Xeur -> Nm, V -> simple_X-Xage ...",0.011480,0.807692,63 families: F00004_0 F00014_0 F00071_0 F00096...,0 families:
228,"(Adj -> simple_X-inX -> Adj, V -> simple_X-Xeu...","(V -> simple_X-Xage -> Nm, V -> simple_X-Xable...",0.011480,0.807692,63 families: F00004_0 F00014_0 F00071_0 F00096...,0 families:


In [6]:
#rules.sort_values(by=['confidence'], ascending=False, inplace=True)
rules.to_excel('rules_double.xlsx')

