In [1]:
import codecs
import networkx as nx
import os
import pandas as pd
import re
from os import listdir
from os.path import isfile, join
from utils import printProgressBar



In [2]:
context_with_header = pd.read_csv('context_from_maxgraph.csv', header=0, index_col=0)
col_names = context_with_header.columns
family_ids = context_with_header.index

families = [f for f in listdir('families') if isfile(join('families', f))]
families_dict = dict()  # contains the representative word for a given family
for f in families:
    elements = f.replace('.txt', '').split()
    families_dict[elements[0]] = elements[1].split('_')[0]

# Get parents and children from a concept

In [4]:
full_dot = 'families_full_s20_l3.dot'
simplified_dot = 'families_simplified_s20_l3.dot'
directory = 'posets'
out_directory = 'posets_gSpan'
L1 = nx.DiGraph()
L2 = nx.DiGraph()
with codecs.open(join(directory, simplified_dot), 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:  # a line showing edges between concepts
            elements = line.split()
            L1.add_edge(elements[0], elements[2])   
            L2.add_edge(elements[2], elements[0])  
        elif 'shape' in line:  # a line describing a concept 
            L1.add_node(line.split()[0])
            L2.add_node(line.split()[0])
            
families = [f for f in listdir('families') if isfile(join('families', f))]
group_prev = ''
counter = 0
for file_name in families:
    #if file_name != 'F01317 abducteur.txt':
        #continue
    family_id = file_name.split()[0]
    group_id = family_id.split('-')[0]
    if group_id == group_prev:
        continue
    group_prev = group_id
    
    # find vertex that contains the intended family and its parents+children
    try:
        object_id = family_ids.get_loc(family_id)
    except KeyError:
        #  from fingerprint, family without direct relation
        continue
    with codecs.open(join(directory, simplified_dot), 'r', encoding='utf-8') as f:
        for line in f:
            if ('Object ' + str(object_id) + '\\n') in line:
                vertex = line.split()[0]
                break
    parents = nx.descendants(L1, vertex)
    children = nx.descendants(L2, vertex)
    selected_vertices = parents.union(children)
    selected_vertices.add(vertex)
    
    # find all vertex connected to the intended family, and write to dot
    out_file_name = 'poset_' + family_id + '_simplified' + '.dot'
    out_file = codecs.open(join(out_directory, out_file_name), 'w')
    with codecs.open(join(directory, simplified_dot), 'r', encoding='utf-8') as f:
        for line in f:
            if 'graph' in line or 'rankdir' in line or line == '}':
                out_file.write(line)
                continue
            if '->' in line:
                elements = line.split()
                v1 = elements[0]
                v2 = elements[2]
                if v1 in selected_vertices and v2 in selected_vertices:
                    out_file.write(line)
                continue
            vertex_id = line.split()[0]
            concept_id = re.search('<(.*)>', line).group(1)
            if vertex_id in selected_vertices:
                to_be_written = line.split('|')[0] + '|'
                attribute_string = line.split('|')[1]
                if 'Attribute' not in attribute_string: # empty intent
                    #to_be_written += '|'
                    pass
                else:
                    attributes = attribute_string.split('\\n')
                    for attribute in attributes:
                        if attribute == '':
                            continue
                        to_be_written += col_names[int(attribute.split()[1])] + '\\n'
                to_be_written += '|'
                object_string = line.split('|')[2]
                if 'Object' not in object_string: # empty extent
                    #to_be_written += '|'
                    pass
                else:
                    objects = object_string.split('\\n')
                    for obj in objects:
                        if obj == '' or '}' in obj:
                            continue
                        to_be_written += families_dict[family_ids[int(obj.split()[1])]] + '\\n'
                to_be_written += '}"];\n'
                to_be_written = re.sub('\(I.*\)\|', concept_id + '|', to_be_written)
                to_be_written = to_be_written.replace(',fillcolor=orange', '').replace(',fillcolor=lightblue', '')
                if vertex_id == vertex:
                    to_be_written = to_be_written.replace('style=filled', 'style=filled,fillcolor=orange')
                out_file.write(to_be_written)
    out_file.close()
    counter += 1
    printProgressBar(counter, 3903, prefix = 'Progress:', suffix = 'Complete', length = 50)

Progress: |████████████████████████████████████████████------| 89.9% Complete

In [None]:
# PDF generation
dot_dir = 'posets_all'
dot_files = [f for f in listdir(dot_dir) if isfile(join(dot_dir, f))]
for dot_file in dot_files:
    os.system('dot -Tpdf "' + join(dot_dir, dot_file) + '" -o "' + join(dot_dir, dot_file.replace('.dot', '.pdf')) + '"')

# Get common children from two concepts
### assumption: group id = column id

In [3]:
L = nx.DiGraph()
with codecs.open(join('posets', 'families_simplified_maxgraph.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if '->' in line:  # a line showing edges between concepts
            elements = line.split()
            L.add_edge(elements[2], elements[0])
        elif 'shape' in line:  # a line describing a concept 
            L.add_node(line.split()[0])

In [4]:
g1 = '3359'
g2 = '3902'
selected_pair = list()
with codecs.open(join('posets', 'families_simplified_maxgraph.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if ('Attribute ' + g1 + '\\') in line or ('Attribute ' + g2 + '\\') in line:
            selected_pair.append(line.split()[0])
            if len(selected_pair) == 2:
                break 
print(selected_pair)
children1 = nx.descendants(L, selected_pair[0])
children2 = nx.descendants(L, selected_pair[1])
common_children = children1.intersection(children2)
print(str(len(common_children)) + ' common superfamilies')
vertices_to_children = set()
for common_child in common_children:
    for v in selected_pair:
        for path in nx.all_simple_paths(L, v, common_child):
            for p in range(1, len(path)-1):
                vertices_to_children.add(path[p])

# find all vertex connected to the intended family, and write to dot
out_file_name = 'posets_' + g1 + '-' + g2 + '_simplified' + '.dot'
out_file = codecs.open(join('posets_common', out_file_name), 'w')
selected_vertices = set(selected_pair).union(common_children).union(vertices_to_children)
with codecs.open(join('posets', 'families_simplified_maxgraph.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if 'graph' in line or 'rankdir' in line or line == '}':
            out_file.write(line)
            continue
        if '->' in line:
            elements = line.split()
            v1 = elements[0]
            v2 = elements[2]
            if v1 in selected_vertices and v2 in selected_vertices:
                out_file.write(line)
            continue
        vertex_id = line.split()[0]
        concept_id = re.search('<(.*)>', line).group(1)
        if vertex_id in selected_vertices:
            to_be_written = line.split('|')[0] + '|'
            attribute_string = line.split('|')[1]
            if 'Attribute' not in attribute_string: # empty intent
                to_be_written += '|'
            else:
                attributes = attribute_string.split('\\n')
                for attribute in attributes:
                    if attribute == '':
                        continue
                    to_be_written += col_names[int(attribute.split()[1])] + '\\n'
            to_be_written += '|'
            object_string = line.split('|')[2]
            if 'Object' not in object_string: # empty extent
                to_be_written += '|'
            else:
                objects = object_string.split('\\n')
                for obj in objects:
                    if obj == '' or '}' in obj:
                        continue
                    to_be_written += families_dict[family_ids[int(obj.split()[1])]] + '\\n'
            to_be_written += '}"];\n'
            to_be_written = re.sub('\(I.*\)\|', concept_id + '|', to_be_written)
            to_be_written = to_be_written.replace(',fillcolor=orange', '')
            if vertex_id in common_children:
                to_be_written = to_be_written.replace('style=filled', 'style=filled,fillcolor=lightblue')
            if vertex_id in selected_pair:
                to_be_written = to_be_written.replace('style=filled', 'style=filled,fillcolor=orange')
            out_file.write(to_be_written)
out_file.close()

['22805895', '1129944640']
4 common superfamilies


# get children difference

In [3]:
g1 = 'G03220'
g2 = 'G03470'
attr1 = col_names.get_loc(g1)
attr2 = col_names.get_loc(g2)
with codecs.open(join('posets', 'families_full_maxgraph.dot'), 'r', encoding='utf-8') as f:
    for line in f:
        if ('Attribute ' + str(attr1) + '\\') in line:
            parent_vertex = line.split()[0]
            parent_object_string = line.split('|')[2]
            parent_objects = set(parent_object_string.split('\\n'))
        if ('Attribute ' + str(attr2) + '\\') in line:
            child_vertex = line.split()[0]
            child_object_string = line.split('|')[2]
            child_objects = set(child_object_string.split('\\n'))
object_diff = parent_objects - child_objects
print(len(object_diff))
for obj in object_diff:
    print(families_dict[family_ids[int(obj.split()[1])]])

13
vibromassage
radeau
broquillage
pavillon
viser
touraillage
orpailleur
mareyage
pastille
lamanage
sexage
courir
hortillonnage


# misc

In [None]:
# filter by family, full
family_id = 'F01942'
full_dot = 'families_full_s100_l3.dot'
simplified_dot = 'families_simplified_s100_l3.dot'
directory = 'posets'

family_number = int(family_id.strip('F'))
out_file_name = 'posets_' + family_id + '_full' + '.dot'
out_file = codecs.open(join(directory, out_file_name), 'w')

object_concept = ''
with codecs.open(join(directory, simplified_dot), 'r', encoding='utf-8') as f:
    for line in f:
        if ('Object ' + str(family_number)) in line:
            object_concept = line.split()[0]
            break
            
attributes_of_object_concept = list()
with codecs.open(join(directory, full_dot), 'r', encoding='utf-8') as f:
    for line in f:
        if object_concept in line:
            attributes_of_object_concept = line.split('|')[1].split('\\n')[0:-1]
            break
        
selected_concepts = list()    
with codecs.open(join(directory, full_dot), 'r', encoding='utf-8') as f:
    for line in f:
        if 'graph' in line or 'rankdir' in line or line == '}':
            out_file.write(line)
            continue
            
        if '->' in line:
            elements = line.split()
            conc1 = elements[0]
            conc2 = elements[2]
            if conc1 in selected_concepts and conc2 in selected_concepts:
                out_file.write(line)
            continue
                
        vertex_id = line.split()[0]
        concept_id = re.search('<(.*)>', line).group(1)
        if ('Object ' + str(family_number)) in line:
            selected_concepts.append(vertex_id)
            if 'E: 3920' in line: # assumption : number of families = 3920, top concept has empty intent
                elements = line.split('|')
                elements[2] = '||all families}"];\n'
                to_be_written = ''.join(elements)
                to_be_written = re.sub('\(.*\)', concept_id, to_be_written)
                out_file.write(to_be_written)
            else:
                to_be_written = line.replace('Attribute', 'Graph').replace('Object ', 'F')
                to_be_written = re.sub('\(.*\)', concept_id, to_be_written)
                out_file.write(to_be_written)
        else:
            selected = True
            for att in attributes_of_object_concept:
                if att not in line:
                    selected = False
                    break
            if selected:
                selected_concepts.append(vertex_id)
                to_be_written = line.replace('Attribute', 'Graph').replace('Object ', 'F')
                to_be_written = re.sub('\(.*\)', concept_id, to_be_written)
                out_file.write(to_be_written)
out_file.close()
os.system('dot -Tpdf ' + join(directory, out_file_name) + ' -o ' + join(directory, out_file_name).replace('.dot', '.pdf'))

In [None]:
# filter by family, minimal
family_id = 'F01942'
simplified_dot = 'families_simplified_s100_l3.dot'
full_dot = 'families_full_s100_l3.dot'
directory = 'posets'

family_number = int(family_id.strip('F'))
out_file_name = 'posets_' + family_id + '_minimal' + '.dot'
out_file = codecs.open(join(directory, out_file_name), 'w')

object_concept = ''
with codecs.open(join(directory, simplified_dot), 'r', encoding='utf-8') as f:
    for line in f:
        if ('Object ' + str(family_number)) in line:
            object_concept = line.split()[0]
            break
            
attributes_of_object_concept = list()
with codecs.open(join(directory, full_dot), 'r', encoding='utf-8') as f:
    for line in f:
        if object_concept in line:
            attributes_of_object_concept = line.split('|')[1].split('\\n')[0:-1]
            break
        
selected_concepts = list()    
with codecs.open(join(directory, full_dot), 'r', encoding='utf-8') as f:
    for line in f:
        if 'graph' in line or 'rankdir' in line or line == '}':
            out_file.write(line)
            continue
            
        if '->' in line:
            elements = line.split()
            conc1 = elements[0]
            conc2 = elements[2]
            if conc1 in selected_concepts and conc2 in selected_concepts:
                out_file.write(line)
            continue
                
        vertex_id = line.split()[0]
        concept_id = re.search('<(.*)>', line).group(1)
        if ('Object ' + str(family_number)) in line:
            selected_concepts.append(vertex_id)
            elements = line.split('|')
            elements[0] = elements[0].replace('> (', '> ' + concept_id + '|(').replace('I', 'Graphs').replace('E', 'Families')
            #to_be_written = '|'.join(elements) + '\n'
            out_file.write(elements[0] + '}"];\n')
        else:
            selected = True
            for att in attributes_of_object_concept:
                if att not in line:
                    selected = False
                    break
            if selected:
                selected_concepts.append(vertex_id)
                elements = line.split('|')
                elements[0] = elements[0].replace('> (', '> ' + concept_id + '|(').replace('I', 'Graphs').replace('E', 'Families')
                out_file.write(elements[0] + '}"];\n')
out_file.close()
os.system('dot -Tpdf ' + join(directory, out_file_name) + ' -o ' + join(directory, out_file_name).replace('.dot', '.pdf'))