In [1]:
import codecs
import glob
import networkx as nx
import pandas as pd
import pickle
from os import listdir
from os.path import isfile, join
from networkx.drawing.nx_pydot import write_dot
from utils import printProgressBar

# Demonette column numbers (0-based positions in a tab-split row)
graph_1, graph_2 = 3, 6          # written form of each member of the pair
cat_1, cat_2 = 8, 10             # category tag of each member
cstr_1, cstr_2 = 14, 17          # construction pattern of each member
complexite, orientation = 19, 21
fichier_origine = 43

In [None]:
def category_shortening(cat):
    """Collapse a noun category tag to its coarse form.

    Any tag starting with ``'Np'`` becomes ``'Np'`` (proper noun) and any
    other tag starting with ``'N'`` becomes ``'N'`` (noun), except the exact
    tag ``'Num'``, which is returned untouched like every non-``N`` tag.

    Args:
        cat: category tag as a string; may be empty or a single character.

    Returns:
        ``'Np'``, ``'N'`` or ``cat`` itself.
    """
    # startswith() is used instead of indexing so that '' and 'N' do not
    # raise IndexError.
    if cat == 'Num' or not cat.startswith('N'):
        return cat
    if cat.startswith('Np'):  # proper noun
        return 'Np'
    return 'N'  # common noun

# create binary files for graphs (one pickle per family)

In [None]:
input_dir = 'D-families'
input_files = [f for f in listdir(input_dir) if isfile(join(input_dir, f))]
output_dir = 'D-graph-binary'

# Build one directed graph per family file and pickle it under the family id.
# Nodes are "<form>_<category>" strings carrying a shortened category label;
# edges carry the pair of construction patterns, ordered along the edge.
for input_file in input_files:
    fam_id = input_file.split()[0]  # text before the first whitespace of the file name
    group_id = fam_id.split('-')[0]  # not used in this cell
    H = nx.DiGraph()
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            if line_num < 2:
                continue  # the first two lines are skipped (presumably headers -- TODO confirm)
            cleaned = line.replace('\n', '').replace(' ', '')
            if not cleaned.strip():
                continue  # blank line: indexing its columns would raise IndexError
            elements = cleaned.split('\t')
            va = elements[graph_1] + '_' + elements[cat_1]
            vb = elements[graph_2] + '_' + elements[cat_2]
            # Only the first relation seen between two nodes is kept.
            if H.has_edge(va, vb) or H.has_edge(vb, va):
                continue
            H.add_node(va, label=category_shortening(elements[cat_1]))
            H.add_node(vb, label=category_shortening(elements[cat_2]))
            direction = elements[orientation]
            pattern_ab = elements[cstr_1] + '-' + elements[cstr_2]
            pattern_ba = elements[cstr_2] + '-' + elements[cstr_1]
            if direction in ('as2de', 'as2des'):
                H.add_edge(va, vb, label=pattern_ab)
            elif direction in ('de2as', 'des2as'):
                H.add_edge(vb, va, label=pattern_ba)
            else:
                # Any other orientation value: store the relation both ways and
                # keep the orientation value as a suffix of the edge label.
                H.add_edge(va, vb, label=pattern_ab + '_' + direction)
                H.add_edge(vb, va, label=pattern_ba + '_' + direction)
    # The context manager closes the file even when pickling fails.
    with open(join(output_dir, fam_id), 'wb') as graph_file:
        pickle.dump(H, graph_file)
    print(fam_id, end='\r')

#  create fully-oriented graphs

In [None]:
# Names of the regular files sitting directly inside the pickled-graph directory.
input_dir = 'D-graph-binary'
input_files = list(filter(lambda name: isfile(join(input_dir, name)), listdir(input_dir)))

In [None]:
# Count, over every pickled graph, how often each directed edge type
# "<source label>><pattern>><target label>" occurs. Edges whose label contains
# 'indirect' or 'NA' (substring test) are left out of this count.
directed_edge_count = dict()
for graph in input_files:
    # NOTE: pickle.load can execute arbitrary code; only load files produced
    # by this notebook.
    with open(join(input_dir, graph), 'rb') as graph_file:
        G = pickle.load(graph_file)
    for src, dst, attrs in G.edges(data=True):
        pattern = attrs['label'].split('$')[0]
        if 'indirect' in pattern or 'NA' in pattern:
            continue
        label = G.nodes[src]['label'] + '>' + pattern + '>' + G.nodes[dst]['label']
        directed_edge_count[label] = directed_edge_count.get(label, 0) + 1
    print(graph, end='\r')

In [None]:
output_dir = 'D-graph-oriented'
original_graph_viz_dir = 'D-graph'
new_directed_edge_count = directed_edge_count.copy()  # taking into account directed edges that are originally non-directed

# For every pickled graph, build a fully oriented copy: edges that already have
# a direction are kept, and each 'indirect'/'NA' pair is oriented toward the
# direction whose edge type is more frequent in directed_edge_count (ties are
# broken by comparing the two pattern strings). The result is written as a dot
# file named after the matching file found in original_graph_viz_dir.
for graph in input_files:
    L = nx.DiGraph()
    # NOTE: pickle.load can execute arbitrary code; only load files produced
    # by this notebook.
    with open(join(input_dir, graph), 'rb') as graph_file:
        G = pickle.load(graph_file)
    for node, node_attrs in G.nodes(data=True):
        L.add_node(node, label=node.split('_')[0] + '\n' + node_attrs['label'])
    for src, dst, attrs in G.edges(data=True):
        raw_label = attrs['label'].split('$')[0]
        if 'indirect' not in raw_label and 'NA' not in raw_label:
            # Already directed: copy as is.
            L.add_edge(src, dst, label=raw_label)
            continue
        if L.has_edge(dst, src) or L.has_edge(src, dst):
            continue  # the pair was already oriented from its mirror edge
        patterns = raw_label.replace('_indirect', '').replace('_NA', '').split('-')
        label1 = G.nodes[src]['label'] + '>' + patterns[0] + '-' + patterns[1] + '>' + G.nodes[dst]['label']
        label2 = G.nodes[dst]['label'] + '>' + patterns[1] + '-' + patterns[0] + '>' + G.nodes[src]['label']
        count1 = directed_edge_count.get(label1, 0)
        count2 = directed_edge_count.get(label2, 0)
        if count1 > count2 or (count1 == count2 and patterns[0] < patterns[1]):
            L.add_edge(src, dst, label=patterns[0] + '-' + patterns[1])
            chosen = label1
        else:
            L.add_edge(dst, src, label=patterns[1] + '-' + patterns[0])
            chosen = label2
        new_directed_edge_count[chosen] = new_directed_edge_count.get(chosen, 0) + 1
    matches = glob.glob(join(original_graph_viz_dir, graph + '*'))
    if not matches:
        raise FileNotFoundError(
            'no file starting with ' + repr(graph) + ' in ' + repr(original_graph_viz_dir))
    filename = matches[0]
    # Swap only the leading directory; a later occurrence of the directory
    # name inside the file name must stay untouched.
    write_dot(L, filename.replace(original_graph_viz_dir, output_dir, 1))
    print(graph, end='\r')

In [None]:
# Persist both counters as "<label>,<count>" lines, one file per counter.
# NOTE: no encoding is passed to codecs.open, so the platform's default text
# encoding is used; whatever reads these files back has to decode accordingly.
# The context managers close each file even if a write fails.
with codecs.open('D_edge_count_directed.txt', 'w') as out_file:
    for c in directed_edge_count:
        out_file.write(c + ',' + str(directed_edge_count[c]) + '\n')
with codecs.open('D_edge_count_all.txt', 'w') as out_file:
    for c in new_directed_edge_count:
        out_file.write(c + ',' + str(new_directed_edge_count[c]) + '\n')

# generate branchings

In [2]:
# Names of the regular files sitting directly inside the pickled-graph directory.
input_dir = 'D-graph-binary'
input_files = list(filter(lambda name: isfile(join(input_dir, name)), listdir(input_dir)))

In [3]:
def _read_edge_counts(path):
    """Load a ``<label>,<count>`` file into a dict mapping label to int count.

    The count files are written with the platform's default text encoding, so
    the bytes are decoded as UTF-8 first, falling back to latin-1 only when
    they are not valid UTF-8. Decoding UTF-8 bytes as latin-1 would silently
    turn accented labels into different strings that no later lookup matches.

    Args:
        path: path of the count file to read.

    Returns:
        dict mapping each label to its integer count.

    Raises:
        ValueError: if a non-blank line has no comma or a non-integer count.
    """
    with open(path, 'rb') as f:
        raw = f.read()
    try:
        text = raw.decode('utf-8')
    except UnicodeDecodeError:
        text = raw.decode('latin-1')
    counts = dict()
    for line in text.split('\n'):
        if not line.strip():
            continue  # blank or trailing empty line
        # Split on the LAST comma so a label containing commas stays intact.
        label, count = line.rsplit(',', 1)
        counts[label] = int(count)
    return counts


directed_edge_count = _read_edge_counts('D_edge_count_directed.txt')
new_directed_edge_count = _read_edge_counts('D_edge_count_all.txt')

## visualization (dot files) and binary export

In [9]:
original_graph_viz_dir = 'D-graph'
output_dir = 'D-branching'
output_binary_dir = 'D-branching-binary'
counter = -1  # not used in this cell

# For every pickled graph: rebuild an oriented, weighted copy (the weight is
# the corpus-wide count of the edge type in new_directed_edge_count), extract
# an optimum branching with Edmonds' algorithm, then write it both as a dot
# file and as a pickle.
for graph in input_files:
    L = nx.DiGraph()
    # NOTE: pickle.load can execute arbitrary code; only load files produced
    # by this notebook.
    with open(join(input_dir, graph), 'rb') as graph_file:
        G = pickle.load(graph_file)
    for node, node_attrs in G.nodes(data=True):
        L.add_node(node, label=node.split('_')[0] + '\n' + node_attrs['label'])
    for src, dst, attrs in G.edges(data=True):
        raw_label = attrs['label'].split('$')[0]
        if 'indirect' not in raw_label and 'NA' not in raw_label:
            # Already directed: keep the direction, weight by type frequency.
            edge_type = G.nodes[src]['label'] + '>' + raw_label + '>' + G.nodes[dst]['label']
            w = new_directed_edge_count.get(edge_type, 0)
            L.add_edge(src, dst, weight=w, label=raw_label)
            continue
        if L.has_edge(dst, src) or L.has_edge(src, dst):
            continue  # the pair was already oriented from its mirror edge
        patterns = raw_label.replace('_indirect', '').replace('_NA', '').split('-')
        label1 = G.nodes[src]['label'] + '>' + patterns[0] + '-' + patterns[1] + '>' + G.nodes[dst]['label']
        label2 = G.nodes[dst]['label'] + '>' + patterns[1] + '-' + patterns[0] + '>' + G.nodes[src]['label']
        count1 = new_directed_edge_count.get(label1, 0)
        count2 = new_directed_edge_count.get(label2, 0)
        # The more frequent direction wins; ties are broken by comparing the
        # two pattern strings.
        if count1 > count2 or (count1 == count2 and patterns[0] < patterns[1]):
            L.add_edge(src, dst, weight=count1, label=patterns[0] + '-' + patterns[1])
        else:
            L.add_edge(dst, src, weight=count2, label=patterns[1] + '-' + patterns[0])
    # NOTE(review): recent NetworkX releases appear to deprecate the Edmonds
    # class in favour of nx.maximum_branching -- confirm against the installed
    # version before upgrading.
    edmonds = nx.algorithms.tree.branchings.Edmonds(L)
    B = edmonds.find_optimum(preserve_attrs=True)
    # Display labels (form + category) for the dot rendering.
    for node, node_attrs in B.nodes(data=True):
        node_attrs['label'] = L.nodes[node]['label']
    matches = glob.glob(join(original_graph_viz_dir, graph + '*'))
    if not matches:
        raise FileNotFoundError(
            'no file starting with ' + repr(graph) + ' in ' + repr(original_graph_viz_dir))
    filename = matches[0]
    # Swap only the leading directory; a later occurrence of the directory
    # name inside the file name must stay untouched.
    write_dot(B, filename.replace(original_graph_viz_dir, output_dir, 1))
    # Restore the plain category labels before pickling.
    for node, node_attrs in B.nodes(data=True):
        node_attrs['label'] = G.nodes[node]['label']
    with open(join(output_binary_dir, graph), 'wb') as graph_file:
        pickle.dump(B, graph_file)
    print(graph, end='\r')

F06656-5283