In [1]:
import codecs
import os
import pickle
from os import listdir
from os.path import isfile, join

import networkx as nx

In [2]:
# Undirected graph that the following cells fill with one node per lexeme
# identifier and one edge per relation row read from the input files.
G = nx.Graph()

In [3]:
# Directory holding the tab-separated relation files.
input_dir = 'pairs'
# os.listdir() returns entries in arbitrary order. Sort them (case-insensitively)
# so that every later cell walks the files in the same order on every run:
# the order decides which file's header is kept and the order in which edges
# are inserted into the graph.
input_files = sorted(
    (f for f in listdir(input_dir) if isfile(join(input_dir, f))),
    key=lambda name: name.lower()
)
input_files

['converts-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv',
 'demonette1-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv',
 'denomXaire-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv',
 'denomXal-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv',
 'denomXique-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv',
 'derifde1X-formate-V4-nosem-utf8-lid-newtagset-entete-new-rid.csv',
 'derifXable-formate-V4-nosem-utf8-lid-newtagset-entete-new-rid.csv',
 'dimocXaie-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv',
 'dimocXat-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv',
 'dimocXet-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv',
 'dimocXier-formate-V4-avecsem-utf8-lid-newtagset-entete-new-rid.csv']

In [4]:
# Read every input file and add one edge per relation row to G.
# A node identifier is built as "<column 3>_<column 8>" for the first member of
# the pair and "<column 6>_<column 10>" for the second one
# (presumably lexeme form + category -- TODO confirm against the file header).
#
# NOTE(review): `header` is overwritten for each file, so the value left after
# the loop is the first line of the LAST file processed -- confirm that all
# input files share the same header.
# NOTE(review): the second line of each file (line_num == 1) is skipped
# silently -- presumably a secondary header line; confirm.
header = ''
for input_file in input_files:
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            # Drop the line terminator, including the '\r' of CRLF files, so it
            # cannot leak into the header or into the last column.
            line = line.rstrip('\r\n')
            if line_num == 0:
                header = line
            elif line_num >= 2 and line:  # ignore empty lines instead of failing on them
                line_elements = line.split('\t')
                G.add_edge(line_elements[3] + '_' + line_elements[8], line_elements[6] + '_' + line_elements[10])

In [5]:
# Every connected component of G is one lexeme family (a set of node identifiers).
lexeme_families = [component for component in nx.connected_components(G)]
# Display the number of families found.
len(lexeme_families)

8141

In [9]:
# For every lexeme family:
#   * write one row to the tab-separated summary file,
#   * create (or reset) the family's output file containing only the header,
#   * remember in `lexeme_dict` which file each lexeme of the family belongs to.
output_folder = 'families'
# Create the output directory on a fresh checkout instead of failing on open().
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

lexeme_dict = dict()
with codecs.open('summary_of_families_with_sem.txt', 'w+', encoding='utf-8') as f_summary:
    f_summary.write('family_id\tnumber_of_lexemes\tnumber_of_relations\tlexemes\n')
    for family_id, lexeme_family in enumerate(lexeme_families):
        family_label = 'F' + str(family_id).rjust(5, '0')
        first_member = min(lexeme_family)  # first alphabetically
        H = nx.subgraph(G, lexeme_family)
        f_summary.write(family_label + '\t' + str(len(lexeme_family)) + '\t' + str(H.number_of_edges()) + '\t' + str(lexeme_family) + '\n')

        # The file name depends only on the family, so the file is created
        # once per family rather than once per lexeme.
        filename = family_label + ' ' + first_member + '.txt'
        with codecs.open(join(output_folder, filename), 'w+', encoding='utf-8') as f_out:
            f_out.write(header + '\n\n')

        for lexeme in lexeme_family:
            lexeme_dict[lexeme] = filename

In [10]:
# Second pass over the input files: append every relation row to the output
# file of the family that its first lexeme ("<column 3>_<column 8>") belongs to.
# The files are opened in append mode, so re-running this cell on its own
# duplicates the rows; re-run the cell that creates the family files first.
for input_file in input_files:
    with codecs.open(join(input_dir, input_file), 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            # Skip the two leading non-data lines and any empty line.
            if line_num < 2 or not line.rstrip('\r\n'):
                continue
            line_elements = line.split('\t')
            lexeme1 = line_elements[3] + '_' + line_elements[8]
            output_filename = lexeme_dict[lexeme1]
            # The last row of a file may lack a line terminator; add one so the
            # next row appended to the same family file starts on its own line.
            if not line.endswith('\n'):
                line += '\n'
            with codecs.open(join(output_folder, output_filename), 'a+', encoding='utf-8') as f_out:
                f_out.write(line)

In [None]:
# Persist the lexeme graph and the lexeme -> family-file mapping.
# Each file is opened right before it is written and closed by the `with`
# block even if pickling fails.
# NOTE: pickle files must only be loaded back from a trusted location.
with open('G.p', 'wb') as graph_file:
    pickle.dump(G, graph_file)
with open('lexeme_dict.p', 'wb') as dict_file:
    pickle.dump(lexeme_dict, dict_file)