In [1]:
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import os
import pickle

In [2]:
# Move to the parent of the directory the kernel started in.
# The start directory is remembered on first execution so that re-running
# this cell does not climb one more level each time.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [3]:
# Per-dataset inputs; the three lists are aligned by position.
filenames_graph_full = [
    'data_arab/graph_collapse.txt',
    'data_juncea/graph_collapse_b_juncea_full.txt',
    'data_nigra/graph_collapse_b_nigra.txt',
    'data_rapa/graph_collapse_b_rapa.txt',
]
filenames_graph_node_traits = [
    'data_arab/graph_collapse_node_traits_2.txt',
    'data_juncea/graph_collapse_node_traits_b_juncea_full.txt',
    'data_nigra/graph_collapse_node_traits_b_nigra.txt',
    'data_rapa/graph_collapse_node_traits_b_rapa.txt',
]
names = [
    'arab',
    'juncea',
    'nigra',
    'rapa',
]

In [4]:
def load_info(filename_graph, filename_node_traits, name):
    """Build a graph from an edge list, read node traits, and cache both to disk.

    Parameters
    ----------
    filename_graph : str
        Path to a whitespace-separated edge list, one ``node1 node2`` pair
        per line. Blank lines are skipped.
    filename_node_traits : str
        Path to a tab-separated table containing the columns ``node``,
        ``fam`` and ``cnt``.
    name : str
        Dataset label; the cache file is ``data/data_{name}.pkl``.

    Returns
    -------
    tuple
        ``(G_largest_component, G_small_components, node_family_dict,
        node_freq_dict, unique_fams)`` where

        * ``G_largest_component`` is the subgraph of the largest connected
          component (as returned by ``G.subgraph``),
        * ``G_small_components`` is a copy of the graph with that component
          removed,
        * ``node_family_dict`` maps ``node`` -> ``fam`` (with ``LTR/Copia``
          and ``LTR/Gypsy`` merged into ``LTR``),
        * ``node_freq_dict`` maps ``node`` -> ``cnt``,
        * ``unique_fams`` is the set of family labels.

    Raises
    ------
    ValueError
        If the edge list contains no edges, or a non-blank line does not
        consist of exactly two fields.
    """
    G = nx.Graph()

    # Load the graph
    with open(filename_graph, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # A blank line (e.g. trailing newline) carries no edge
                continue
            node1, node2 = fields
            G.add_edge(node1, node2)

    # Load the per-node information
    df = pd.read_csv(filename_node_traits, delimiter='\t')
    df['fam'] = df['fam'].replace(['LTR/Copia', 'LTR/Gypsy'], 'LTR')

    # NOTE(review): graph nodes are strings (from str.split), while pandas
    # infers the dtype of the 'node' column -- confirm the dict keys below
    # actually match the graph's node ids.
    traits_by_node = df.set_index('node')
    node_family_dict = traits_by_node['fam'].to_dict()
    node_freq_dict = traits_by_node['cnt'].to_dict()
    unique_fams = set(node_family_dict.values())

    # Find the largest connected component
    largest_component = max(nx.connected_components(G), key=len, default=None)
    if largest_component is None:
        raise ValueError(f"No edges found in {filename_graph!r}")

    # Build the two subgraphs: the largest component, and everything else
    G_largest_component = G.subgraph(largest_component)
    G_small_components = G.copy()
    G_small_components.remove_nodes_from(largest_component)

    # Create the output directory if it does not exist yet
    output_dir = 'data'
    os.makedirs(output_dir, exist_ok=True)

    # Save the data to a pickle file. An existing file is left untouched, so
    # delete it manually to refresh the cache after the inputs change.
    data_filename = os.path.join(output_dir, f'data_{name}.pkl')
    if not os.path.exists(data_filename):
        with open(data_filename, 'wb') as f:
            pickle.dump({
                'graph': G,
                'largest_component': G_largest_component,
                'small_components': G_small_components,
                'node_family_dict': node_family_dict,
                'node_freq_dict': node_freq_dict,
                'unique_fams': unique_fams
            }, f)

    return G_largest_component, G_small_components, node_family_dict, node_freq_dict, unique_fams


In [5]:
# Process every dataset in turn; the variables unpacked below keep the
# results of the most recently processed dataset.
for name, filename_graph, filename_node_traits in zip(
    names, filenames_graph_full, filenames_graph_node_traits
):
    (
        G_largest_component,
        G_small_components,
        node_family_dict,
        node_freq_dict,
        unique_fams,
    ) = load_info(filename_graph, filename_node_traits, name)
    print(f"Finished processing {name}")

Finished processing arab
Finished processing juncea
Finished processing nigra
Finished processing rapa
