In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell.
%autoreload 2

The objective here is to fit the logit graph model to graphs from multiple domains and then visualize the original vs. fitted graphs.

In [2]:

import sys
import os

#Graph imports
sys.path.append('../..')
import src.logit_graph.graph as graph
import src.logit_graph.logit_estimator as estimator
import src.logit_graph.utils as utils
import src.logit_graph.model_selection as model_selection
import src.logit_graph.gic as gic
import src.logit_graph.param_estimator as pe
import src.logit_graph.graph as graph
import src.logit_graph.model_selection as ms

from src.logit_graph.simulation import LogitGraphFitter, GraphModelComparator

# usual imports
import matplotlib.pyplot as plt
import pickle
import math
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import gaussian_kde
import numpy as np
import pandas as pd
import seaborn as sns
import gc
import random
import networkx as nx
from numpy import errstate

from IPython.display import display
from pyvis.network import Network
from mpl_toolkits.axes_grid1 import make_axes_locatable


In [3]:
# Root folder of the Twitch dataset, relative to this notebook's location.
data_directory = '../../data/twitch/'

# Display the folder contents to see which country sub-folders are available.
os.listdir(data_directory)

['RU',
 'ENGB',
 'citing.txt',
 'PTBR',
 'DE',
 'README.txt',
 'FR',
 'ES',
 'graphs_processed']

In [4]:
# Load the graph data from each country's edge list
import networkx as nx
import pandas as pd

# Edge lists keyed by country code (the sub-folder name): 'RU' -> [(from, to), ...]
edges_data = {}

# Each country sub-folder is expected to hold a 'musae_<COUNTRY>_edges.csv' file.
for folder in os.listdir(data_directory):
    # Loose files such as README.txt / citing.txt are not country folders.
    if not os.path.isdir(os.path.join(data_directory, folder)):
        continue

    edges_path = os.path.join(data_directory, folder, f'musae_{folder}_edges.csv')
    # Folders without a matching CSV (e.g. 'graphs_processed') are skipped
    # *before* printing, so the log only lists files that are actually loaded.
    if not os.path.exists(edges_path):
        continue
    print(edges_path)

    # The CSV has one edge per row in the 'from' and 'to' columns.
    edges_df = pd.read_csv(edges_path)
    edges = list(zip(edges_df['from'], edges_df['to']))
    edges_data[folder] = edges


../../data/twitch/RU/musae_RU_edges.csv
../../data/twitch/ENGB/musae_ENGB_edges.csv
../../data/twitch/PTBR/musae_PTBR_edges.csv
../../data/twitch/DE/musae_DE_edges.csv
../../data/twitch/FR/musae_FR_edges.csv
../../data/twitch/ES/musae_ES_edges.csv
../../data/twitch/graphs_processed/musae_graphs_processed_edges.csv


In [5]:
# Build one undirected NetworkX graph per country from its edge list.
graph_data = {}
for country, edges in edges_data.items():
    # from_edgelist creates a fresh nx.Graph and adds every (u, v) pair to it.
    G = nx.from_edgelist(edges)
    graph_data[country] = G
    print(f'Number of nodes: {G.number_of_nodes()}, Number of edges: {G.number_of_edges()} , Country: {country}')

# Sanity check: show the summary line of the first graph that was built.
first_graph = next(iter(graph_data.values()))
print(first_graph)


Number of nodes: 4385, Number of edges: 37304 , Country: RU
Number of nodes: 7126, Number of edges: 35324 , Country: ENGB
Number of nodes: 1912, Number of edges: 31299 , Country: PTBR
Number of nodes: 9498, Number of edges: 153138 , Country: DE
Number of nodes: 6549, Number of edges: 112666 , Country: FR
Number of nodes: 4648, Number of edges: 59382 , Country: ES
Graph with 4385 nodes and 37304 edges


In [6]:
# Persist every country graph as a plain-text edge list ("u v" per line)
# under <data_directory>/graphs_processed/<COUNTRY>_graph.edges.
graphs_processed_dir = os.path.join(data_directory, 'graphs_processed')
os.makedirs(graphs_processed_dir, exist_ok=True)
for country, G in graph_data.items():
    output_path = os.path.join(graphs_processed_dir, f'{country}_graph.edges')
    with open(output_path, 'w') as f:
        # Stream the edges straight to disk, one line per edge.
        f.writelines(f'{edge[0]} {edge[1]}\n' for edge in G.edges())

    print(f'Saved graph for {country} to {output_path}')


Saved graph for RU to ../../data/twitch/graphs_processed/RU_graph.edges
Saved graph for ENGB to ../../data/twitch/graphs_processed/ENGB_graph.edges
Saved graph for PTBR to ../../data/twitch/graphs_processed/PTBR_graph.edges
Saved graph for DE to ../../data/twitch/graphs_processed/DE_graph.edges
Saved graph for FR to ../../data/twitch/graphs_processed/FR_graph.edges
Saved graph for ES to ../../data/twitch/graphs_processed/ES_graph.edges


### Fitting a single graph

Loading the different graphs into a networkx graph

In [None]:
# Fit a single Logit Graph model (one fixed `d`) to one of the loaded graphs.
# NOTE(review): `graphs` and `graph_filepaths` are not defined in any cell
# visible above this one — confirm where they are loaded, otherwise this cell
# fails on a fresh kernel.
if not graphs:
    # Raise instead of calling exit(): inside a Jupyter kernel exit() requests a
    # kernel shutdown (discarding all notebook state) rather than simply
    # stopping this cell.
    raise RuntimeError("No graphs were loaded; cannot run the single-graph fit.")

# Use the graph at index 1 (i.e. the second loaded graph) and its source path.
target_graph = graphs[1]
target_filepath = graph_filepaths[target_graph]

# For a single fit, you need to specify a single `d`
fitter_params = {
    "d": 0,
    "n_iteration": 8000,
    "patience": 10,
    "er_p": 0.05,
    "edge_delta": 50,
    "min_gic_threshold": 0.1,
    "verbose": True
}

# Initialize the fitter with the parameters and run the fit on the target graph.
lg_fitter = LogitGraphFitter(**fitter_params)
lg_fitter.fit(target_graph)


In [None]:
# Plot the GIC values recorded in the fitter's metadata.
# TODO (from the original note, wording presumed): also log the spectrum
# difference and the edge counts alongside the GIC.
gic_trace = lg_fitter.metadata['gic_values']
plt.plot(gic_trace)

In [None]:
def compare_graphs_visually(original_graph, fitted_graph, metadata=None, figsize=(16, 8), node_size=50, edge_width=0.6):
    """
    Generates a side-by-side plot of the original and fitted graphs.

    The right-hand panel shows "Fitting Failed" when ``fitted_graph`` is None,
    or when ``metadata`` is supplied and its ``'fit_success'`` entry is missing
    or falsy. Omitting ``metadata`` does not by itself count as a failure.

    Args:
        original_graph (nx.Graph): The original graph.
        fitted_graph (nx.Graph): The fitted graph (can be None if fitting failed).
        metadata (dict, optional): Fitting metadata. ``'fit_success'`` decides
            whether the fit is drawn; ``'gic_value'`` (if present) is shown in
            the title, otherwise "N/A" is shown.
        figsize (tuple): The size of the matplotlib figure.
        node_size (int): The size of the nodes in the plot.
        edge_width (float): The width of the edges in the plot.

    Returns:
        matplotlib.figure.Figure: The figure object for the plot.
    """
    fig, axes = plt.subplots(1, 2, figsize=figsize)

    # Shared colour scheme for both panels.
    original_node_color = '#4472C4'
    fitted_node_color = '#E74C3C'
    edge_color = '#8B8B8B'

    # 1. Plot Original Graph (seeded spring layout so the picture is reproducible)
    ax_orig = axes[0]
    pos_orig = nx.spring_layout(original_graph, k=1, iterations=50, seed=42)
    stats_orig = f"{original_graph.number_of_nodes()} nodes, {original_graph.number_of_edges()} edges"
    nx.draw_networkx_edges(original_graph, pos_orig, ax=ax_orig, edge_color=edge_color, width=edge_width, alpha=0.7)
    nx.draw_networkx_nodes(original_graph, pos_orig, ax=ax_orig, node_color=original_node_color, node_size=node_size, alpha=0.9)
    ax_orig.set_title(f'Original Graph\n({stats_orig})', fontweight='bold', pad=10)
    ax_orig.axis('off')

    # 2. Plot Fitted Graph
    ax_fitted = axes[1]
    title = 'Fitted Logit Graph'

    # Compare against None explicitly: a NetworkX graph's truthiness is its
    # node count, so `if fitted_graph` would misreport an empty graph as a
    # failed fit. Metadata only signals failure when it is actually provided.
    fit_failed = fitted_graph is None or (
        metadata is not None and not metadata.get('fit_success')
    )

    if fit_failed:
        title = f'{title}\n(Fitting Failed)'
        ax_fitted.text(0.5, 0.5, 'Fitting Failed', horizontalalignment='center', verticalalignment='center', transform=ax_fitted.transAxes, fontsize=14, color='red')
    else:
        stats_fitted = f"{fitted_graph.number_of_nodes()} nodes, {fitted_graph.number_of_edges()} edges"
        gic_val = (metadata or {}).get('gic_value', 'N/A')
        # Only numeric values can take the fixed-point format.
        gic_text = f"GIC: {gic_val:.4f}" if isinstance(gic_val, (int, float)) else f"GIC: {gic_val}"
        title = f'{title}\n({stats_fitted})\n{gic_text}'

        # The fitted graph gets its own layout: its node set need not match
        # the original's, so the original positions cannot be reused.
        pos_fitted = nx.spring_layout(fitted_graph, k=1, iterations=50, seed=42)

        nx.draw_networkx_edges(fitted_graph, pos_fitted, ax=ax_fitted, edge_color=edge_color, width=edge_width, alpha=0.7)
        nx.draw_networkx_nodes(fitted_graph, pos_fitted, ax=ax_fitted, node_color=fitted_node_color, node_size=node_size, alpha=0.9)

    ax_fitted.set_title(title, fontweight='bold', pad=10)
    ax_fitted.axis('off')

    fig.suptitle('Original vs. Fitted Graph Comparison', fontsize=16, fontweight='bold')
    # Lay out this figure explicitly (not whichever figure is "current"),
    # leaving the top 5% free for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.95])

    return fig


# 5. Visualize the results
# Bail out early when the fit did not succeed; otherwise draw the comparison.
if not lg_fitter.metadata['fit_success']:
    print("Fitting failed. No visualization available.")
else:
    comparison_figure = compare_graphs_visually(
        target_graph,
        lg_fitter.fitted_graph,
        lg_fitter.metadata,
    )
    plt.show()


In [None]:
# Compare the Logit Graph model against other random-graph models on one graph.
# NOTE(review): `graphs` and `graph_filepaths` are not defined in any cell
# visible above this one — confirm where they are loaded, otherwise this cell
# fails on a fresh kernel.
if not graphs:
    # Raise instead of calling exit(): inside a Jupyter kernel exit() requests a
    # kernel shutdown (discarding all notebook state) rather than simply
    # stopping this cell.
    raise RuntimeError("No graphs were loaded; cannot run the model comparison.")

# Use the graph at index 2 (i.e. the third loaded graph) and its source path.
target_graph = graphs[2]
target_filepath = graph_filepaths[target_graph]

# Unlike the single fit, the comparator takes a list of `d` values (d_list)
# plus the settings for the Logit Graph fit and for the other models.
sim_params = {
    "d_list": [0, 1, 2],
    "lg_params": {
        "max_iterations": 8000,
        "patience": 10,
        "gic_dist_type": 'KL',
        "er_p": 0.05,
        "min_gic_threshold": 0.1,
        "edge_delta": 50,
        "verbose": True,
    },
    "other_model_n_runs": 5,
    "other_model_params": [
        {'lo': 0.01, 'hi': 0.2},  # ER (probability)
        {'k': {'lo': 2, 'hi': 3, 'step': 1}, 'p': {'lo': 0.01, 'hi': 0.2}},  # WS
        {'lo': 1, 'hi': 3},      # GRG (alpha)
        {'lo': 3, 'hi': 5},     # BA (m)
    ],
    "verbose": True,
}

# Initialize the comparator with the parameters and run it on the target graph.
comparator = GraphModelComparator(**sim_params)
comparator.compare(target_graph, target_filepath)


In [None]:
# Display the comparator's summary table (presumably a DataFrame with one row
# per fitted model — confirm against GraphModelComparator).
comparator.summary_df

### Comparison with other models

Comparing multiple models

In [7]:
# Collect the processed edge-list files and order them from the smallest to the
# largest graph (by node count).
processed_dir = os.path.join(data_directory, 'graphs_processed')
graph_files = [os.path.join(processed_dir, f) for f in os.listdir(processed_dir)]

graph_files_with_sizes = []
for graph_file in graph_files:
    try:
        g = nx.read_edgelist(graph_file, nodetype=int)
        graph_files_with_sizes.append((graph_file, g.number_of_nodes()))
    except Exception as exc:
        # Keep unreadable files in the list with size 0 (as before), but say so
        # instead of hiding the problem; `except Exception` also lets
        # KeyboardInterrupt through so the cell can still be interrupted.
        print(f'Could not read {graph_file} as an edge list ({exc}); treating its size as 0')
        graph_files_with_sizes.append((graph_file, 0))

graph_files_with_sizes.sort(key=lambda x: x[1])
graph_files = [f for f, _ in graph_files_with_sizes]
graph_files

['../../data/twitch/graphs_processed/PTBR_graph.edges',
 '../../data/twitch/graphs_processed/RU_graph.edges',
 '../../data/twitch/graphs_processed/ES_graph.edges',
 '../../data/twitch/graphs_processed/FR_graph.edges',
 '../../data/twitch/graphs_processed/ENGB_graph.edges',
 '../../data/twitch/graphs_processed/DE_graph.edges']

In [9]:
# Settings passed to GraphModelComparator for the multi-graph run.
# The 'er_p' and 'max_iterations' entries of lg_params are overwritten per
# graph by the loop in the next cell.
# NOTE(review): other_models lists ER, WS, BA, GRG while the entries of
# other_model_params are commented ER, WS, GRG, BA — confirm how the two lists
# are matched. The GRG and BA ranges are identical here.
sim_params = dict(
    d_list=[0, 1],
    lg_params=dict(
        max_iterations=8000,
        patience=10,
        gic_dist_type='KL',
        er_p=0.01,
        min_gic_threshold=0.1,
        edge_delta=50,
        verbose=True,
    ),
    other_model_n_runs=1,
    other_model_grid_points=1,
    other_models=["ER", "WS", "BA", "GRG"],
    other_model_params=[
        dict(lo=0.01, hi=0.2),  # ER (probability)
        dict(k=dict(lo=2, hi=3, step=1), p=dict(lo=0.01, hi=0.2)),  # WS
        dict(lo=1, hi=3),  # GRG (radius/alpha)
        dict(lo=1, hi=3),  # BA (m)
    ],
    verbose=True,
)

In [None]:
# Run the model comparison on every processed graph and pickle the results
# after each graph, so graphs that already have a result file can be skipped.
comparators = []
folder_name = 'runs/fitted_graphs_comparison_twitch'
os.makedirs(folder_name, exist_ok=True)

# Iterate over graph_files in reverse order.
for graph_file in graph_files[::-1]:
    # '.../DE_graph.edges' -> graph_name 'DE_graph', region 'DE'
    graph_name = os.path.basename(graph_file).split('.')[0]
    region = graph_name.replace('_graph', '')
    # One path for both the "already processed" check and the save below.
    # Previously the check looked for 'comparators_<region>_graph.pkl' while the
    # results were written to 'comparators_<graph_name>_sample.pkl', so the
    # check could never find a saved result.
    output_path = f'{folder_name}/comparators_{graph_name}_sample.pkl'

    try:
        print(f"Loading graph from: {graph_file}")
        print(region)
        if os.path.exists(output_path):
            print(f'Graph {region} already processed')
            continue

        original_graph = nx.read_edgelist(graph_file, nodetype=int)
        print(f'Number of nodes: {original_graph.number_of_nodes()}, Number of edges: {original_graph.number_of_edges()}')

        ########################
        # Derive the ER probability from the graph density
        n = original_graph.number_of_nodes()
        m = original_graph.number_of_edges()
        er_p = (2 * m) / (n * (n - 1))  # p = 2|E|/(|V|(|V|-1))
        # NOTE(review): the density is halved before use; the reason is not
        # documented here — confirm this is intended.
        er_p = er_p / 2
        print(f'ER probability: {er_p}')
        sim_params['lg_params']['er_p'] = er_p
        ########################

        # NOTE(review): this overrides the 8000 iterations configured in
        # sim_params with only 10 — looks like a quick-test setting; confirm.
        n_iteration = 10
        sim_params['lg_params']['max_iterations'] = n_iteration

        # Run the comparison
        comparator = GraphModelComparator(**sim_params)
        comparator.compare(original_graph, graph_file)
    except Exception as e:
        # Best effort: report the failure and move on to the next graph.
        print(f"Error graph from {graph_file}: {e}")
        continue

    comparators.append(comparator)
    # NOTE(review): this pickles the whole accumulated list, not just this
    # graph's comparator — kept as-is so existing readers of these files still
    # work; confirm it is intended.
    with open(output_path, 'wb') as f:
        pickle.dump(comparators, f)



Loading graph from: ../../data/twitch/graphs_processed/DE_graph.edges
DE
Number of nodes: 9498, Number of edges: 153138
ER probability: 0.0016977133650810101


--- Fitting Logit Graph (LG) model ---
Running LG generation for d=0...


🔄 Optimizing Graph:   0%|          | 0/10 [00:00<?, ?it/s] 

⏰ Max iterations (10) reached. Stopping.

🏁 Stopping Condition Met
   📍 Reason: Max iterations (10) reached.
   📈 Results Summary
   🏆 Best iteration found: 9
   📊 Best spectrum difference: 8170.9582
   🔗 Edges in best graph: 76737 (Real graph edges: 153138)
d=0: GIC=0.1855, sigma=-6.1496
Running LG generation for d=1...


🔄 Optimizing Graph:   0%|          | 0/10 [00:00<?, ?it/s] 

⏰ Max iterations (10) reached. Stopping.

🏁 Stopping Condition Met
   📍 Reason: Max iterations (10) reached.
   📈 Results Summary
   🏆 Best iteration found: 9
   📊 Best spectrum difference: 8164.1611
   🔗 Edges in best graph: 77067 (Real graph edges: 153138)
d=1: GIC=0.1843, sigma=-7.0523
Best LG fit found with GIC: 0.1843

--- Fitting other random graph models ---
Testing the selected model for ER
ER gic: 1.4908092867219833
Testing the selected model for WS
WS gic: 1.3490875100051447
Testing the selected model for BA
BA gic: 0.9720938975688622
Testing the selected model for GRG
GRG gic: 6.021175892450195
BA fitting - GIC: 0.9721, Param: 1.0000
WS fitting - GIC: 1.3491, Param: 20.0100
ER fitting - GIC: 1.4908, Param: 0.0100


In [None]:
# Marker cell: evaluates to the string 'finish' — presumably used to signal
# that the notebook ran through to the end.
'finish'