In [46]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


The objective here is to fit the logit graph (LG) model to graphs from multiple different domains and then compare the original graphs with the fitted ones.

In [47]:

import sys
import os

#Graph imports
sys.path.append('../..')
import src.logit_graph.graph as graph
import src.logit_graph.logit_estimator as estimator
import src.logit_graph.utils as utils
import src.logit_graph.model_selection as model_selection
import src.logit_graph.gic as gic
import src.logit_graph.param_estimator as pe
import src.logit_graph.graph as graph
import src.logit_graph.model_selection as ms

from src.logit_graph.simulation import LogitGraphFitter, GraphModelComparator

# usual imports
import matplotlib.pyplot as plt
import pickle
import math
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import gaussian_kde
import numpy as np
import pandas as pd
import seaborn as sns
import gc
import random
import networkx as nx
from numpy import errstate

from IPython.display import display
from pyvis.network import Network
from mpl_toolkits.axes_grid1 import make_axes_locatable


In [48]:
# Root of the Twitch dataset, relative to this notebook; listing it shows the
# per-language folders and the graphs_processed folder used below.
data_directory = '../../data/twitch/'
os.listdir(data_directory)

['RU',
 'ENGB',
 'citing.txt',
 'PTBR',
 'DE',
 'README.txt',
 'FR',
 'ES',
 'graphs_processed']

### Comparison with other models

Comparing multiple models, excluding the simulation of LG because it was already simulated.

In [49]:
# List every file in the processed-graphs folder and order the paths by node
# count; the loops in later cells iterate over graph_files in this order.
graphs_dir = os.path.join(data_directory, 'graphs_processed')
# sorted(): os.listdir returns entries in arbitrary order, so fix it to make
# the ordering of equally sized graphs deterministic.
graph_files = [os.path.join(graphs_dir, f) for f in sorted(os.listdir(graphs_dir))]

graph_files_with_sizes = []
for graph_file in graph_files:
    try:
        g = nx.read_edgelist(graph_file, nodetype=int)
    except Exception as exc:
        # Best effort: keep the entry with size 0 (it sorts first), but say why
        # it failed. Catching Exception instead of a bare except lets
        # KeyboardInterrupt / SystemExit propagate.
        print(f"Warning: could not read {graph_file} as an edge list ({exc!r}); using size 0")
        graph_files_with_sizes.append((graph_file, 0))
    else:
        graph_files_with_sizes.append((graph_file, g.number_of_nodes()))

graph_files_with_sizes.sort(key=lambda x: x[1])
graph_files = [f for f, _ in graph_files_with_sizes]
graph_files

['../../data/twitch/graphs_processed/PTBR_graph.edges',
 '../../data/twitch/graphs_processed/RU_graph.edges',
 '../../data/twitch/graphs_processed/ES_graph.edges',
 '../../data/twitch/graphs_processed/FR_graph.edges',
 '../../data/twitch/graphs_processed/ENGB_graph.edges',
 '../../data/twitch/graphs_processed/DE_graph.edges']

In [50]:
import os
import math
import networkx as nx

# --- Setup ---
# comparators maps each graph file's basename to a dict of fitted baseline
# graphs ({'BA': ..., 'ER': ..., 'WS': ...}); it is filled by the main loop at
# the end of this cell.
comparators = {}
folder_name = 'runs/fitted_graphs_comparison_twitch'
# exist_ok=True keeps the cell re-runnable when the folder already exists.
os.makedirs(folder_name, exist_ok=True)

# --- Barabasi-Albert (BA) Model Functions (Provided) ---
def _expected_ba_edges(n, m):
    # networkx.barabasi_albert_graph starts with a complete graph of m nodes,
    # then adds (n - m) nodes, each with m edges
    return m * (n - m) + (m * (m - 1)) // 2

def estimate_ba_m(n, e):
    # Solve: m^2 - (2n - 1)m + 2e = 0
    D = (2 * n - 1) ** 2 - 8 * e
    candidates = []
    if D >= 0:
        m_real = ((2 * n - 1) - math.sqrt(D)) / 2.0
        candidates.extend([math.floor(m_real), math.ceil(m_real)])
    # heuristic fallback
    candidates.append(round(e / n if n > 0 else 1))
    # expand neighborhood and clamp
    uniq = set(int(m) for m in candidates)
    expanded = set()
    for m in uniq:
        for dm in (-1, 0, 1):
            expanded.add(m + dm)
    valid = [m for m in expanded if 1 <= m < n]
    if not valid:
        return 1
    # pick m minimizing edge discrepancy
    best_m = min(valid, key=lambda m: abs(_expected_ba_edges(n, m) - e))
    return best_m

# --- Erdős-Rényi (ER) Model Estimation ---

def estimate_er_p(n, e):
    """Return the Erdős-Rényi edge probability matching ``e`` edges on ``n`` nodes.

    The expected edge count of G(n, p) is ``p * C(n, 2)``, so the estimate is
    ``e / C(n, 2)``, limited to the interval [0, 1]. With fewer than two nodes
    there are no node pairs and 0.0 is returned.
    """
    if n < 2:
        return 0.0
    # n * (n - 1) is twice the number of unordered node pairs C(n, 2).
    twice_pair_count = n * (n - 1)
    density = (2 * e) / twice_pair_count
    # Cap at 1 first, then floor at 0, so the result is a valid probability.
    capped = min(1.0, density)
    return max(0.0, capped)

# --- Watts-Strogatz (WS) Model Estimation ---

def estimate_ws_params(n, e):
    """Estimate Watts-Strogatz parameters ``(k, p)`` for a graph with n nodes, e edges.

    A WS graph built with an even ``k`` has exactly ``n * k / 2`` edges, so
    ``k`` is the even integer in ``[2, n)`` that brings that count closest to
    ``e`` (ties go to the smaller ``k``). The rewiring probability cannot be
    recovered from ``n`` and ``e`` alone, so a fixed default of 0.1 is returned.

    Args:
        n: number of nodes of the target graph.
        e: number of edges of the target graph.

    Returns:
        ``(k, p)``. ``k`` is 0 when ``n <= 2`` (no valid ring lattice); callers
        are expected to handle that case themselves.
    """
    # Default rewiring probability, as in the original estimate.
    p = 0.1

    if n <= 2:
        return 0, p

    # Target average degree, and the even integers just below / above it.
    k_target = 2 * e / n
    k_floor = int(k_target // 2 * 2)
    k_ceil = k_floor + 2

    # Largest even k strictly below n (>= 2 because n > 2 here).
    k_max = (n - 1) // 2 * 2

    # Clamp both neighbours into [2, k_max] so a valid candidate always exists.
    # When the target degree is >= n this yields the densest valid lattice
    # instead of falling back to k = 2.
    candidates = [min(max(k, 2), k_max) for k in (k_floor, k_ceil)]

    best_k = min(candidates, key=lambda k: abs(n * k / 2 - e))
    return best_k, p

# --- Main Loop for Graph Generation and Comparison ---

for graph_file in graph_files:
    graph_key = os.path.basename(graph_file)
    original_graph = nx.read_edgelist(graph_file, nodetype=int)
    n, e = original_graph.number_of_nodes(), original_graph.number_of_edges()

    # Fit each baseline's parameters to the observed size (n nodes, e edges).
    m_opt = estimate_ba_m(n, e)
    p_opt = estimate_er_p(n, e)
    k_opt, p_ws = estimate_ws_params(n, e)

    # Generate one graph per model family with a fixed seed.
    ba_graph = nx.barabasi_albert_graph(n=n, m=m_opt, seed=42)
    er_graph = nx.erdos_renyi_graph(n=n, p=p_opt, seed=42)
    if k_opt <= 0:
        # No valid k was found: fall back to n isolated nodes.
        ws_graph = nx.Graph()
        ws_graph.add_nodes_from(range(n))
    else:
        ws_graph = nx.watts_strogatz_graph(n=n, k=k_opt, p=p_ws, seed=42)

    # Keep the generated graphs under the file's basename for later cells.
    model_graphs = {'BA': ba_graph, 'ER': er_graph, 'WS': ws_graph}
    comparators[graph_key] = model_graphs

    # Report the fitted parameters and the edge count each model produced.
    summary_lines = (
        f"{graph_key}: n={n}, e={e}",
        f"  BA -> m*={m_opt}, edges={ba_graph.number_of_edges()}",
        f"  ER -> p*={p_opt:.4f}, edges={er_graph.number_of_edges()}",
        f"  WS -> k*={k_opt}, p={p_ws}, edges={ws_graph.number_of_edges()}",
    )
    print("\n".join(summary_lines))

PTBR_graph.edges: n=1912, e=31299
  BA -> m*=16, edges=30336
  ER -> p*=0.0171, edges=31275
  WS -> k*=32, p=0.1, edges=30592
RU_graph.edges: n=4385, e=37304
  BA -> m*=9, edges=39384
  ER -> p*=0.0039, edges=36933
  WS -> k*=18, p=0.1, edges=39465
ES_graph.edges: n=4648, e=59382
  BA -> m*=13, edges=60255
  ER -> p*=0.0055, edges=59096
  WS -> k*=26, p=0.1, edges=60424
FR_graph.edges: n=6549, e=112666
  BA -> m*=17, edges=111044
  ER -> p*=0.0053, edges=112358
  WS -> k*=34, p=0.1, edges=111333
ENGB_graph.edges: n=7126, e=35324
  BA -> m*=5, edges=35605
  ER -> p*=0.0014, edges=35025
  WS -> k*=10, p=0.1, edges=35630
DE_graph.edges: n=9498, e=153138
  BA -> m*=16, edges=151712
  ER -> p*=0.0034, edges=153256
  WS -> k*=32, p=0.1, edges=151968


In [51]:
# Compute GIC (KL) between real graphs and all model simulations
from math import isfinite

# gic_results[basename][model_name] holds the value returned by calculate_gic
# for the real graph versus that model's generated graph.
gic_results = {}
for graph_file in graph_files:
    basename = os.path.basename(graph_file)
    fitted_models = comparators.get(basename)
    if fitted_models is None:
        # No baseline graphs were generated for this file.
        continue

    original_graph = nx.read_edgelist(graph_file, nodetype=int)
    gic_results[basename] = {}

    for model_name in ('BA', 'ER', 'WS'):
        if model_name in fitted_models:
            model_graph = fitted_models[model_name]

            # One criterion object per (real graph, model) pair; the spectral
            # density of the generated graph is passed in explicitly.
            gic_calc = gic.GraphInformationCriterion(original_graph, model=model_name, dist='KL')
            model_den, _ = gic_calc.compute_spectral_density(model_graph)
            gic_value = gic_calc.calculate_gic(model_den=model_den)

            gic_results[basename][model_name] = gic_value
            print(f"{basename}: GIC_KL(real vs {model_name}) = {gic_value:.6f}")

PTBR_graph.edges: GIC_KL(real vs BA) = 0.642153
PTBR_graph.edges: GIC_KL(real vs ER) = 0.641195
PTBR_graph.edges: GIC_KL(real vs WS) = 0.454637
RU_graph.edges: GIC_KL(real vs BA) = 0.350353
RU_graph.edges: GIC_KL(real vs ER) = 0.332763
RU_graph.edges: GIC_KL(real vs WS) = 1.214782
ES_graph.edges: GIC_KL(real vs BA) = 1.094191
ES_graph.edges: GIC_KL(real vs ER) = 1.090295
ES_graph.edges: GIC_KL(real vs WS) = 0.696158
FR_graph.edges: GIC_KL(real vs BA) = 0.343022
FR_graph.edges: GIC_KL(real vs ER) = 0.343815
FR_graph.edges: GIC_KL(real vs WS) = 0.387554
ENGB_graph.edges: GIC_KL(real vs BA) = 1.158150
ENGB_graph.edges: GIC_KL(real vs ER) = 0.284632
ENGB_graph.edges: GIC_KL(real vs WS) = 0.704208
DE_graph.edges: GIC_KL(real vs BA) = 0.590522
DE_graph.edges: GIC_KL(real vs ER) = 0.591768
DE_graph.edges: GIC_KL(real vs WS) = 0.532612


In [137]:
# Save the gic results and the comparators
import pickle
folder = 'runs/fitted_graphs_comparison_twitch_simple'
os.makedirs(folder, exist_ok=True)
# Each object is written to <folder>/<name>.pkl.
for pickle_name, payload in (('gic_results', gic_results), ('comparators', comparators)):
    with open(f'{folder}/{pickle_name}.pkl', 'wb') as f:
        pickle.dump(payload, f)


In [53]:
# TODO: get the already-fitted LG graphs from this folder.
folder_name = 'runs/fitted_graphs_comparison_twitch'
os.makedirs(folder_name, exist_ok=True)

In [54]:
# Settings for the multi-model simulation. NOTE(review): the meaning of each
# key is defined by the src.logit_graph.simulation classes - confirm there.
sim_params = dict(
    d_list=[0, 1],
    lg_params=dict(
        max_iterations=8000,
        patience=10,
        gic_dist_type='KL',
        er_p=0.01,
        min_gic_threshold=0.1,
        edge_delta=50,
        verbose=True,
    ),
    other_model_n_runs=5,
    other_model_params=[
        dict(lo=0.01, hi=0.2),  # ER (probability) — must be first
        dict(k=dict(lo=2, hi=3, step=1), p=dict(lo=0.01, hi=0.2)),  # WS
        dict(lo=1, hi=3),  # GRG (radius/alpha)
        dict(lo=1, hi=3),  # BA (m)
    ],
    verbose=True,
)

# Keyword arguments for LogitGraphFitter; 'init_graph' and 'n_iteration' are
# set per graph by the fitting loop in a later cell.
fitter_params = dict(
    d=1,
    n_iteration=100,
    patience=10,
    er_p=0.01,
    edge_delta=50,
    min_gic_threshold=0.1,
    init_graph=None,
    verbose=True,
)


In [64]:
import gc
# Fit the logit-graph (LG) model to every graph, warm-started from a thinned
# Barabasi-Albert graph. comparators_lg maps each file's basename to
# {'LG': fitted LogitGraphFitter}.
# (An ER-based start that overrode fitter_params['er_p'] was tried and is disabled.)
comparators_lg = {}
for i, graph_file in enumerate(graph_files):
    original_graph = nx.read_edgelist(graph_file, nodetype=int)
    n = original_graph.number_of_nodes()
    e = original_graph.number_of_edges()

    # --- Initial graph: BA graph with a matching edge count ---
    m_opt = estimate_ba_m(n, e)
    ba_graph = nx.barabasi_albert_graph(n=n, m=m_opt, seed=42)

    # Remove a random 10% of the BA edges so the initial graph has fewer edges
    # than the BA fit. A dedicated, seeded RNG makes the removed set (and
    # therefore the starting point) reproducible and independent of the global
    # `random` state.
    n_edges_to_remove = int(ba_graph.number_of_edges() * 0.1)
    rng = random.Random(42)
    edges_to_remove = rng.sample(list(ba_graph.edges()), n_edges_to_remove)
    ba_graph.remove_edges_from(edges_to_remove)

    # Iteration budget shrinks linearly with the graph's position in
    # graph_files (sorted by node count): 25 for the first graph, down to
    # int(25 - 20 * (len - 1) / len) for the last one (8 with six graphs).
    n_iteration = int(20 - 20 * (i / len(graph_files)) + 5)

    # Per-graph copy of the settings: the shared fitter_params dict is left
    # untouched, so it never carries another graph's init_graph around.
    graph_fitter_params = {
        **fitter_params,
        'init_graph': ba_graph,
        'n_iteration': n_iteration,
    }

    lg_fitter = LogitGraphFitter(**graph_fitter_params)
    lg_fitter.fit(original_graph)

    model_graphs = {
        'LG': lg_fitter,
    }
    comparators_lg[os.path.basename(graph_file)] = model_graphs


Original graph - Nodes: 1912, Edges: 31299
Running LG generation for d=1...


🔄 Optimizing Graph:   0%|          | 0/25 [00:00<?, ?it/s] 

⏰ Max iterations (25) reached. Stopping.

🏁 Stopping Condition Met
   📍 Reason: Max iterations (25) reached.
   📈 Results Summary
   🏆 Best iteration found: 24
   📊 Best spectrum difference: 1375.4335
   🔗 Edges in best graph: 27327 (Real graph edges: 31299)
Fitting successful - GIC: 0.6621, Best iteration: 24
Fitted graph - Nodes: 1912, Edges: 27327

Original graph - Nodes: 4385, Edges: 37304
Running LG generation for d=1...


🔄 Optimizing Graph:   0%|          | 0/21 [00:00<?, ?it/s] 

⏰ Max iterations (21) reached. Stopping.

🏁 Stopping Condition Met
   📍 Reason: Max iterations (21) reached.
   📈 Results Summary
   🏆 Best iteration found: 2
   📊 Best spectrum difference: 1703.7920
   🔗 Edges in best graph: 35449 (Real graph edges: 37304)
Fitting successful - GIC: 0.3417, Best iteration: 2
Fitted graph - Nodes: 4385, Edges: 35449

Original graph - Nodes: 4648, Edges: 59382
Running LG generation for d=1...


🔄 Optimizing Graph:   0%|          | 0/18 [00:00<?, ?it/s] 

⏰ Max iterations (18) reached. Stopping.

🏁 Stopping Condition Met
   📍 Reason: Max iterations (18) reached.
   📈 Results Summary
   🏆 Best iteration found: 8
   📊 Best spectrum difference: 1742.2361
   🔗 Edges in best graph: 54239 (Real graph edges: 59382)
Fitting successful - GIC: 1.0981, Best iteration: 8
Fitted graph - Nodes: 4648, Edges: 54239

Original graph - Nodes: 6549, Edges: 112666
Running LG generation for d=1...


🔄 Optimizing Graph:   0%|          | 0/15 [00:00<?, ?it/s] 

⏰ Max iterations (15) reached. Stopping.

🏁 Stopping Condition Met
   📍 Reason: Max iterations (15) reached.
   📈 Results Summary
   🏆 Best iteration found: 11
   📊 Best spectrum difference: 3850.7218
   🔗 Edges in best graph: 99952 (Real graph edges: 112666)
Fitting successful - GIC: 0.3561, Best iteration: 11
Fitted graph - Nodes: 6549, Edges: 99952

Original graph - Nodes: 7126, Edges: 35324
Running LG generation for d=1...


🔄 Optimizing Graph:   0%|          | 0/11 [00:00<?, ?it/s] 

⏰ Max iterations (11) reached. Stopping.

🏁 Stopping Condition Met
   📍 Reason: Max iterations (11) reached.
   📈 Results Summary
   🏆 Best iteration found: 7
   📊 Best spectrum difference: 839.2913
   🔗 Edges in best graph: 32053 (Real graph edges: 35324)
Fitting successful - GIC: 0.1501, Best iteration: 7
Fitted graph - Nodes: 7126, Edges: 32053

Original graph - Nodes: 9498, Edges: 153138
Running LG generation for d=1...


🔄 Optimizing Graph:   0%|          | 0/8 [00:00<?, ?it/s] 

⏰ Max iterations (8) reached. Stopping.

🏁 Stopping Condition Met
   📍 Reason: Max iterations (8) reached.
   📈 Results Summary
   🏆 Best iteration found: 2
   📊 Best spectrum difference: 5594.4609
   🔗 Edges in best graph: 136544 (Real graph edges: 153138)
Fitting successful - GIC: 0.6005, Best iteration: 2
Fitted graph - Nodes: 9498, Edges: 136544


In [65]:
comparators_lg

{'PTBR_graph.edges': {'LG': <src.logit_graph.simulation.LogitGraphFitter at 0x2b398d9d0>},
 'RU_graph.edges': {'LG': <src.logit_graph.simulation.LogitGraphFitter at 0x17cb40c10>},
 'ES_graph.edges': {'LG': <src.logit_graph.simulation.LogitGraphFitter at 0x283da0970>},
 'FR_graph.edges': {'LG': <src.logit_graph.simulation.LogitGraphFitter at 0x2a5360bb0>},
 'ENGB_graph.edges': {'LG': <src.logit_graph.simulation.LogitGraphFitter at 0x282ae4670>},
 'DE_graph.edges': {'LG': <src.logit_graph.simulation.LogitGraphFitter at 0x39adb3b20>}}

In [138]:
folder = 'runs/fitted_graphs_comparison_twitch_simple'
os.makedirs(folder, exist_ok=True)
with open(f'{folder}/comparators_lg.pkl', 'wb') as f:
    pickle.dump(comparators_lg, f)

In [67]:
'finish'

'finish'

### Load the data

In [139]:
import pickle
folder = 'runs/fitted_graphs_comparison_twitch_simple'
os.makedirs(folder, exist_ok=True)


def _read_pickle(stem):
    """Return the object stored in ``<folder>/<stem>.pkl``.

    pickle.load can execute arbitrary code, so only use this on files written
    by this notebook.
    """
    with open(f'{folder}/{stem}.pkl', 'rb') as f:
        return pickle.load(f)


comparators_lg = _read_pickle('comparators_lg')
gic_results = _read_pickle('gic_results')
comparators = _read_pickle('comparators')
    

In [140]:
# Create lists to store the data
def _graph_summary(g, prefix):
    """Return edge count, density, average clustering and average degree of ``g``.

    Keys are ``{prefix}_edges``, ``{prefix}_density``, ``{prefix}_avg_clustering``
    and ``{prefix}_avg_degree``, in that order.
    """
    return {
        f'{prefix}_edges': g.number_of_edges(),
        f'{prefix}_density': nx.density(g),
        f'{prefix}_avg_clustering': nx.average_clustering(g),
        f'{prefix}_avg_degree': sum(dict(g.degree()).values()) / g.number_of_nodes(),
    }


# One row per graph: metrics of the original graph ('true_*'), of the fitted
# LG graph ('lg_*') and of every baseline stored in comparators ('<model>_*').
data = []

for graph_name in comparators_lg.keys():
    row = {'graph': graph_name}

    # Original graph, re-read from disk by file name.
    graph_file = '../../data/twitch/graphs_processed/' + graph_name
    original_graph = nx.read_edgelist(graph_file, nodetype=int)
    row['true_nodes'] = original_graph.number_of_nodes()
    row.update(_graph_summary(original_graph, 'true'))

    # Fitted logit-graph model.
    lg_model = comparators_lg[graph_name]['LG']
    lg_graph = lg_model.fitted_graph
    row.update(_graph_summary(lg_graph, 'lg'))

    # Baseline models, if any were generated for this graph. The loop variable
    # is deliberately not called `graph`: that name is the imported
    # src.logit_graph.graph module and must not be overwritten.
    for model_name, model_graph in comparators.get(graph_name, {}).items():
        row.update(_graph_summary(model_graph, model_name))

    data.append(row)

results_df = pd.DataFrame(data)

In [143]:
# Write the comparison table to <folder>/results_df.csv; index=False leaves the row index out of the file.
results_df.to_csv(f'{folder}/results_df.csv', index=False)


In [144]:
results_df

Unnamed: 0,graph,true_nodes,true_edges,true_density,true_avg_clustering,true_avg_degree,lg_edges,lg_density,lg_avg_clustering,lg_avg_degree,...,BA_avg_clustering,BA_avg_degree,ER_edges,ER_density,ER_avg_clustering,ER_avg_degree,WS_edges,WS_density,WS_avg_clustering,WS_avg_degree
0,PTBR_graph.edges,1912,31299,0.017132,0.319895,32.73954,27327,0.014958,0.046809,28.584728,...,0.052372,31.732218,31275,0.017119,0.017308,32.714435,30592,0.016745,0.531481,32.0
1,RU_graph.edges,4385,37304,0.003881,0.165797,17.014367,35449,0.003688,0.017086,16.168301,...,0.018523,17.963056,36933,0.003842,0.00381,16.845154,39465,0.004106,0.514314,18.0
2,ES_graph.edges,4648,59382,0.005499,0.222496,25.551635,54239,0.005022,0.021625,23.33864,...,0.024169,25.927281,59096,0.005472,0.005428,25.428571,60424,0.005595,0.523867,26.0
3,FR_graph.edges,6549,112666,0.005255,0.221706,34.407085,99952,0.004662,0.020064,30.524355,...,0.022226,33.911742,112358,0.00524,0.005197,34.313025,111333,0.005192,0.530813,34.0
4,ENGB_graph.edges,7126,35324,0.001391,0.130928,9.914117,32053,0.001263,0.009793,8.996071,...,0.010746,9.992983,35025,0.00138,0.001439,9.830199,35630,0.001404,0.489491,10.0
5,DE_graph.edges,9498,153138,0.003395,0.200886,32.246368,136544,0.003027,0.014137,28.752158,...,0.015686,31.946094,153256,0.003398,0.003408,32.271215,151968,0.003369,0.529149,32.0


In [142]:
'finish'

'finish'