In [54]:
%load_ext autoreload
%autoreload 2

import sys, importlib, pickle, gc
sys.path.append('../..')
sys.modules['src.simulation'] = importlib.import_module('src.logit_graph.simulation')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [55]:

import sys
import os

#Graph imports
sys.path.append('../..')
import src.logit_graph.graph as graph
import src.logit_graph.logit_estimator as estimator
import src.logit_graph.utils as utils
import src.logit_graph.model_selection as model_selection
import src.logit_graph.gic as gic
import src.logit_graph.param_estimator as pe
import src.logit_graph.graph as graph
import src.logit_graph.model_selection as ms

from src.logit_graph.simulation import LogitGraphFitter, GraphModelComparator

# usual imports
import matplotlib.pyplot as plt
import pickle
import math
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import gaussian_kde
import numpy as np
import pandas as pd
import seaborn as sns
import gc
import random
import networkx as nx
from numpy import errstate

from IPython.display import display
from pyvis.network import Network
from mpl_toolkits.axes_grid1 import make_axes_locatable


In [56]:
data_directory = f'../../data/twitch/'
os.listdir(data_directory)

['RU',
 'ENGB',
 'citing.txt',
 'PTBR',
 'DE',
 'README.txt',
 'FR',
 'ES',
 'graphs_processed']

In [57]:
from src.logit_graph.simulation import estimate_sigma_many

# Configuration
an_d = 0  # set the desired d

n_repeats = 50

max_edges = None
max_non_edges = None

l1_wt = 1 # The L1 weight for the regularization
alpha = 0 # The regularization parameter

seed = 42

# Collect Twitch processed graph files
graphs_processed_dir = os.path.join(data_directory, 'graphs_processed')
region_files = [f for f in os.listdir(graphs_processed_dir) if f.endswith('_graph.edges')]
region_files = sorted(region_files)


In [None]:
records = []
#files_to_analyse = region_files[-2:]
files_to_analyse = region_files[::-1]
print(files_to_analyse)

for file_name in files_to_analyse:
    region = file_name.replace('_graph.edges', '')
    file_path = os.path.join(graphs_processed_dir, file_name)
    print(f"Estimating sigmas for region={region} from {file_path}")

    G = nx.read_edgelist(file_path, nodetype=int)
    sigmas = estimate_sigma_many(
        G,
        d=an_d,
        n_repeats=n_repeats,
        max_edges=max_edges,
        max_non_edges=max_non_edges,
        l1_wt=l1_wt,
        alpha=alpha,
        seed=seed,
        verbose=False,
    )

    for idx, s in enumerate(sigmas):
        records.append({
            'region': region,
            'd': an_d,
            'repeat_index': idx,
            'sigma': float(s),
            'num_nodes': G.number_of_nodes(),
            'num_edges': G.number_of_edges(),
        })

['RU_graph.edges', 'PTBR_graph.edges', 'FR_graph.edges', 'ES_graph.edges', 'ENGB_graph.edges', 'DE_graph.edges']
Estimating sigmas for region=RU from ../../data/twitch/graphs_processed/RU_graph.edges


100%|██████████| 50/50 [00:56<00:00,  1.13s/it]


Estimating sigmas for region=PTBR from ../../data/twitch/graphs_processed/PTBR_graph.edges


 94%|█████████▍| 47/50 [00:18<00:01,  2.59it/s]

In [None]:
sigma_df = pd.DataFrame(records)
run_dir = 'runs'
os.makedirs(run_dir, exist_ok=True)
out_path = os.path.join(run_dir, f'twitch_sigma_estimates_d{an_d}_N{n_repeats}.csv')
sigma_df.to_csv(out_path, index=False)
print(f"Saved sigma estimates to {out_path}")
sigma_df.head()


Saved sigma estimates to runs/twitch_sigma_estimates_d0_N30.csv


Unnamed: 0,region,d,repeat_index,sigma,num_nodes,num_edges
0,PTBR,0,0,-2.217076,1912,31299
1,PTBR,0,1,-2.246119,1912,31299
2,PTBR,0,2,-2.251397,1912,31299
3,PTBR,0,3,-2.193722,1912,31299
4,PTBR,0,4,-2.228464,1912,31299


## running the anova

In [None]:
import os
import pandas as pd
from scipy import stats

# Match the save path used in the dataset notebook
an_d = 0
n_repeats = 30
run_dir = os.path.join('runs')
filename = f'twitch_sigma_estimates_d{an_d}_N{n_repeats}.csv'
path = os.path.join(run_dir, filename)
print(f"Loading: {path}")


Loading: runs/twitch_sigma_estimates_d0_N30.csv


In [None]:
df = pd.read_csv(path)
subset = df[df['d'] == an_d].copy()

In [None]:
# Prepare groups by region
groups = subset.groupby('region')['sigma'].apply(list)
regions = list(groups.index)
values = [groups[r] for r in regions]


In [None]:
# One-way ANOVA
F, p = stats.f_oneway(*values)
print(f"ANOVA results (d={an_d}, N={n_repeats}): F={F:.4f}, p={p:.6g}")

ANOVA results (d=0, N=30): F=9408.8639, p=7.06444e-66


In [None]:
# Optional: quick summary per region
summary = subset.groupby('region')['sigma'].agg(['mean', 'std', 'count']).sort_index()
display(summary)


Unnamed: 0_level_0,mean,std,count
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PTBR,-2.239222,0.016866,30
RU,-1.768417,0.02055,30


### Quick read
- PTBR mean sigma: -2.2392 (std 0.0169, n=30)
- RU mean sigma: -1.7684 (std 0.0206, n=30)
- Difference (RU − PTBR): +0.4708

### Inference
- 95% CI for PTBR mean: [-2.2455, -2.2329]
- 95% CI for RU mean: [-1.7761, -1.7608]
- 95% CI for difference: [≈0.461, ≈0.481]
- Cohen’s d (using pooled SD across repeats): ≈ 25.0
- Practical reading: RU’s intercept is much less negative than PTBR’s, implying higher baseline log-odds of an edge when degree-based features are held fixed in the model.

### Caveats
- Sigma is the intercept of a logistic regression with sum-of-degrees features; comparing intercepts across graphs is sensitive to:
  - Differences in the feature distributions (sum-of-degrees varies across graphs).
  - Potential differences in the other coefficients (α, β), which you haven’t compared.
- The repeats are resamples from the same graph (not independent networks), so tiny within-group std makes “significance” and effect size explode. Treat these p-values/effect sizes as reflecting estimator stability, not population variability.

### Recommendations
- Report the means (as you did) with their CIs; the difference is clearly large and precise.
- Also extract and compare α, β per region; run ANOVA on all three parameters or a MANOVA to avoid over-interpreting sigma alone.
- Standardize features before estimation (e.g., z-score the two degree-sum features within each graph). This makes intercepts more comparable across graphs.
- Or compute a model-based comparable quantity, e.g., predicted edge probability at a common reference feature vector (median or standardized 0,0), then compare those across regions (ANOVA or pairwise tests).
- If you want a conventional test for two groups only, a two-sample t-test on the 30 repeats per region will be astronomically significant; focus on effect size and the standardized comparison above.

In [None]:
'finish all analysis'

'finish analysis'