In [96]:

%load_ext autoreload
%autoreload 2

import sys
import os
sys.path.append('../')

# Graph imports
import src.graph as graph
import src.logit_estimator as estimator
import src.utils as utils
import src.model_selection as model_selection
import src.gic as gic
import src.param_estimator as pe
import src.graph as graph

# usual imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import gc
import random
import networkx as nx

from IPython.display import display
from pyvis.network import Network

import pickle
import os

np.random.seed(42)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
final_df = pd.read_csv('estimation_results_connectomes.csv')

In [98]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,AIC,BIC,sigma,Std Error of sigma,0.025 CI of sigma,0.975 CI of sigma,Depth (d),species
0,p.pacificus_neural.synaptic_2.graphml,694.6735,710.0056,-3.823346,0.203345,-4.221894,-3.424798,1,worm
1,rhesus_cerebral.cortex_1.graphml,3726.1077,3745.0602,-3.968125,0.121986,-4.207213,-3.729037,1,macaque
2,rattus.norvegicus_brain_1.graphml,33848.4629,33877.701,-6.583911,0.039424,-6.661181,-6.506641,1,rat
3,mixed.species_brain_1.graphml,2209.2066,2226.127,-4.622072,0.22356,-5.060242,-4.183902,1,cat
4,mouse_visual.cortex_2.graphml,1511.9046,1535.4477,-6.392157,0.162297,-6.710253,-6.074061,1,mouse


# Prepare the dataset

In [99]:
def get_logit_graph(real_graph, d):
    # Estimation
    est = estimator.LogitRegEstimator(real_graph, d=d)
    features, labels = est.get_features_labels()
    result, params, pvalue = est.estimate_parameters(l1_wt=1, alpha=0, features=features, labels=labels)
    
    sigma = params[0]
    sigma_stddev = result.bse[0]  # standard error of the parameter estimate
    return sigma, sigma_stddev

In [100]:
real_graph = nx.to_numpy_array(nx.read_graphml(datasets + connectomes[0]))
sigma, sigma_stddev = get_logit_graph(real_graph=real_graph, d=0)
samples = np.random.normal(loc=sigma, scale=sigma_stddev, size=100)

                         Results: Logit
Model:              Logit            Method:           MLE       
Dependent Variable: y                Pseudo R-squared: 0.155     
Date:               2024-07-22 11:47 AIC:              745.0726  
No. Observations:   1225             BIC:              760.4047  
Df Model:           2                Log-Likelihood:   -369.54   
Df Residuals:       1222             LL-Null:          -437.39   
Converged:          1.0000           LLR p-value:      3.4081e-30
No. Iterations:     11.0000          Scale:            1.0000    
-------------------------------------------------------------------
          Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
-------------------------------------------------------------------
const    -2.9218     0.1393   -20.9761   0.0000   -3.1948   -2.6488
x1        0.0210     0.0025     8.4585   0.0000    0.0161    0.0259
x2        0.0437     0.0059     7.3947   0.0000    0.0321    0.0553



In [101]:
datasets = f'../data/connectomes/'
connectomes = os.listdir(datasets)

# Species
species_mapping = {
    'p.pacificus_neural.synaptic_2.graphml': 'worm',
    'rhesus_cerebral.cortex_1.graphml': 'macaque',
    'rattus.norvegicus_brain_1.graphml': 'rat',
    'mixed.species_brain_1.graphml': 'cat',
    'mouse_visual.cortex_2.graphml': 'mouse',
    'rattus.norvegicus_brain_2.graphml': 'rat',
    'p.pacificus_neural.synaptic_1.graphml': 'worm',
    'mouse_visual.cortex_1.graphml': 'mouse',
    'rattus.norvegicus_brain_3.graphml': 'rat',
    'rhesus_interareal.cortical.network_2.graphml': 'macaque',
    'rhesus_brain_1.graphml': 'macaque',
    'mouse_retina_1.graphml': 'mouse',
    'kasthuri_graph_v4.graphml': 'mouse',
    'mouse_brain_1.graphml': 'mouse',
    'drosophila_medulla_1.graphml': 'fly',
    'c.elegans.herm_pharynx_1.graphml': 'celegans',
    'rhesus_brain_2.graphml': 'macaque',
    'c.elegans_neural.male_1.graphml': 'celegans'
}


In [102]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

n_runs = 5
data = []

for i in range(len(connectomes)):
    np.random.seed(i)
    real_graph = nx.read_graphml(datasets + connectomes[i])
    real_graph = nx.to_numpy_array(real_graph)

    sigma, sigma_stddev = get_logit_graph(real_graph=real_graph, d=0)
    samples = np.random.normal(loc=sigma, scale=sigma_stddev, size=n_runs)

    data.append({'species': species_mapping[connectomes[i]], 'sigmas': samples, 'data': connectomes[i]})

df = pd.DataFrame(data)

                         Results: Logit
Model:              Logit            Method:           MLE       
Dependent Variable: y                Pseudo R-squared: 0.155     
Date:               2024-07-22 11:47 AIC:              745.0726  
No. Observations:   1225             BIC:              760.4047  
Df Model:           2                Log-Likelihood:   -369.54   
Df Residuals:       1222             LL-Null:          -437.39   
Converged:          1.0000           LLR p-value:      3.4081e-30
No. Iterations:     11.0000          Scale:            1.0000    
-------------------------------------------------------------------
          Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
-------------------------------------------------------------------
const    -2.9218     0.1393   -20.9761   0.0000   -3.1948   -2.6488
x1        0.0210     0.0025     8.4585   0.0000    0.0161    0.0259
x2        0.0437     0.0059     7.3947   0.0000    0.0321    0.0553

                       

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


                         Results: Logit
Model:              Logit            Method:           MLE       
Dependent Variable: y                Pseudo R-squared: 0.206     
Date:               2024-07-22 11:47 AIC:              95295.4301
No. Observations:   126253           BIC:              95324.6682
Df Model:           2                Log-Likelihood:   -47645.   
Df Residuals:       126250           LL-Null:          -59974.   
Converged:          1.0000           LLR p-value:      0.0000    
No. Iterations:     18.0000          Scale:            1.0000    
-------------------------------------------------------------------
         Coef.    Std.Err.       z       P>|z|     [0.025    0.975]
-------------------------------------------------------------------
const   -3.6090     0.0209   -172.9344   0.0000   -3.6499   -3.5681
x1       0.0120     0.0001     87.4042   0.0000    0.0118    0.0123
x2       0.0086     0.0001     79.0788   0.0000    0.0084    0.0088

                       

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


                         Results: Logit
Model:              Logit            Method:           MLE       
Dependent Variable: y                Pseudo R-squared: 0.235     
Date:               2024-07-22 11:47 AIC:              95369.1270
No. Observations:   126262           BIC:              95398.3653
Df Model:           2                Log-Likelihood:   -47682.   
Df Residuals:       126259           LL-Null:          -62346.   
Converged:          1.0000           LLR p-value:      0.0000    
No. Iterations:     17.0000          Scale:            1.0000    
-------------------------------------------------------------------
         Coef.    Std.Err.       z       P>|z|     [0.025    0.975]
-------------------------------------------------------------------
const   -3.4610     0.0197   -175.8804   0.0000   -3.4996   -3.4225
x1       0.0099     0.0001     83.2883   0.0000    0.0097    0.0102
x2       0.0087     0.0001     81.4852   0.0000    0.0085    0.0089

                       

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


                         Results: Logit
Model:              Logit            Method:           MLE       
Dependent Variable: y                Pseudo R-squared: 0.247     
Date:               2024-07-22 11:47 AIC:              95393.9940
No. Observations:   122769           BIC:              95423.1481
Df Model:           2                Log-Likelihood:   -47694.   
Df Residuals:       122766           LL-Null:          -63371.   
Converged:          1.0000           LLR p-value:      0.0000    
No. Iterations:     16.0000          Scale:            1.0000    
-------------------------------------------------------------------
         Coef.    Std.Err.       z       P>|z|     [0.025    0.975]
-------------------------------------------------------------------
const   -3.5358     0.0194   -182.4247   0.0000   -3.5737   -3.4978
x1       0.0073     0.0001    100.4483   0.0000    0.0072    0.0075
x2       0.0093     0.0001     89.2948   0.0000    0.0091    0.0095

                       

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


                          Results: Logit
Model:              Logit            Method:           MLE        
Dependent Variable: y                Pseudo R-squared: 0.147      
Date:               2024-07-22 11:47 AIC:              443127.5623
No. Observations:   630003           BIC:              443161.6228
Df Model:           2                Log-Likelihood:   -2.2156e+05
Df Residuals:       630000           LL-Null:          -2.5982e+05
Converged:          1.0000           LLR p-value:      0.0000     
No. Iterations:     14.0000          Scale:            1.0000     
--------------------------------------------------------------------
          Coef.    Std.Err.       z       P>|z|     [0.025    0.975]
--------------------------------------------------------------------
const    -2.7324     0.0060   -455.0805   0.0000   -2.7441   -2.7206
x1        0.0006     0.0000    168.7557   0.0000    0.0006    0.0006
x2        0.0013     0.0000    169.6182   0.0000    0.0012    0.0013

        

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


                         Results: Logit
Model:              Logit            Method:           MLE       
Dependent Variable: y                Pseudo R-squared: 0.103     
Date:               2024-07-22 11:47 AIC:              24397.0716
No. Observations:   22731            BIC:              24421.1660
Df Model:           2                Log-Likelihood:   -12196.   
Df Residuals:       22728            LL-Null:          -13594.   
Converged:          1.0000           LLR p-value:      0.0000    
No. Iterations:     15.0000          Scale:            1.0000    
-------------------------------------------------------------------
          Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
-------------------------------------------------------------------
const    -1.8538     0.0570   -32.5221   0.0000   -1.9656   -1.7421
x1        0.0143     0.0004    35.9305   0.0000    0.0135    0.0151
x2        0.0142     0.0004    36.4636   0.0000    0.0134    0.0149



  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


                          Results: Logit
Model:              Logit            Method:           MLE        
Dependent Variable: y                Pseudo R-squared: 0.078      
Date:               2024-07-22 11:47 AIC:              102513.9458
No. Observations:   1585195          BIC:              102550.7745
Df Model:           2                Log-Likelihood:   -51254.    
Df Residuals:       1585192          LL-Null:          -55598.    
Converged:          1.0000           LLR p-value:      0.0000     
No. Iterations:     25.0000          Scale:            1.0000     
--------------------------------------------------------------------
          Coef.    Std.Err.       z       P>|z|     [0.025    0.975]
--------------------------------------------------------------------
const    -5.6000     0.0127   -440.9338   0.0000   -5.6249   -5.5751
x1        0.0049     0.0001     97.2226   0.0000    0.0048    0.0050
x2        0.0093     0.0001     64.7099   0.0000    0.0090    0.0096

        

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q * linpred)))


                         Results: Logit
Model:              Logit            Method:           MLE       
Dependent Variable: y                Pseudo R-squared: 0.148     
Date:               2024-07-22 11:47 AIC:              17597.3064
No. Observations:   36919            BIC:              17622.8559
Df Model:           2                Log-Likelihood:   -8795.7   
Df Residuals:       36916            LL-Null:          -10320.   
Converged:          1.0000           LLR p-value:      0.0000    
No. Iterations:     15.0000          Scale:            1.0000    
-------------------------------------------------------------------
          Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
-------------------------------------------------------------------
const    -4.5287     0.0552   -82.0133   0.0000   -4.6369   -4.4205
x1        0.0559     0.0012    46.4901   0.0000    0.0536    0.0583
x2        0.0465     0.0016    29.4004   0.0000    0.0434    0.0496



In [103]:
df_exploded = df.explode('sigmas').reset_index(drop=True)
df_exploded['sigmas'] = df_exploded['sigmas'].astype(float)
df_exploded

Unnamed: 0,species,sigmas,data
0,worm,-2.676116,p.pacificus_neural.synaptic_2.graphml
1,worm,-2.866098,p.pacificus_neural.synaptic_2.graphml
2,worm,-2.785506,p.pacificus_neural.synaptic_2.graphml
3,worm,-2.609696,p.pacificus_neural.synaptic_2.graphml
4,worm,-2.661699,p.pacificus_neural.synaptic_2.graphml
...,...,...,...
85,celegans,-4.513446,c.elegans_neural.male_1.graphml
86,celegans,-4.631112,c.elegans_neural.male_1.graphml
87,celegans,-4.494250,c.elegans_neural.male_1.graphml
88,celegans,-4.465458,c.elegans_neural.male_1.graphml


In [104]:
df_exploded.groupby('species')['sigmas'].mean()

species
cat        -3.273113
celegans   -4.028650
fly        -5.600668
macaque    -3.443531
mouse      -3.945672
rat        -3.541718
worm       -2.961179
Name: sigmas, dtype: float64

# Make the ANOVA test

In [105]:
model = ols('sigmas ~ C(species)', data=df_exploded).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

In [106]:
"""
sum_sq (Sum of Squares): This column shows the variability in the data. The "C(species)" row represents the 
variability between species, and the "Residual" row represents the variability 
within species (i.e., unexplained variability).

df (Degrees of Freedom): This column represents the number of independent values that can vary. The degrees 
of freedom for "C(species)" is the number of species minus one, and for "Residual," it is the total number 
of observations minus the number of groups.

F (F-statistic): This is a ratio of the variance between the groups (species) to the variance 
within the groups (residual). A higher F value generally indicates a significant difference between 
the groups.

PR(>F) (p-value): This column represents the probability that the observed F-statistic (or one more extreme) would occur 
if the null hypothesis were true. The null hypothesis typically states that there are no differences between the groups (species).
"""
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(species),28.672381,6.0,2.925495,0.012243
Residual,135.578618,83.0,,
