In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import seaborn as sns
import sys
import copy
from tqdm.notebook import tqdm
from numba import jit
from scipy import stats
import networkx as nx
import re


import warnings
warnings.filterwarnings('ignore')

In [2]:
plt.rcParams["text.usetex"] = True
plt.rcParams['text.latex.preamble'] = r'\usepackage{amssymb,amsmath}'

plt.rcParams["figure.figsize"] = 11.7, 8.3
plt.rcParams["figure.dpi"] = 75

plt.rcParams["font.size"] = 28
plt.rcParams["font.family"] = "sans-serif"
plt.rcParams["font.sans-serif"] = ["Fira Sans", 'PT Sans', 'Open Sans', 'Roboto', 'DejaVu Sans', 'Liberation Sans', 'sans-serif']

plt.rcParams["legend.frameon"] = True
plt.rcParams["legend.fancybox"] = True
plt.rcParams["legend.fontsize"] = "small"

plt.rcParams["lines.linewidth"] = 2.5
plt.rcParams["lines.markersize"] = 14
plt.rcParams["lines.markeredgewidth"] = 2

plt.rcParams["xtick.major.size"] = 8
plt.rcParams["ytick.major.size"] = 8

In [3]:
import re
import pandas as pd

def extract_info_from_file(filename):
    """
    Extract parameters from the given filename and file content.
    
    Extracted parameters:
      - model: constant "SGFormer"
      - with_features: not in filename (set to None)
      - nc_test_size: not in filename (set to None)
      - Beta_s: extracted from pattern "B_s_"
      - gamma_s: extracted from pattern "g_s_"
      - Ns_obs: extracted from pattern "Ns_obs_"
      - kmean_s: extracted from pattern "k_s_"
      - gamma_n: extracted from pattern "g_n_"
      - kmean_n: extracted from pattern "k_n_"
      - gamma_f: extracted from pattern "g_f_"
      - N_f: extracted from pattern "N_f_"
      - Beta_bi: extracted from pattern "B_bi_"
      - nu: extracted from pattern "nu_"
      - alpha: extracted from pattern "alpha_" (handles negatives via "alpha_neg")
      - N_labels: extracted from pattern "N_l_"
      - i: extracted from pattern "i_"
      - accuracy: extracted from file content by looking for "Final Test:" value (ignores standard deviation)
    
    Parameters:
        filename (str): The name of the file to process.
    
    Returns:
        dict: A dictionary with all the extracted parameters.
    """
    
    # Define patterns for parameters in the filename (except for alpha which is handled separately)
    patterns = {
        'Beta_s': r'B_s_([\d\.]+)',
        'gamma_s': r'g_s_([\d\.]+)',
        'Ns_obs': r'Ns_obs_([\d\.]+)',
        'kmean_s': r'k_s_([\d\.]+)',
        'gamma_n': r'g_n_([\d\.]+)',
        'kmean_n': r'k_n_([\d\.]+)',
        'gamma_f': r'g_f_([\d\.]+)',
        'N_f': r'N_f_([\d\.]+)',
        'Beta_bi': r'B_bi_([\d\.]+)',
        'nu': r'nu_([\d\.]+)',
        'N_labels': r'N_l_([\d\.]+)',
        'i': r'_i_([\d\.]+)'
    }
    
    extracted = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, filename)
        extracted[key] = match.group(1) if match else None

    # Special handling for alpha: it can be like "alpha_5" or "alpha_neg1" (for -1).
    alpha_match = re.search(r'alpha_(neg)?([\d\.]+)', filename)
    if alpha_match:
        if alpha_match.group(1):  # "neg" was found, so use negative value.
            extracted['alpha'] = '-' + alpha_match.group(2)
        else:
            extracted['alpha'] = alpha_match.group(2)
    else:
        extracted['alpha'] = None

    # Set parameters not provided in the filename.
    extracted['with_features'] = None
    extracted['nc_test_size'] = None
    extracted['model'] = "SGFormer"

    # Read file content and extract accuracy (Final Test value).
    try:
        with open(filename, 'r') as file:
            content = file.read()
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        extracted['accuracy'] = None
        return extracted

    # Extract Final Test accuracy (ignoring the standard deviation)
    accuracy_match = re.search(r'Final Test:\s*([\d\.]+)', content)
    if accuracy_match:
        extracted['accuracy'] = accuracy_match.group(1)
    else:
        extracted['accuracy'] = None

    return extracted

def group_and_compute_mean_accuracy(df):
    """
    Group the DataFrame by all parameters except 'i' and 'accuracy',
    then compute the mean accuracy for each group.

    Returns:
        A new DataFrame with the grouped parameters and the mean accuracy.
    """
    # Define the grouping columns (all columns except 'i' and 'accuracy')
    group_columns = [col for col in df.columns if col not in ['i', 'accuracy']]
    
    # Group by these columns and compute the mean of accuracy
    grouped_df = df.groupby(group_columns, as_index=False).mean()
    return grouped_df

def process_files(file_list):
    """
    Process a list of filenames, extract parameters from each file, 
    and return a pandas DataFrame containing the results.
    
    DataFrame columns:
      ['model', 'with_features', 'nc_test_size', 'Beta_s', 'gamma_s', 'Ns_obs', 
       'kmean_s', 'gamma_n', 'kmean_n', 'gamma_f', 'N_f', 'Beta_bi', 'nu', 
       'alpha', 'N_labels', 'i', 'accuracy']
    
    Parameters:
        file_list (list of str): List of filenames to process.
    
    Returns:
        pd.DataFrame: DataFrame with the extracted parameters.
    """
    data = []
    for filename in file_list:
        info = extract_info_from_file(filename)
        data.append(info)
    
    columns = ['model', 'with_features', 'nc_test_size', 'Beta_s', 'gamma_s', 'Ns_obs',
               'kmean_s', 'gamma_n', 'kmean_n', 'gamma_f', 'N_f', 'Beta_bi', 'nu',
               'alpha', 'N_labels', 'i', 'accuracy']
    df = pd.DataFrame(data, columns=columns)
    
    df['accuracy'] = df['accuracy'].astype(float)
    df['i'] = df['i'].astype(int)
    return df

In [4]:
files = list(glob.glob("/home/rob/repo/SGFormer-HypNF/medium/results/*"))

df = process_files(files)
df.head()

Unnamed: 0,model,with_features,nc_test_size,Beta_s,gamma_s,Ns_obs,kmean_s,gamma_n,kmean_n,gamma_f,N_f,Beta_bi,nu,alpha,N_labels,i,accuracy
0,SGFormer,,,3,2.1,5000,30,2.1,3,3.5,2000,1.1,0.0,-1,2,0,72.22
1,SGFormer,,,3,3.5,5000,3,3.5,3,3.5,2000,3.0,0.0,1,2,0,52.48
2,SGFormer,,,3,3.5,5000,30,3.5,3,3.5,2000,1.1,0.0,1,10,0,41.69
3,SGFormer,,,3,3.5,5000,30,3.5,3,2.1,2000,3.0,0.0,1,6,2,46.31
4,SGFormer,,,3,3.5,5000,3,3.5,30,3.5,2000,3.0,0.0,5,2,1,82.71


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20480 entries, 0 to 20479
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   model          20480 non-null  object 
 1   with_features  0 non-null      object 
 2   nc_test_size   0 non-null      object 
 3   Beta_s         20480 non-null  object 
 4   gamma_s        20480 non-null  object 
 5   Ns_obs         20480 non-null  object 
 6   kmean_s        20480 non-null  object 
 7   gamma_n        20480 non-null  object 
 8   kmean_n        20480 non-null  object 
 9   gamma_f        20480 non-null  object 
 10  N_f            20480 non-null  object 
 11  Beta_bi        20480 non-null  object 
 12  nu             20480 non-null  object 
 13  alpha          20480 non-null  object 
 14  N_labels       20480 non-null  object 
 15  i              20480 non-null  int64  
 16  accuracy       20480 non-null  float64
dtypes: float64(1), int64(1), object(15)
memory usage: 

In [6]:
group_columns = [col for col in df.columns if col not in ['i', 'accuracy', 'with_features', 'nc_test_size']]
group_columns

['model',
 'Beta_s',
 'gamma_s',
 'Ns_obs',
 'kmean_s',
 'gamma_n',
 'kmean_n',
 'gamma_f',
 'N_f',
 'Beta_bi',
 'nu',
 'alpha',
 'N_labels']

In [7]:
df_grouped = df.groupby(group_columns, as_index=False).mean()

df_grouped['with_features'] = 1
df_grouped['nc_test_size'] = 0.7 # wait for results
df_grouped

Unnamed: 0,model,Beta_s,gamma_s,Ns_obs,kmean_s,gamma_n,kmean_n,gamma_f,N_f,Beta_bi,nu,alpha,N_labels,with_features,nc_test_size,i,accuracy
0,SGFormer,1.1,2.1,5000,3,2.1,3,2.1,2000,1.1,0.0,-1,10,1,0.7,4.5,15.110
1,SGFormer,1.1,2.1,5000,3,2.1,3,2.1,2000,1.1,0.0,-1,2,1,0.7,4.5,64.901
2,SGFormer,1.1,2.1,5000,3,2.1,3,2.1,2000,1.1,0.0,-1,3,1,0.7,4.5,48.585
3,SGFormer,1.1,2.1,5000,3,2.1,3,2.1,2000,1.1,0.0,-1,6,1,0.7,4.5,25.403
4,SGFormer,1.1,2.1,5000,3,2.1,3,2.1,2000,1.1,0.0,1,10,1,0.7,4.5,34.206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043,SGFormer,3,3.5,5000,30,3.5,30,3.5,2000,3,0.0,10,6,1,0.7,4.5,93.417
2044,SGFormer,3,3.5,5000,30,3.5,30,3.5,2000,3,0.0,5,10,1,0.7,4.5,82.844
2045,SGFormer,3,3.5,5000,30,3.5,30,3.5,2000,3,0.0,5,2,1,0.7,4.5,81.800
2046,SGFormer,3,3.5,5000,30,3.5,30,3.5,2000,3,0.0,5,3,1,0.7,4.5,88.578


In [8]:
# df_grouped.to_csv("../results/sgformer_nc.csv", index=False)