# Analyze distribution of lambda_2 in general graphs.

In [None]:
from tqdm.contrib.concurrent import process_map
from my_graphs_dataset import GraphDataset, GraphType
import networkx as nx
import numpy as np
import plotly.express as px
import pandas as pd
from scipy.stats import zscore, skew, boxcox, yeojohnson


def algebraic_connectivity(G):
    """Return the second-smallest eigenvalue (lambda_2) of the Laplacian of ``G``.

    Args:
        G: Graph accepted by ``nx.laplacian_matrix``.

    Returns:
        The eigenvalue at index 1 of the ascending Laplacian spectrum.

    Raises:
        IndexError: If the Laplacian has fewer than two eigenvalues.
    """
    L = nx.laplacian_matrix(G).toarray()
    # eigvalsh returns the eigenvalues in ascending order already, so the
    # second entry can be taken directly without an additional sort.
    return np.linalg.eigvalsh(L)[1]


def norm_algebraic_connectivity(G, N):
    """Return the algebraic connectivity of ``G`` divided by ``N``.

    Args:
        G: Graph passed on to ``algebraic_connectivity``.
        N: Divisor used for normalization (the caller in this file passes
            the number of nodes of ``G``).
    """
    connectivity = algebraic_connectivity(G)
    return connectivity / N


def spectral_radius(G):
    """Return the largest absolute eigenvalue of the Laplacian matrix of ``G``.

    Args:
        G: Graph accepted by ``nx.laplacian_matrix``.
    """
    laplacian = nx.laplacian_matrix(G).toarray()
    eigenvalues = np.linalg.eigvalsh(laplacian)
    magnitudes = np.absolute(eigenvalues)
    return max(magnitudes)


def worker(graph):
    """Compute the normalized algebraic connectivity of one raw graph.

    Args:
        graph: Raw graph as yielded by the loader; it is decoded with
            ``GraphDataset.parse_graph6`` (presumably a graph6 string).

    Returns:
        Tuple ``(metric, num_nodes)`` where ``metric`` is lambda_2 divided
        by the number of nodes.
    """
    parsed = GraphDataset.parse_graph6(graph)
    node_count = parsed.number_of_nodes()
    # Use algebraic_connectivity(parsed) here instead for the unnormalized value.
    metric = norm_algebraic_connectivity(parsed, node_count)
    return metric, node_count

In [None]:
import pickle

# Graph selection handed to GraphDataset: sizes 3-8 map to -1 (presumably
# "all graphs of that size" - confirm against GraphDataset), sizes 9 and 10
# map to 10000, and the random mix entry maps to (10000, sizes 11-20).
selection = {size: -1 for size in range(3, 9)}
selection.update({9: 10000, 10: 10000})
selection[GraphType.RANDOM_MIX] = (10000, range(11, 21))

loader = GraphDataset(selection=selection)

# Collected (metric, num_nodes) tuples returned by `worker`.
all_results = []

# If batch_size="auto", loader yields all graphs from individual files.
for graphs in loader.graphs(raw=True, batch_size=10000):
    # Process map runs the multiprocessing pool and displays a progress bar with tqdm.
    batch_results = process_map(worker, graphs, chunksize=1000)
    all_results += batch_results

# Save the results to a file so later cells can reload them without recomputing.
with open("graphs.pkl", "wb") as results_file:
    pickle.dump(all_results, results_file)

In [None]:
# Restore the previously computed results from "graphs.pkl".
# NOTE: pickle must only be used on trusted files, such as one written by this notebook.
import pickle

with open("graphs.pkl", mode="rb") as results_file:
    all_results = pickle.load(results_file)

In [None]:
# Prepare data for plotting: one row per result tuple, with the metric and
# the number of nodes as columns, plus the metric standardized (z-score)
# separately within each group of equal num_nodes.
df = pd.DataFrame(all_results, columns=['metric', 'num_nodes']).assign(
    metric_zscore=lambda frame: frame.groupby('num_nodes')['metric'].transform(zscore)
)

# Define a function to apply transformations and calculate skewness for each transformation
def calculate_skewness(group):
    """Return the skewness of ``group['metric']`` under several transformations.

    Args:
        group: DataFrame (or mapping of columns) providing a ``'metric'``
            column of numeric values.

    Returns:
        pd.Series indexed by transformation name (``'original'``, ``'log'``,
        ``'sqrt'``, ``'cbrt'``, ``'boxcox'``, ``'yeojohnson'``) holding the
        skewness of the transformed values. The ``'boxcox'`` entry is NaN
        when the group contains a non-positive value, because Box-Cox is
        undefined there.
    """
    values = group['metric']
    transformations = {
        # Original metric
        'original': skew(values),
        # Log transformation (log1p, i.e. log(1 + x), to avoid log(0))
        'log': skew(np.log1p(values)),
        # Square root transformation
        'sqrt': skew(np.sqrt(values)),
        # Cube root transformation
        'cbrt': skew(np.cbrt(values)),
    }
    # Box-Cox transformation: only defined for strictly positive data, and
    # scipy raises ValueError otherwise. Report NaN for such a group instead
    # of aborting the whole per-group summary.
    if (values > 0).all():
        transformations['boxcox'] = skew(boxcox(values)[0])
    else:
        transformations['boxcox'] = np.nan
    # Yeo-Johnson transformation (handles all values)
    transformations['yeojohnson'] = skew(yeojohnson(values)[0])

    return pd.Series(transformations)

# Apply the function to each group of `num_nodes` and create the new skewness DataFrame.
# Only the 'metric' column is selected before `apply`: the function reads nothing
# else, and letting `apply` operate on the grouping column is deprecated in
# recent pandas versions.
skewness_df = df.groupby('num_nodes')[['metric']].apply(calculate_skewness).reset_index()
skewness_df = skewness_df.round(5)

# Box-Cox transform the metric separately within each group of `num_nodes`
# (lambda is fitted per group). NOTE: boxcox requires strictly positive data
# and raises ValueError if a group contains a non-positive metric.
df['metric_unskewed'] = df.groupby('num_nodes')['metric'].transform(lambda x: boxcox(x)[0])

# Draw horizontal, one-sided violin plots per `num_nodes` (ridgeline-like layout),
# first for the raw metric and then for the Box-Cox transformed metric.
for column in ('metric', 'metric_unskewed'):
    fig = px.violin(df, x=column, y='num_nodes', points=False, box=True)
    fig.update_traces(orientation='h', side='positive', width=3, spanmode="hard")
    fig.show()

# Last expression of the cell: displays the skewness table in the notebook.
skewness_df
