# Model Training

In [1]:
import networkx as nx
from networkx.algorithms.link_prediction import jaccard_coefficient

import random

import numpy as np
import matplotlib
import matplotlib.pylab as plt
import pickle
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, classification_report



from concurrent.futures import ProcessPoolExecutor

import copy

In [2]:
# Import normalized data
with open('03-normalized-data/predictors.pkl', 'rb') as f:
    predictors = pickle.load(f)

baseline_p = predictors[5] # Remove the baseline predictor from the predictors

predictors = predictors[:5]


# Save data for later use



# Import edgenonedge data
with open('04-edgenonedge-data/edgenonedge_data.pkl', 'rb') as f:
    edgenonedge_data = pickle.load(f)

# Save data 
with open('05-edge-metadata/edge_metadata.pkl', 'rb') as f:
    edge_metadata = pickle.load(f)


    

predictor_names = ['Jaccard Coefficient', 'Adamic-Adar Index', 'Geodesic Distance',
                   'Common Neighbors', 'Degree Product']



In [3]:
p = 0
g = 0
e = 5
print('Sample edge prediction score:', predictors[p][g][e])

Sample edge prediction score: [2.86248008e-11]


# Create Features and Labels

In [4]:

# FEATURES
# edge_metadata is the features, we don't need to edit it
# Lets examine the data rq...
print('Shape of edge metadata for first 10 graphs')
for i in range(10):
    print(edge_metadata[i].shape)

Shape of edge metadata for first 10 graphs
(363242, 16)
(353268, 16)
(280912, 16)
(179705, 16)
(99249, 16)
(313918, 16)
(174885, 16)
(507329, 16)
(296717, 16)
(168504, 16)


In [5]:
# LABELS
# The labels will be the index of the 'most correct' predictor
# If the edge actually exists, the 'most correct' predictor is the one with the highest score.
# If it doesn't exist, the 'most correct' predictor has the lowest score.

mces = []  # Place to store the most correct estimators for each graph

for g in range(len(edgenonedge_data)):  # Iterate over each graph
    g1_predictions = []

    for predictor in predictors:
        g1_predictions.append(predictor[g])
    g1_predictions = np.array(g1_predictions)

    g_ene = edgenonedge_data[g]





    # Find the most correct predictor for each edge (column-wise)
    most_correct_estimator = []
    edge_is_real = g_ene

    for i in range(g1_predictions.shape[1]):  # Iterate over each edge (column index)
        if edge_is_real[i]: # If missing edge is an edge in the origional graph
            index = np.argmax(g1_predictions[:, i])  # Find the predictor (row) with the maximum value for the edge
            most_correct_estimator.append(index)
        else:
            index = np.argmin(g1_predictions[:, i])  # Find the predictor (row) with the lowest value for the edge
            most_correct_estimator.append(index)

    # Convert the result to a NumPy array
    most_correct_estimator = np.array(most_correct_estimator).reshape(-1, 1)

    # Append to the global list
    mces.append(most_correct_estimator)


In [6]:
# Display the shape of the most correct estimator data for the first 10 graphs
print("Shape of Most-Correct-Estimator data for first 10 graphs:")
for i in range(min(10, len(mces))):  # Limit to 10 graphs or fewer if fewer graphs exist
    print(mces[i].shape)

print("Sample mce data for first graph:", mces[0][:20])

Shape of Most-Correct-Estimator data for first 10 graphs:
(363242, 1)
(353268, 1)
(280912, 1)
(179705, 1)
(99249, 1)
(313918, 1)
(174885, 1)
(507329, 1)
(296717, 1)
(168504, 1)
Sample mce data for first graph: [[0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [3]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [4]]


# Run Models

In [7]:
def inspect_custom_train_test_split_inputs(edge_metadata, mces, test_size, random_state):
    """
    Inspect the inputs to custom_train_test_split to verify their validity.

    Args:
        edge_metadata (list of np.ndarray): Feature data for each graph.
        mces (list of np.ndarray): Label data for each graph.
        test_size (float): Fraction of the data to be used for testing.
        random_state (int, optional): Random seed for reproducibility.
    """
    print("Inspecting Inputs to custom_train_test_split...")

    # Check test_size and random_state
    print(f"Test size: {test_size}")
    print(f"Random state: {random_state}")

    # Check edge_metadata
    print(f"Number of graphs in edge_metadata: {len(edge_metadata)}")
    for i in range(min(3, len(edge_metadata))):  # Inspect the first few graphs
        print(f"\nInspecting edge_metadata[{i}]:")
        print(f"  Shape: {edge_metadata[i].shape}")
        print(f"  First 3 sample rows:")
        for j in range(min(3, edge_metadata[i].shape[0])):  # Print first 3 rows if available
            print(f"    Row {j}: {edge_metadata[i][j]}")

    # Check mces
    print(f"\nNumber of graphs in mces: {len(mces)}")
    for i in range(min(3, len(mces))):  # Inspect the first few graphs
        print(f"\nInspecting mces[{i}]:")
        print(f"  Shape: {mces[i].shape}")
        print(f"  First 3 sample labels: {mces[i][:3]}")

    # Check alignment of edge_metadata and mces
    assert len(edge_metadata) == len(mces), "Mismatch: edge_metadata and mces must have the same number of graphs!"
    for i in range(len(edge_metadata)):
        assert edge_metadata[i].shape[0] == mces[i].shape[0], (
            f"Mismatch: edge_metadata[{i}] and mces[{i}] must have the same number of rows!"
        )

    print("\nInput inspection complete.")

# Call the function
inspect_custom_train_test_split_inputs(edge_metadata, mces, test_size=0.2, random_state=None)

Inspecting Inputs to custom_train_test_split...
Test size: 0.2
Random state: None
Number of graphs in edge_metadata: 26

Inspecting edge_metadata[0]:
  Shape: (363242, 16)
  First 3 sample rows:
    Row 0: [ 1.61000000e+02  2.00000000e+00  4.00512687e-02  6.20650058e-04
  8.55000000e+02  1.84300000e+03  5.04813947e-03  4.31111111e+00
 -1.00000000e+00  4.00000000e+00  1.08891926e-01  6.17283951e-03
  2.41363138e-01  2.00000000e+00  1.00000000e+00  3.22000000e+02]
    Row 1: [ 1.61000000e+02  3.00000000e+00  4.00512687e-02  7.79806973e-04
  8.55000000e+02  1.84300000e+03  5.04813947e-03  4.31111111e+00
 -1.00000000e+00  4.00000000e+00  1.08891926e-01  6.13496933e-03
  2.19593112e-01  2.00000000e+00  1.00000000e+00  4.83000000e+02]
    Row 2: [ 1.61000000e+02  4.00000000e+00  4.00512687e-02  9.72611570e-04
  8.55000000e+02  1.84300000e+03  5.04813947e-03  4.31111111e+00
 -1.00000000e+00  4.00000000e+00  1.08891926e-01  6.09756098e-03
  2.71085031e-01  2.00000000e+00  1.00000000e+00  6.440

In [8]:
import numpy as np
from sklearn.utils import shuffle  # Utility for shuffling data

def custom_train_test_split(edge_metadata, mces, test_size=0.2, random_state=None):
    """
    Custom function to split graph-level data into training and testing sets.

    Parameters:
        edge_metadata (list of np.ndarray): Feature data for each graph.
        mces (list of np.ndarray): Label data for each graph.
        test_size (float): Fraction of the data to be used for testing.
        random_state (int, optional): Random seed for reproducibility.

    Returns:
        X_train_combined (np.ndarray): Combined and shuffled training data for all graphs.
        y_train_combined (np.ndarray): Combined and shuffled training labels for all graphs.
        X_test (list of np.ndarray): List of testing data for each graph.
        y_test (list of np.ndarray): List of testing labels for each graph.
    """
    # Set random seed for reproducibility
    if random_state is not None:
        np.random.seed(random_state)

    # Determine the number of graphs
    num_graphs = len(edge_metadata)

    # Create a shuffled list of graph indices
    shuffled_indices = np.random.permutation(num_graphs)

    # Split indices into training and testing
    split_idx = int((1 - test_size) * num_graphs)
    train_indices = shuffled_indices[:split_idx]
    test_indices = shuffled_indices[split_idx:]

    # Split the data
    X_train = [edge_metadata[i] for i in train_indices]
    y_train = [mces[i] for i in train_indices]
    X_test = [edge_metadata[i] for i in test_indices]
    y_test = [mces[i] for i in test_indices]

    # Combine training data into single arrays
    X_train_combined = np.vstack(X_train)
    y_train_combined = np.vstack(y_train)

    # Shuffle the combined training data
    X_train_combined, y_train_combined = shuffle(X_train_combined, y_train_combined, random_state=random_state)

    return X_train_combined, y_train_combined, X_test, y_test

X_train, y_train, X_test, y_test =  custom_train_test_split(edge_metadata, mces, test_size=0.2, random_state=69)

In [9]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
for i in range(3):
    print("Sample X_train row:", X_train[i])
    print("Sample y_train label:", y_train[i])

X_train shape: (4810004, 16)
y_train shape: (4810004, 1)
Sample X_train row: [ 4.00000000e+00  3.00000000e+00  1.19261269e-03  7.92654288e-04
  8.68000000e+02  1.75300000e+03  4.65878951e-03  4.03917051e+00
 -1.00000000e+00  1.00000000e+01  8.49378538e-02  0.00000000e+00
  0.00000000e+00  4.00000000e+00  0.00000000e+00  1.20000000e+01]
Sample y_train label: [3]
Sample X_train row: [ 2.00000000e+00  2.00000000e+00  6.36009151e-04  6.97667192e-04
  7.61000000e+02  1.59200000e+03  5.50522166e-03  4.18396846e+00
 -1.00000000e+00  1.20000000e+01  1.05207474e-01  0.00000000e+00
  0.00000000e+00  3.00000000e+00  0.00000000e+00  4.00000000e+00]
Sample y_train label: [3]
Sample X_train row: [ 1.00000000e+00  3.00000000e+00  3.17028706e-04  9.00036755e-04
  1.01000000e+03  2.21600000e+03  4.34897801e-03  4.38811881e+00
 -1.00000000e+00  1.20000000e+01  9.90114922e-02  0.00000000e+00
  0.00000000e+00  5.00000000e+00  0.00000000e+00  3.00000000e+00]
Sample y_train label: [3]


In [10]:
for i in range(5):
    print(f"Row {i}: {X_train[i]}")

Row 0: [ 4.00000000e+00  3.00000000e+00  1.19261269e-03  7.92654288e-04
  8.68000000e+02  1.75300000e+03  4.65878951e-03  4.03917051e+00
 -1.00000000e+00  1.00000000e+01  8.49378538e-02  0.00000000e+00
  0.00000000e+00  4.00000000e+00  0.00000000e+00  1.20000000e+01]
Row 1: [ 2.00000000e+00  2.00000000e+00  6.36009151e-04  6.97667192e-04
  7.61000000e+02  1.59200000e+03  5.50522166e-03  4.18396846e+00
 -1.00000000e+00  1.20000000e+01  1.05207474e-01  0.00000000e+00
  0.00000000e+00  3.00000000e+00  0.00000000e+00  4.00000000e+00]
Row 2: [ 1.00000000e+00  3.00000000e+00  3.17028706e-04  9.00036755e-04
  1.01000000e+03  2.21600000e+03  4.34897801e-03  4.38811881e+00
 -1.00000000e+00  1.20000000e+01  9.90114922e-02  0.00000000e+00
  0.00000000e+00  5.00000000e+00  0.00000000e+00  3.00000000e+00]
Row 3: [ 2.00000000e+00  2.00000000e+00  6.09951159e-04  6.74972793e-04
  8.68000000e+02  1.75300000e+03  4.65878951e-03  4.03917051e+00
 -1.00000000e+00  1.00000000e+01  8.49378538e-02  0.0000000

In [None]:
from sklearn.naive_bayes import GaussianNB

# With all 0.8 training data from metabolic networks, took 7.5 min to train

# Initialize the Gaussian Naive Bayes Classifier
nb = GaussianNB()

# Train the Model
nb.fit(X_train, y_train.ravel())  # Ensure y_train is a 1D array for Naive Bayes

X_train shape: (4810004, 16)
y_train shape: (4810004, 1)
Unique rows in X_train: 3338305
Unique labels in y_train: [0 1 2 3 4]
First 5 rows of X_train:
 [[ 4.00000000e+00  3.00000000e+00  1.19261269e-03  7.92654288e-04
   8.68000000e+02  1.75300000e+03  4.65878951e-03  4.03917051e+00
  -1.00000000e+00  1.00000000e+01  8.49378538e-02  0.00000000e+00
   0.00000000e+00  4.00000000e+00  0.00000000e+00  1.20000000e+01]
 [ 2.00000000e+00  2.00000000e+00  6.36009151e-04  6.97667192e-04
   7.61000000e+02  1.59200000e+03  5.50522166e-03  4.18396846e+00
  -1.00000000e+00  1.20000000e+01  1.05207474e-01  0.00000000e+00
   0.00000000e+00  3.00000000e+00  0.00000000e+00  4.00000000e+00]
 [ 1.00000000e+00  3.00000000e+00  3.17028706e-04  9.00036755e-04
   1.01000000e+03  2.21600000e+03  4.34897801e-03  4.38811881e+00
  -1.00000000e+00  1.20000000e+01  9.90114922e-02  0.00000000e+00
   0.00000000e+00  5.00000000e+00  0.00000000e+00  3.00000000e+00]
 [ 2.00000000e+00  2.00000000e+00  6.09951159e-04  6

In [15]:
from sklearn.metrics import accuracy_score, classification_report

g = 0  # Test on the first graph of the testing graphs

# Make Predictions
y_pred = nb.predict(X_test[g])

# Evaluate the Model
accuracy = accuracy_score(y_test[g], y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("Classification Report:")
print(classification_report(y_test[g], y_pred))

# # Feature Importances
# print("\nFeature Importances:")
# feature_importances = nb.feature_importances_  # Get feature importances from the model
# feature_names = [f"Feature {i}" for i in range(X_train.shape[1])]  # Name the features generically

# for feature, importance in zip(feature_names, feature_importances):
#     print(f"{feature}: {importance:.4f}")

Accuracy: 0.49
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.13      0.22      6811
           1       0.89      0.13      0.23     24495
           2       0.05      0.05      0.05        42
           3       0.46      0.99      0.63     22148
           4       0.04      0.28      0.07       158

    accuracy                           0.49     53654
   macro avg       0.40      0.32      0.24     53654
weighted avg       0.67      0.49      0.39     53654



In [22]:
import pickle
import numpy as np

# Import normalized data
with open('03-normalized-data/predictors.pkl', 'rb') as f:
    predictors = pickle.load(f)

baseline_p = predictors[5]  # Remove the baseline predictor from the predictors
predictors = predictors[:5]

# Import edgenonedge data
with open('04-edgenonedge-data/edgenonedge_data.pkl', 'rb') as f:
    edgenonedge_data = pickle.load(f)

# Import edge metadata
with open('05-edge-metadata/edge_metadata.pkl', 'rb') as f:
    edge_metadata = pickle.load(f)

# Predictor names
predictor_names = ['Jaccard Coefficient', 'Adamic-Adar Index', 'Geodesic Distance',
                   'Common Neighbors', 'Degree Product']


def compute_average_scores_from_split(edge_metadata, edgenonedge_data, predictors, predictor_names, test_size=0.2, random_state=69):
    """
    Compute the average score for real edges and non-edges for each predictor based on a custom train-test split.

    Parameters:
        edge_metadata (list of np.ndarray): Feature data for each graph.
        edgenonedge_data (list of np.ndarray): Label data (1 for real edges, 0 for non-edges).
        predictors (list of np.ndarray): Predictor scores for each graph.
        predictor_names (list): List of predictor names.
        test_size (float): Fraction of the data to be used for testing.
        random_state (int): Random seed for reproducibility.

    Returns:
        dict: A dictionary with average scores for real and non-edges for each predictor.
    """
    # Perform custom train-test split
    def custom_train_test_split(edge_metadata, edgenonedge_data, test_size=0.2, random_state=None):
        np.random.seed(random_state)
        num_graphs = len(edge_metadata)
        shuffled_indices = np.random.permutation(num_graphs)
        split_idx = int((1 - test_size) * num_graphs)
        train_indices = shuffled_indices[:split_idx]
        test_indices = shuffled_indices[split_idx:]
        return train_indices, test_indices

    # Split data
    train_indices, test_indices = custom_train_test_split(edge_metadata, edgenonedge_data, test_size, random_state)

    # Extract test data
    test_metadata = [edge_metadata[i] for i in test_indices]
    test_labels = [edgenonedge_data[i] for i in test_indices]
    test_predictors = [predictors[i] for i in test_indices]

    # Flatten test data for evaluation
    X_test = np.vstack(test_metadata)
    y_test = np.hstack(test_labels)
    predictor_scores = np.vstack(test_predictors)

    # Calculate average scores
    results = {}
    for i, name in enumerate(predictor_names):
        scores = predictor_scores[:, i]
        real_edge_scores = scores[y_test == 1]
        non_edge_scores = scores[y_test == 0]

        avg_real_edge_score = real_edge_scores.mean() if len(real_edge_scores) > 0 else 0
        avg_non_edge_score = non_edge_scores.mean() if len(non_edge_scores) > 0 else 0

        results[name] = {
            "Average Real Edge Score": avg_real_edge_score,
            "Average Non-Edge Score": avg_non_edge_score
        }

    return results


# Example usage
average_scores = compute_average_scores_from_split(edge_metadata, edgenonedge_data, predictors, predictor_names)

# Print results
for predictor, scores in average_scores.items():
    print(f"{predictor}:")
    print(f"  Average Real Edge Score: {scores['Average Real Edge Score']:.4f}")
    print(f"  Average Non-Edge Score: {scores['Average Non-Edge Score']:.4f}")

IndexError: list index out of range