In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import networkx as nx
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import random

print("Libraries imported successfully.")


Libraries imported successfully.


In [None]:
# Section 2: Preprocessing and Graph Construction

# File paths
original_features_path = "../git_web_ml/musae_git_features.json"
original_edges_path = "../git_web_ml/musae_git_edges.csv"

# Load data
def load_data():
    print("Loading original features and edges...")
    with open(original_features_path, "r") as f:
        features = json.load(f)
    print(f"Loaded {len(features)} nodes with features.")
    
    edges = pd.read_csv(
        original_edges_path, names=["source", "target"], skiprows=1
    )  # Skip header row
    print(f"Loaded {len(edges)} edges.")
    return features, edges

# Create graph and extract largest connected component
def create_graph(features, edges):
    print("Creating graph from edges...")
    edges["source"] = edges["source"].astype(str)
    edges["target"] = edges["target"].astype(str)
    G = nx.from_pandas_edgelist(edges, source="source", target="target")
    print(f"Graph created with {len(G.nodes)} nodes and {len(G.edges)} edges.")

    print("Identifying the largest connected component...")
    largest_cc = max(nx.connected_components(G), key=len)
    G_lcc = G.subgraph(largest_cc).copy()
    print(f"Largest connected component has {len(G_lcc.nodes)} nodes and {len(G_lcc.edges)} edges.")

    print("Assigning features to nodes...")
    for node, feats in features.items():
        if node in G_lcc.nodes:
            G_lcc.nodes[node]["features"] = feats
    print("Node features assigned.")
    
    return G_lcc

# Standardize features using MultiLabelBinarizer and PCA
def standardize_features(G, output_dim=128):
    print("Standardizing features to fixed dimensions...")
    feature_list = [
        set(feats) for feats in nx.get_node_attributes(G, "features").values()
    ]
    mlb = MultiLabelBinarizer()
    binary_features = mlb.fit_transform(feature_list)
    print(f"Initial feature matrix shape: {binary_features.shape}")

    if binary_features.shape[1] > output_dim:
        print(f"Reducing dimensions to {output_dim} using PCA...")
        pca = PCA(n_components=output_dim)
        reduced_features = pca.fit_transform(binary_features)
        print(f"Feature matrix shape after PCA: {reduced_features.shape}")
    else:
        print(f"No dimensionality reduction needed. Retaining shape {binary_features.shape}")
        reduced_features = binary_features

    print("Assigning standardized features back to nodes...")
    for idx, node in enumerate(G.nodes):
        G.nodes[node]["features"] = reduced_features[idx]
    print("Feature standardization complete.")

# Load, process, and standardize graph
print("Starting graph preprocessing...")
features, edges = load_data()
G = create_graph(features, edges)
standardize_features(G, output_dim=128)
print(f"Graph preprocessing complete. Final graph has {len(G.nodes)} nodes and {len(G.edges)} edges.")


Starting graph preprocessing...
Loading original features and edges...
Loaded 37700 nodes with features.
Loaded 289003 edges.
Creating graph from edges...
Graph created with 37700 nodes and 289003 edges.
Identifying the largest connected component...
Largest connected component has 37700 nodes and 289003 edges.
Assigning features to nodes...
Node features assigned.
Standardizing features to fixed dimensions...
Initial feature matrix shape: (37700, 4005)
Reducing dimensions to 128 using PCA...
Feature matrix shape after PCA: (37700, 128)
Assigning standardized features back to nodes...
Feature standardization complete.
Graph preprocessing complete. Final graph has 37700 nodes and 289003 edges.


In [9]:
# Section 3A: Create Feature Vectors

def create_feature_vectors(G, edges):
    print("Creating feature vectors for ML tasks...")
    X, y = [], []

    print("Processing positive samples (existing edges)...")
    for i, (_, row) in enumerate(edges.iterrows()):
        node1, node2 = str(row["source"]), str(row["target"])
        if node1 in G.nodes and node2 in G.nodes:
            feature_vector = np.array(G.nodes[node1]["features"]) - np.array(G.nodes[node2]["features"])
            X.append(feature_vector)
            y.append(1)
        if i % 1000 == 0:
            print(f"Processed {i} positive samples.")

    print("Generating negative samples (random non-existing edges)...")
    all_nodes = list(G.nodes)
    for i in range(len(edges)):
        node1, node2 = np.random.choice(all_nodes, 2, replace=False)
        if not G.has_edge(node1, node2):
            feature_vector = np.array(G.nodes[node1]["features"]) - np.array(G.nodes[node2]["features"])
            X.append(feature_vector)
            y.append(0)
        if i % 1000 == 0:
            print(f"Generated {i} negative samples.")

    print(f"Feature vectors created. Total samples: {len(X)}")
    return np.array(X), np.array(y)

# Create and split feature vectors
print("Creating and splitting feature vectors...")
X, y = create_feature_vectors(G, edges)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")


Creating and splitting feature vectors...
Creating feature vectors for ML tasks...
Processing positive samples (existing edges)...
Processed 0 positive samples.
Processed 1000 positive samples.
Processed 2000 positive samples.
Processed 3000 positive samples.
Processed 4000 positive samples.
Processed 5000 positive samples.
Processed 6000 positive samples.
Processed 7000 positive samples.
Processed 8000 positive samples.
Processed 9000 positive samples.
Processed 10000 positive samples.
Processed 11000 positive samples.
Processed 12000 positive samples.
Processed 13000 positive samples.
Processed 14000 positive samples.
Processed 15000 positive samples.
Processed 16000 positive samples.
Processed 17000 positive samples.
Processed 18000 positive samples.
Processed 19000 positive samples.
Processed 20000 positive samples.
Processed 21000 positive samples.
Processed 22000 positive samples.
Processed 23000 positive samples.
Processed 24000 positive samples.
Processed 25000 positive samples

In [10]:
# Section 3B: Train the Neural Network

# Define the neural network
print("Defining the neural network model...")
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation="relu", input_shape=(X_train.shape[1],), name="Input_Layer"),
    tf.keras.layers.Dense(64, activation="relu", name="Hidden_Layer_1"),
    tf.keras.layers.Dense(32, activation="relu", name="Hidden_Layer_2"),
    tf.keras.layers.Dense(1, activation="sigmoid", name="Output_Layer"),
])
print("Model defined successfully.")
model.summary()

# Compile the model
print("Compiling the model...")
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
print("Model compiled successfully.")

# Define a custom callback for logging
class TrainingLogger(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print(f"\nEpoch {epoch + 1}:")
        print(
            f"  Training Loss: {logs['loss']:.4f}, Training Accuracy: {logs['accuracy']:.4f}"
        )
        print(
            f"  Validation Loss: {logs['val_loss']:.4f}, Validation Accuracy: {logs['val_accuracy']:.4f}"
        )

# Train the model
print("Starting model training...")
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[TrainingLogger()],
    verbose=0  # Suppress default verbose to use custom logging
)
print("Model training complete.")

# Evaluate the model
print("Evaluating the model on the test set...")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Defining the neural network model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Model defined successfully.


Compiling the model...
Model compiled successfully.
Starting model training...

Epoch 1:
  Training Loss: 0.3217, Training Accuracy: 0.8577
  Validation Loss: 0.2887, Validation Accuracy: 0.8749

Epoch 2:
  Training Loss: 0.2754, Training Accuracy: 0.8810
  Validation Loss: 0.2765, Validation Accuracy: 0.8804

Epoch 3:
  Training Loss: 0.2611, Training Accuracy: 0.8872
  Validation Loss: 0.2709, Validation Accuracy: 0.8832

Epoch 4:
  Training Loss: 0.2522, Training Accuracy: 0.8910
  Validation Loss: 0.2631, Validation Accuracy: 0.8866

Epoch 5:
  Training Loss: 0.2456, Training Accuracy: 0.8935
  Validation Loss: 0.2618, Validation Accuracy: 0.8872

Epoch 6:
  Training Loss: 0.2400, Training Accuracy: 0.8955
  Validation Loss: 0.2612, Validation Accuracy: 0.8888

Epoch 7:
  Training Loss: 0.2352, Training Accuracy: 0.8981
  Validation Loss: 0.2637, Validation Accuracy: 0.8882

Epoch 8:
  Training Loss: 0.2309, Training Accuracy: 0.8998
  Validation Loss: 0.2629, Validation Accuracy: 

In [14]:
def predict_next_node(model, G, current_node, target_node, visited, prediction_cache):
    print(f"Predicting next node from current node: {current_node}, target: {target_node}")
    
    neighbors = list(G.neighbors(current_node))
    neighbors = [n for n in neighbors if n not in visited]
    
    if not neighbors:
        print("No unvisited neighbors available.")
        return None  # No unvisited neighbors

    # Check if the target node is one of the neighbors
    if target_node in neighbors:
        print(f"Target node {target_node} is a direct neighbor of {current_node}. Auto-selecting target.")
        return target_node  # Auto-select the target node

    # Cache predictions to avoid redundant computations
    if current_node not in prediction_cache:
        target_features = G.nodes[target_node]["features"]
        predictions = []
        for neighbor in neighbors:
            neighbor_features = G.nodes[neighbor]["features"]
            feature_vector = neighbor_features - target_features
            prob = model.predict(feature_vector.reshape(1, -1))[0][0]
            predictions.append((neighbor, prob))
            print(f"Prediction for neighbor {neighbor}: {prob:.4f}")
        predictions.sort(key=lambda x: x[1], reverse=True)
        prediction_cache[current_node] = predictions
    else:
        print(f"Using cached predictions for {current_node}")
        predictions = prediction_cache[current_node]

    # Select the next node based on predictions
    for neighbor, prob in predictions:
        if neighbor not in visited:
            print(f"Next node selected: {neighbor} with probability {prob:.4f}")
            return neighbor

    print("No valid next node found.")
    return None


# Find the path with a limit on the number of hops
def find_path(model, G, source, target, max_hops=40):
    print(f"Starting pathfinding from source: {source} to target: {target}, with max hops: {max_hops}")
    current_node = source
    visited = set()
    prediction_cache = {}  # Cache predictions to avoid recomputation
    path = [source]
    hops = 0

    while current_node != target:
        visited.add(current_node)
        next_node = predict_next_node(model, G, current_node, target, visited, prediction_cache)
        if next_node is None:
            print(f"Pathfinding failed: no valid neighbors from {current_node}.")
            return None  # No path found
        path.append(next_node)
        current_node = next_node
        hops += 1

        if hops > max_hops:
            print(f"Pathfinding terminated: exceeded max hops ({max_hops}).")
            return None

    print(f"Pathfinding complete. Path: {path}")
    return path

# Example test of pathfinding
source_node = random.choice(list(G.nodes))  # Random source node
target_node = random.choice(list(G.nodes))  # Random target node, ensure it's different
while target_node == source_node:
    target_node = random.choice(list(G.nodes))

print(f"Source: {source_node}, Target: {target_node}")
path = find_path(model, G, source_node, target_node)

if path:
    print(f"Path found: {path}")
    print(f"Number of hops: {len(path) - 1}")
else:
    print("No path found.")


Source: 28323, Target: 6822
Starting pathfinding from source: 28323 to target: 6822, with max hops: 40
Predicting next node from current node: 28323, target: 6822
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Prediction for neighbor 10761: 0.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Prediction for neighbor 13575: 0.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Prediction for neighbor 30962: 0.0006
Next node selected: 30962 with probability 0.0006
Predicting next node from current node: 30962, target: 6822
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Prediction for neighbor 19253: 0.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Prediction for neighbor 1407: 0.9211
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Prediction for neighbor 7027: 0.0341
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/st

In [None]:
# Section 5: Perform Multiple Pathfinding Runs

# Perform 20 runs with random start and target nodes
def evaluate_pathfinding(model, G, max_hops=40, num_runs=20):
    total_hops = 0
    successful_runs = 0

    for run in range(num_runs):
        print(f"\nRun {run + 1}/{num_runs}")
        
        # Randomly choose start and target nodes
        source_node, target_node = random.sample(list(G.nodes), 2)
        print(f"Source: {source_node}, Target: {target_node}")
        
        path = find_path(model, G, source_node, target_node, max_hops)
        
        if path:
            num_hops = len(path) - 1  # Number of hops is the length of the path minus 1
            print(f"Path found with {num_hops} hops: {path}")
            total_hops += num_hops
            successful_runs += 1
        else:
            print("Run terminated or no path found.")
    
    if successful_runs > 0:
        average_hops = total_hops / successful_runs
        success_rate = successful_runs / num_runs * 100
    else:
        average_hops = float('inf')  # No successful runs
        success_rate = 0.0
    
    print(f"\n--- Results ---")
    print(f"Successful runs: {successful_runs}/{num_runs}")
    print(f"Success rate: {success_rate:.2f}%")
    print(f"Average number of hops: {average_hops:.2f}" if successful_runs > 0 else "No successful runs.")
    
    return success_rate, average_hops

# Run evaluation
success_rate, average_hops = evaluate_pathfinding(model, G, max_hops=20, num_runs=20)
