# Imports

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import random
import pickle



In [None]:
import tensorflow as tf
print("Libraries imported successfully.")

# Preprocessing

In [None]:
# Section 2: Preprocessing and Graph Construction

# File paths
original_features_path = "../git_web_ml/musae_git_features.json"
original_edges_path = "../git_web_ml/musae_git_edges.csv"

# Load data
def load_data():
    print("Loading original features and edges...")
    with open(original_features_path, "r") as f:
        features = json.load(f)
    print(f"Loaded {len(features)} nodes with features.")

    edges = pd.read_csv(
        original_edges_path, names=["source", "target"], skiprows=1
    )
    print(f"Loaded {len(edges)} edges.")

    # ML/Web developer targets
    targets = pd.read_csv("../git_web_ml/musae_git_target.csv")
    target_dict = dict(zip(targets["id"].astype(str), targets["ml_target"]))
    print(f"Loaded {len(targets)} developer targets.")

    return features, edges, target_dict

# Create graph and extract largest connected component
def create_graph(features, edges, target_dict):
    print("Creating graph from edges...")
    edges["source"] = edges["source"].astype(str)
    edges["target"] = edges["target"].astype(str)
    G = nx.from_pandas_edgelist(edges, source="source", target="target")
    print(f"Graph created with {len(G.nodes)} nodes and {len(G.edges)} edges.")

    print("Identifying the largest connected component...")
    largest_cc = max(nx.connected_components(G), key=len)
    G_lcc = G.subgraph(largest_cc).copy()
    print(f"Largest connected component has {len(G_lcc.nodes)} nodes and {len(G_lcc.edges)} edges.")

    print("Computing clustering coefficients")
    clustering_coeffs = nx.clustering(G_lcc)

    print("Assigning features to nodes...")
    for node in G_lcc.nodes():
        # Base features
        if node in features:
            G_lcc.nodes[node]["features"] = features[node]
        else:
            G_lcc.nodes[node]["features"] = []

        additional_features = (
            [
                target_dict.get(node, -1),  # ML/Web developer target
                G_lcc.degree(node),  # Degree
                clustering_coeffs.get(node, 0),  # Clustering coefficient
            ]
        )
    G_lcc.nodes[node]["features"].extend(additional_features)
    print("Node features assigned.")

    return G_lcc

# Standardize features using MultiLabelBinarizer and PCA
def standardize_features(G, output_dim=128):
    print("Standardizing features to fixed dimensions...")
    feature_list = [
        set(feats) for feats in nx.get_node_attributes(G, "features").values()
    ]
    mlb = MultiLabelBinarizer()
    binary_features = mlb.fit_transform(feature_list)
    print(f"Initial feature matrix shape: {binary_features.shape}")

    if binary_features.shape[1] > output_dim:
        print(f"Reducing dimensions to {output_dim} using PCA...")
        pca = PCA(n_components=output_dim)
        reduced_features = pca.fit_transform(binary_features)
        print(f"Feature matrix shape after PCA: {reduced_features.shape}")
    else:
        print(f"No dimensionality reduction needed. Retaining shape {binary_features.shape}")
        reduced_features = binary_features

    print("Assigning standardized features back to nodes...")
    for idx, node in enumerate(G.nodes):
        G.nodes[node]["features"] = reduced_features[idx]
    print("Feature standardization complete.")

# Load, process, and standardize graph
print("Starting graph preprocessing...")
features, edges, target_dict = load_data()
G = create_graph(features, edges, target_dict)
standardize_features(G, output_dim=128)
print(f"Graph preprocessing complete. Final graph has {len(G.nodes)} nodes and {len(G.edges)} edges.")

Starting graph preprocessing...
Loading original features and edges...
Loaded 37700 nodes with features.
Loaded 289003 edges.
Creating graph from edges...
Graph created with 37700 nodes and 289003 edges.
Identifying the largest connected component...
Largest connected component has 37700 nodes and 289003 edges.
Assigning features to nodes...
Node features assigned.
Standardizing features to fixed dimensions...
Initial feature matrix shape: (37700, 4005)
Reducing dimensions to 128 using PCA...


In [None]:
def check_connectivity_bfs(G):
    print("Performing BFS to ensure all nodes are connected...")
    start_node = next(iter(G.nodes))  # Get an arbitrary starting node
    visited = set()
    queue = [start_node]

    while queue:
        node = queue.pop(0)
        if node not in visited:
            visited.add(node)
            queue.extend(neighbor for neighbor in G.neighbors(node) if neighbor not in visited)

    if len(visited) == len(G.nodes):
        print("All nodes are connected. The graph is a single connected component.")
    else:
        print(f"Graph is not fully connected. Only {len(visited)} out of {len(G.nodes)} nodes are reachable.")


check_connectivity_bfs(G)  # Ensure the graph is a single connected component

Performing BFS to ensure all nodes are connected...
All nodes are connected. The graph is a single connected component.


In [7]:
random_node = list(G.nodes())[0]
with open(original_features_path, "r") as f:
    original_features = json.load(f)
print("Original features for node:", original_features[random_node])

# Then see how they're transformed
print("\nTransformed features (after PCA):", G.nodes[random_node]["features"][:5])

Original features for node: [1574, 3773, 3571, 2672, 2478, 2534, 3129, 3077, 1171, 2045, 1539, 902, 1532, 2472, 1122, 2480, 3098, 2115, 1578]

Transformed features (after PCA): [1574, 3773, 3571, 2672, 2478]


# ML Model

## First Model

In [6]:
# Section 3A: Create Feature Vectors

def create_feature_vectors(G, edges):
    print("Creating feature vectors for ML tasks...")
    X, y = [], []

    print("Processing positive samples (existing edges)...")
    for i, (_, row) in enumerate(edges.iterrows()):
        node1, node2 = str(row["source"]), str(row["target"])
        if node1 in G.nodes and node2 in G.nodes:
            feature_vector = np.array(G.nodes[node1]["features"]) - np.array(G.nodes[node2]["features"])
            X.append(feature_vector)
            y.append(1)
        if i % 1000 == 0:
            print(f"Processed {i} positive samples.")

    print("Generating negative samples (random non-existing edges)...")
    all_nodes = list(G.nodes)
    for i in range(len(edges)):
        node1, node2 = np.random.choice(all_nodes, 2, replace=False)
        if not G.has_edge(node1, node2):
            feature_vector = np.array(G.nodes[node1]["features"]) - np.array(G.nodes[node2]["features"])
            X.append(feature_vector)
            y.append(0)
        if i % 1000 == 0:
            print(f"Generated {i} negative samples.")

    print(f"Feature vectors created. Total samples: {len(X)}")
    return np.array(X), np.array(y)

# Create and split feature vectors
print("Creating and splitting feature vectors...")
X, y = create_feature_vectors(G, edges)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

with open("feature_vectors.pkl", "wb") as f:
    pickle.dump((X, y, X_train, X_test, y_train, y_test), f)

print("Feature vectors and splits saved successfully.")


Creating and splitting feature vectors...
Creating feature vectors for ML tasks...
Processing positive samples (existing edges)...
Processed 0 positive samples.
Processed 1000 positive samples.
Processed 2000 positive samples.
Processed 3000 positive samples.
Processed 4000 positive samples.
Processed 5000 positive samples.
Processed 6000 positive samples.
Processed 7000 positive samples.
Processed 8000 positive samples.
Processed 9000 positive samples.
Processed 10000 positive samples.
Processed 11000 positive samples.
Processed 12000 positive samples.
Processed 13000 positive samples.
Processed 14000 positive samples.
Processed 15000 positive samples.
Processed 16000 positive samples.
Processed 17000 positive samples.
Processed 18000 positive samples.
Processed 19000 positive samples.
Processed 20000 positive samples.
Processed 21000 positive samples.
Processed 22000 positive samples.
Processed 23000 positive samples.
Processed 24000 positive samples.
Processed 25000 positive samples

In [None]:
with open("feature_vectors.pkl", "rb") as f:
    X, y, X_train, X_test, y_train, y_test = pickle.load(f)

print("Feature vectors and splits loaded successfully.")
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

Feature vectors and splits loaded successfully.
Training set size: 462303, Test set size: 115576


In [5]:
# Section 3B: Train the Neural Network

# Define the neural network
print("Defining the neural network model...")
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation="relu", input_shape=(X_train.shape[1],), name="Input_Layer"),
    tf.keras.layers.Dense(64, activation="relu", name="Hidden_Layer_1"),
    tf.keras.layers.Dense(32, activation="relu", name="Hidden_Layer_2"),
    tf.keras.layers.Dense(1, activation="sigmoid", name="Output_Layer"),
])
print("Model defined successfully.")
model.summary()

# Compile the model
print("Compiling the model...")
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
print("Model compiled successfully.")

# Define a custom callback for logging
class TrainingLogger(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print(f"\nEpoch {epoch + 1}:")
        print(
            f"  Training Loss: {logs['loss']:.4f}, Training Accuracy: {logs['accuracy']:.4f}"
        )
        print(
            f"  Validation Loss: {logs['val_loss']:.4f}, Validation Accuracy: {logs['val_accuracy']:.4f}"
        )

# Train the model
print("Starting model training...")
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[TrainingLogger()],
    verbose=0  # Suppress default verbose to use custom logging
)
print("Model training complete.")

# Evaluate the model
model.save("trained_model.h5")
print("Model saved successfully to 'trained_model.h5'.")

# Save the training history
with open("training_history.pkl", "wb") as f:
    pickle.dump(history.history, f)
print("Training history saved successfully to 'training_history.pkl'.")

# Evaluate the model
print("Evaluating the model on the test set...")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Defining the neural network model...
Model defined successfully.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Compiling the model...
Model compiled successfully.
Starting model training...

Epoch 1:
  Training Loss: 0.3206, Training Accuracy: 0.8582
  Validation Loss: 0.2825, Validation Accuracy: 0.8786

Epoch 2:
  Training Loss: 0.2741, Training Accuracy: 0.8815
  Validation Loss: 0.2704, Validation Accuracy: 0.8836

Epoch 3:
  Training Loss: 0.2588, Training Accuracy: 0.8879
  Validation Loss: 0.2629, Validation Accuracy: 0.8870

Epoch 4:
  Training Loss: 0.2497, Training Accuracy: 0.8917
  Validation Loss: 0.2590, Validation Accuracy: 0.8892

Epoch 5:
  Training Loss: 0.2427, Training Accuracy: 0.8948
  Validation Loss: 0.2591, Validation Accuracy: 0.8887

Epoch 6:
  Training Loss: 0.2374, Training Accuracy: 0.8972
  Validation Loss: 0.2573, Validation Accuracy: 0.8904

Epoch 7:
  Training Loss: 0.2329, Training Accuracy: 0.8991
  Validation Loss: 0.2572, Validation Accuracy: 0.8906

Epoch 8:
  Training Loss: 0.2288, Training Accuracy: 0.9011
  Validation Loss: 0.2586, Validation Accuracy: 




Epoch 20:
  Training Loss: 0.2000, Training Accuracy: 0.9137
  Validation Loss: 0.2774, Validation Accuracy: 0.8865
Model training complete.
Model saved successfully to 'trained_model.h5'.
Training history saved successfully to 'training_history.pkl'.
Evaluating the model on the test set...
[1m3612/3612[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 499us/step - accuracy: 0.8873 - loss: 0.2788
Test Loss: 0.2810
Test Accuracy: 0.8864


In [8]:
# Load the trained model
model = tf.keras.models.load_model("trained_model.h5")
print("Model loaded successfully from 'trained_model.h5'.")
model.summary()

# Load the training history
with open("training_history.pkl", "rb") as f:
    history = pickle.load(f)
print("Training history loaded successfully from 'training_history.pkl'.")

# Print the loaded training history (optional)
print("Loaded Training History:")
for key, values in history.items():
    print(f"{key}: {values[:5]}...")  # Show first 5 values as a preview



Model loaded successfully from 'trained_model.h5'.


Training history loaded successfully from 'training_history.pkl'.
Loaded Training History:
accuracy: [0.8582178354263306, 0.8815007209777832, 0.8878872394561768, 0.8916618227958679, 0.8948037028312683]...
loss: [0.32061806321144104, 0.2741466164588928, 0.2588496506214142, 0.2497376650571823, 0.2426992654800415]...
val_accuracy: [0.8786190748214722, 0.8836482167243958, 0.8870442509651184, 0.8892289996147156, 0.8887098431587219]...
val_loss: [0.28252294659614563, 0.2703949809074402, 0.26289796829223633, 0.25896233320236206, 0.2591072916984558]...


# Standard Methods

# Evaluation

In [None]:
def predict_next_node_ml(model, G, current_node, target_node, visited, prediction_cache, debug=False):
    if debug:
        print(f"Predicting next node from current node: {current_node}, target: {target_node}")

    neighbors = [n for n in G.neighbors(current_node) if n not in visited]

    if not neighbors:
        if debug:
            print("No unvisited neighbors available.")
        return None  # No unvisited neighbors

    # Check if the target node is one of the neighbors
    if target_node in neighbors:
        if debug:
            print(f"Target node {target_node} is a direct neighbor of {current_node}. Auto-selecting target.")
        return target_node  # Auto-select the target node

    # Cache predictions to avoid redundant computations
    if current_node not in prediction_cache:
        target_features = G.nodes[target_node]["features"]
        predictions = []
        for neighbor in neighbors:
            neighbor_features = G.nodes[neighbor]["features"]
            feature_vector = neighbor_features - target_features
            prob = model.predict(feature_vector.reshape(1, -1), verbose=0)[0][0]  # Suppress model output
            predictions.append((neighbor, prob))
            if debug:
                print(f"Prediction for neighbor {neighbor}: {prob:.4f}")
        predictions.sort(key=lambda x: x[1], reverse=True)
        prediction_cache[current_node] = predictions
    else:
        if debug:
            print(f"Using cached predictions for {current_node}")
        predictions = prediction_cache[current_node]

    # Select the next node based on predictions
    for neighbor, prob in predictions:
        if neighbor not in visited:
            if debug:
                print(f"Next node selected: {neighbor} with probability {prob:.4f}")
            return neighbor

    if debug:
        print("No valid next node found.")
    return None


def predict_next_node_degree(
    G, current_node, target_node, visited, prediction_cache, debug=False
):
    """Degree-based prediction - selects neighbor with highest degree."""
    # TODO: Implement degree-based routing strategy
    pass

ROUTING_STRATEGIES = {
    "ml": predict_next_node_ml,
    "degree": predict_next_node_degree,
}

In [None]:
# Find the path with a limit on the number of hops
def find_path(G, source, target, strategy="degree", max_hops=40, model=None, debug=False):
    if debug:
        print(
            f"Starting pathfinding from source: {source} to target: {target}, with max hops: {max_hops}"
        )
    if (model==None and strategy=="ml"):
        raise Exception("Need a model if using ML strategy")
    predict_next_node = ROUTING_STRATEGIES[strategy]
    current_node = source
    visited = set()
    prediction_cache = {}  # Cache predictions to avoid recomputation
    path = [source]
    hops = 0

    while current_node != target:
        print(f"Current Number of Hops: {hops}")
        print(f"Current Node: {current_node}")
        visited.add(current_node)
        if (strategy == "ml"):
            next_node = predict_next_node(
              model, G, current_node, target, visited, prediction_cache, debug=debug
          )
        else:
            next_node = predict_next_node(
                G, current_node, target, visited, prediction_cache, debug=debug
            )
        if next_node is None:
            if debug:
                print(f"Pathfinding failed: no valid neighbors from {current_node}.")
            return None  # No path found
        path.append(next_node)
        current_node = next_node
        hops += 1

        if hops > max_hops:
            if debug:
                print(f"Pathfinding terminated: exceeded max hops ({max_hops}).")
            return None

    if debug:
        print(f"Pathfinding complete. Path: {path}")
    return path

In [None]:
# Evaluate pathfinding with paths included in the output
def evaluate_pathfinding(
    G, model = None, strategy="degree", max_hops=20, num_runs=20, seed=42, debug=False
):
    random.seed(seed)
    total_hops = 0
    successful_runs = 0

    node_list = list(G.nodes)
    all_pairs = []
    for _ in range(num_runs):
        source, target = random.sample(node_list, 2)
        all_pairs.append((source, target))

    results = []
    for run, (source_node, target_node) in enumerate(all_pairs):
        print(f"Run {run + 1}/{num_runs}: Source {source_node} -> Target {target_node}")

        path = find_path(G, source_node, target_node, strategy, max_hops, model, debug)

        if path:
            num_hops = len(path) - 1
            print(f"  Path found in {num_hops} hops. Path: {path}")
            total_hops += num_hops
            successful_runs += 1
            results.append(
                {"run": run + 1, "success": True, "hops": num_hops, "path": path}
            )
        else:
            print("  No path found or run terminated.")
            results.append(
                {"run": run + 1, "success": False, "hops": None, "path": None}
            )

    success_rate = (successful_runs / num_runs) * 100
    average_hops = total_hops / successful_runs if successful_runs > 0 else float("inf")

    print(f"\n--- Summary ---")
    print(f"Success rate: {success_rate:.2f}% ({successful_runs}/{num_runs})")
    print(f"Average hops: {average_hops:.2f}" if successful_runs > 0 else "No successful runs.")

    return success_rate, average_hops



Run 1/20: Source 29917 -> Target 28503
Current Number of Hops: (0).
Current Number of Hops: (1).
Current Number of Hops: (2).


In [None]:
EVAL_PARAMS = {"max_hops": 20, "num_runs": 20, "seed": 42}

ml_results = evaluate_pathfinding(G=G, model=model, strategy="ml", max_hops=20, num_runs=20, seed=42, debug=False)