In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import networkx as nx
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import random
import pickle

print("Libraries imported successfully.")


Libraries imported successfully.


In [None]:
import pandas as pd
import json
import networkx as nx
from sklearn.preprocessing import MultiLabelBinarizer

# File paths
original_features_path = "../foursquare_data/features.json"
original_edges_path = "../foursquare_data/edges.csv"

# Load data
def load_data():
    print("Loading original features and edges...")
    with open(original_features_path, "r") as f:
        features = json.load(f)  # Node IDs as strings
    print(f"Loaded {len(features)} nodes with features.")
    
    edges = pd.read_csv(
        original_edges_path, names=["source", "target"], skiprows=1
    )  # Skip header row
    print(f"Loaded {len(edges)} edges.")
    return features, edges

# Create graph and extract largest connected component
def create_graph(features, edges):
    print("Creating graph from edges...")
    edges["source"] = edges["source"].astype(str)
    edges["target"] = edges["target"].astype(str)
    G = nx.from_pandas_edgelist(edges, source="source", target="target")
    print(f"Graph created with {len(G.nodes)} nodes and {len(G.edges)} edges.")

    print("Identifying the largest connected component...")
    largest_cc = max(nx.connected_components(G), key=len)
    G_lcc = G.subgraph(largest_cc).copy()
    print(f"Largest connected component has {len(G_lcc.nodes)} nodes and {len(G_lcc.edges)} edges.")

    print("Assigning features to nodes...")
    nodes_with_features = 0
    for node in G_lcc.nodes:
        if node in features:
            G_lcc.nodes[node]["features"] = features[node]
            nodes_with_features += 1
    print(f"Assigned features to {nodes_with_features} nodes out of {len(G_lcc.nodes)} in the largest connected component.")
    
    return G_lcc

def print_sample_nodes(G, sample_size=5):
    print("\nSample nodes with their features:")
    nodes_with_features = [
        (node, data["features"])
        for node, data in G.nodes(data=True)
        if "features" in data
    ]
    sample_nodes = nodes_with_features[:sample_size]
    for node, features in sample_nodes:
        print(f"Node ID: {node}")
        print(f"Features: {features}")
        print("-" * 40)

# Load, process, and print sample nodes
print("Starting graph preprocessing...")
features, edges = load_data()
G = create_graph(features, edges)

# Print sample nodes
print_sample_nodes(G, sample_size=5)

print(f"Graph preprocessing complete. Final graph has {len(G.nodes)} nodes and {len(G.edges)} edges.")


Starting graph preprocessing...
Loading original features and edges...
Loaded 29515 nodes with features.
Loaded 701311 edges.
Creating graph from edges...
Graph created with 114324 nodes and 701311 edges.
Identifying the largest connected component...
Largest connected component has 111251 nodes and 699461 edges.
Assigning features to nodes...
Node features assigned.
Standardizing features to fixed dimensions...
Number of nodes with features: 2420
Initial feature matrix shape: (2420, 8167)
Assigning standardized features back to nodes...
Feature standardization complete.

Sample nodes with their features:
Node ID: 857774
Features: No features assigned
----------------------------------------
Node ID: 908378
Features: No features assigned
----------------------------------------
Node ID: 631363
Features: No features assigned
----------------------------------------
Node ID: 693207
Features: No features assigned
----------------------------------------
Node ID: 480586
Features: No featur

In [8]:
def check_connectivity_bfs(G):
    print("Performing BFS to ensure all nodes are connected...")
    start_node = next(iter(G.nodes))  # Get an arbitrary starting node
    visited = set()
    queue = [start_node]

    while queue:
        node = queue.pop(0)
        if node not in visited:
            visited.add(node)
            queue.extend(neighbor for neighbor in G.neighbors(node) if neighbor not in visited)

    if len(visited) == len(G.nodes):
        print("All nodes are connected. The graph is a single connected component.")
    else:
        print(f"Graph is not fully connected. Only {len(visited)} out of {len(G.nodes)} nodes are reachable.")


check_connectivity_bfs(G)  # Ensure the graph is a single connected component

Performing BFS to ensure all nodes are connected...
All nodes are connected. The graph is a single connected component.


In [9]:
# Section 3A: Create Feature Vectors

def create_feature_vectors(G, edges):
    print("Creating feature vectors for ML tasks...")
    X, y = [], []

    print("Processing positive samples (existing edges)...")
    for i, (_, row) in enumerate(edges.iterrows()):
        node1, node2 = str(row["source"]), str(row["target"])
        if (
            node1 in G.nodes 
            and node2 in G.nodes 
            and "features" in G.nodes[node1] 
            and "features" in G.nodes[node2]
        ):
            feature_vector = np.array(G.nodes[node1]["features"]) - np.array(G.nodes[node2]["features"])
            X.append(feature_vector)
            y.append(1)
        if i % 50000 == 0:
            print(f"Processed {i} positive samples.")

    print("Generating negative samples (random non-existing edges)...")
    all_nodes = [node for node in G.nodes if "features" in G.nodes[node]]  # Nodes with features
    for i in range(len(edges)):
        node1, node2 = np.random.choice(all_nodes, 2, replace=False)
        if not G.has_edge(node1, node2):
            feature_vector = np.array(G.nodes[node1]["features"]) - np.array(G.nodes[node2]["features"])
            X.append(feature_vector)
            y.append(0)
        if i % 50000 == 0:
            print(f"Generated {i} negative samples.")

    print(f"Feature vectors created. Total samples: {len(X)}")
    return np.array(X), np.array(y)


# Create and split feature vectors
print("Creating and splitting feature vectors...")
X, y = create_feature_vectors(G, edges)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

with open("feature_vectors.pkl", "wb") as f:
    pickle.dump((X, y, X_train, X_test, y_train, y_test), f)

print("Feature vectors and splits saved successfully.")


Creating and splitting feature vectors...
Creating feature vectors for ML tasks...
Processing positive samples (existing edges)...
Processed 0 positive samples.
Processed 50000 positive samples.
Processed 100000 positive samples.
Processed 150000 positive samples.
Processed 200000 positive samples.
Processed 250000 positive samples.
Processed 300000 positive samples.
Processed 350000 positive samples.
Processed 400000 positive samples.
Processed 450000 positive samples.
Processed 500000 positive samples.
Processed 550000 positive samples.
Processed 600000 positive samples.
Processed 650000 positive samples.
Processed 700000 positive samples.
Generating negative samples (random non-existing edges)...
Generated 0 negative samples.
Generated 50000 negative samples.
Generated 100000 negative samples.
Generated 150000 negative samples.
Generated 200000 negative samples.
Generated 250000 negative samples.
Generated 300000 negative samples.
Generated 350000 negative samples.
Generated 400000 

In [10]:
with open("feature_vectors.pkl", "rb") as f:
    X, y, X_train, X_test, y_train, y_test = pickle.load(f)

print("Feature vectors and splits loaded successfully.")
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

Feature vectors and splits loaded successfully.
Training set size: 565490, Test set size: 141373


In [11]:
# Section 3B: Train the Neural Network

# Define the neural network
print("Defining the neural network model...")
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation="relu", input_shape=(X_train.shape[1],), name="Input_Layer"),
    tf.keras.layers.Dense(64, activation="relu", name="Hidden_Layer_1"),
    tf.keras.layers.Dense(32, activation="relu", name="Hidden_Layer_2"),
    tf.keras.layers.Dense(1, activation="sigmoid", name="Output_Layer"),
])
print("Model defined successfully.")
model.summary()

# Compile the model
print("Compiling the model...")
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
print("Model compiled successfully.")

# Define a custom callback for logging
class TrainingLogger(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print(f"\nEpoch {epoch + 1}:")
        print(
            f"  Training Loss: {logs['loss']:.4f}, Training Accuracy: {logs['accuracy']:.4f}"
        )
        print(
            f"  Validation Loss: {logs['val_loss']:.4f}, Validation Accuracy: {logs['val_accuracy']:.4f}"
        )

# Train the model
print("Starting model training...")
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    callbacks=[TrainingLogger()],
    verbose=0  # Suppress default verbose to use custom logging
)
print("Model training complete.")

# Evaluate the model
model.save("trained_model.h5")
print("Model saved successfully to 'trained_model.h5'.")

# Save the training history
with open("training_history.pkl", "wb") as f:
    pickle.dump(history.history, f)
print("Training history saved successfully to 'training_history.pkl'.")

# Evaluate the model
print("Evaluating the model on the test set...")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Defining the neural network model...
Model defined successfully.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Compiling the model...
Model compiled successfully.
Starting model training...

Epoch 1:
  Training Loss: 0.0552, Training Accuracy: 0.9904
  Validation Loss: 0.0539, Validation Accuracy: 0.9902

Epoch 2:
  Training Loss: 0.0515, Training Accuracy: 0.9904
  Validation Loss: 0.0529, Validation Accuracy: 0.9902

Epoch 3:
  Training Loss: 0.0499, Training Accuracy: 0.9904
  Validation Loss: 0.0524, Validation Accuracy: 0.9902

Epoch 4:
  Training Loss: 0.0484, Training Accuracy: 0.9904
  Validation Loss: 0.0527, Validation Accuracy: 0.9902

Epoch 5:
  Training Loss: 0.0471, Training Accuracy: 0.9904
  Validation Loss: 0.0528, Validation Accuracy: 0.9902

Epoch 6:
  Training Loss: 0.0459, Training Accuracy: 0.9904
  Validation Loss: 0.0535, Validation Accuracy: 0.9902

Epoch 7:
  Training Loss: 0.0447, Training Accuracy: 0.9904
  Validation Loss: 0.0552, Validation Accuracy: 0.9901

Epoch 8:
  Training Loss: 0.0435, Training Accuracy: 0.9905
  Validation Loss: 0.0554, Validation Accuracy: 




Epoch 20:
  Training Loss: 0.0344, Training Accuracy: 0.9920
  Validation Loss: 0.0765, Validation Accuracy: 0.9896
Model training complete.
Model saved successfully to 'trained_model.h5'.
Training history saved successfully to 'training_history.pkl'.
Evaluating the model on the test set...
[1m4418/4418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 258us/step - accuracy: 0.9905 - loss: 0.0679
Test Loss: 0.0714
Test Accuracy: 0.9902


In [12]:
# Load the trained model
model = tf.keras.models.load_model("trained_model.h5")
print("Model loaded successfully from 'trained_model.h5'.")
model.summary()

# Load the training history
with open("training_history.pkl", "rb") as f:
    history = pickle.load(f)
print("Training history loaded successfully from 'training_history.pkl'.")

# Print the loaded training history (optional)
print("Loaded Training History:")
for key, values in history.items():
    print(f"{key}: {values[:5]}...")  # Show first 5 values as a preview



Model loaded successfully from 'trained_model.h5'.


Training history loaded successfully from 'training_history.pkl'.
Loaded Training History:
accuracy: [0.9903976917266846, 0.9904308915138245, 0.990428626537323, 0.9904198050498962, 0.990428626537323]...
loss: [0.05517600476741791, 0.051534269005060196, 0.0499231219291687, 0.04841449856758118, 0.0470704585313797]...
val_accuracy: [0.9901943206787109, 0.9901943206787109, 0.9901943206787109, 0.9901943206787109, 0.9901677966117859]...
val_loss: [0.05393171310424805, 0.05294496938586235, 0.05241876840591431, 0.05273481085896492, 0.05276070907711983]...


In [45]:
# Helper function to find the best nodes
def find_best_nodes(G, neighbors, target_node):
    """
    Find the best nodes among neighbors based on shortest path distance to the target.
    Returns the list of tied 'best' nodes with the minimum distance.
    """
    distances = {
        neighbor: nx.shortest_path_length(G, source=neighbor, target=target_node)
        for neighbor in neighbors
    }
    min_distance = min(distances.values())
    best_nodes = [node for node, dist in distances.items() if dist == min_distance]
    return best_nodes

In [None]:
def predict_next_node(model, G, current_node, target_node, visited, prediction_cache, debug=False):
    """
    Predict the next node from the current node and return step-by-step accuracy.
    """
    if debug:
        print(f"Predicting next node from current node: {current_node}")

    neighbors = [
        neighbor for neighbor in G.neighbors(current_node)
        if "features" in G.nodes[neighbor] and neighbor not in visited
    ]

    if not neighbors:
        random_choice = random.choice(list(G.neighbors(current_node))) if G.neighbors(current_node) else None
        if debug:
            print("Opting For Random Choice")
        return random_choice, False  # No "best choice" available, random node selected
    
    if target_node in neighbors:
        if debug:
            print(f"Target node {target_node} is a direct neighbor. Automatically selecting it.")
        return target_node, True  # Automatically move to the destination and mark as accurate

    # Find the best nodes
    best_nodes = find_best_nodes(G, neighbors, target_node)

    # Predict probabilities for neighbors
    predictions = []
    for neighbor in neighbors:
        feature_vector = G.nodes[neighbor]["features"] - G.nodes[target_node]["features"]
        prob = model.predict(feature_vector.reshape(1, -1), verbose=0)[0][0]
        predictions.append((neighbor, prob))

    # Sort by probability
    predictions.sort(key=lambda x: x[1], reverse=True)

    if debug:
        print(f"Neighbors Were Available, Chance of Containing Edge:{predictions[0][1]}, Selected Node {predictions[0][0]}" )
    chosen_node = predictions[0][0]  # Node with the highest probability

    is_accurate = chosen_node in best_nodes
    return chosen_node, is_accurate

def find_path(model, G, source, target, max_hops=40, debug=False):
    """
    Find the path while tracking accuracy at each step.
    Outputs path and per-move accuracy data.
    """
    if debug:
        print(f"Starting pathfinding from source: {source} to target: {target}, with max hops: {max_hops}")

    current_node = source
    visited = set()
    prediction_cache = {}
    path = [source]  # Track path regardless of success
    hops = 0
    correct_choices = 0
    step_actuals = []  # Ground truth: 1 when a "best choice" exists
    step_predictions = []  # Prediction: 1 if chosen move is among the best

    while hops < max_hops:
        if current_node == target:  # Success condition
            if debug:
                print(f"Pathfinding succeeded: target {target} reached.")
            return path, correct_choices, hops, step_actuals, step_predictions

        visited.add(current_node)
        next_node, is_accurate = predict_next_node(model, G, current_node, target, visited, prediction_cache, debug=debug)

        if not next_node:  # No valid moves left
            if debug:
                print(f"Pathfinding failed: no valid neighbors from {current_node}.")
            return path, correct_choices, hops, step_actuals, step_predictions

        # Log step accuracy
        step_actuals.append(1)  # A "best node" always exists
        step_predictions.append(1 if is_accurate else 0)

        if is_accurate:
            correct_choices += 1

        # Update path and state
        path.append(next_node)
        current_node = next_node
        hops += 1

    # Failure due to exceeding max hops
    if debug:
        print(f"Pathfinding failed: exceeded max hops ({max_hops}).")
    return path, correct_choices, hops, step_actuals, step_predictions


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_pathfinding(model, G, max_hops=20, num_runs=20):
    """
    Evaluate the pathfinding algorithm, calculate metrics, and print paths.
    """
    total_hops = 0
    successful_runs = 0
    total_correct_choices = 0
    total_steps = 0
    all_actuals = []  # Tracks actual best node occurrences
    all_predictions = []  # Tracks predictions (1 if accurate, 0 otherwise)

    nodes_with_features = [node for node in G.nodes if "features" in G.nodes[node]]

    for run in range(num_runs):
        if len(nodes_with_features) < 2:
            print("Not enough nodes with features for evaluation.")
            break

        source_node, target_node = random.sample(nodes_with_features, 2)
        print(f"\nRun {run + 1}/{num_runs}: Source {source_node} -> Target {target_node}")

        # Run pathfinding
        path, correct_choices, steps, step_actuals, step_predictions = find_path(model, G, source_node, target_node, max_hops)

        # Print path regardless of success
        print(f"  Path: {path}")

        if path and path[-1] == target_node:
            print(f"  Path found in {steps} steps. Correct choices: {correct_choices}/{steps}")
            successful_runs += 1
            total_hops += steps
        else:
            print(f"  Path failed after {steps} steps.")

        # Update precision/recall data
        all_actuals.extend(step_actuals)
        all_predictions.extend(step_predictions)

        # Track total moves
        total_correct_choices += correct_choices
        total_steps += steps

    # Metrics
    success_rate = successful_runs / num_runs * 100
    average_hops = total_hops / successful_runs if successful_runs > 0 else float('inf')
    accuracy = total_correct_choices / total_steps if total_steps > 0 else 0.0
    f1 = f1_score(all_actuals, all_predictions, zero_division=0)

    # Print summary
    print(f"\n--- Summary ---")
    print(f"Success rate: {success_rate:.2f}% ({successful_runs}/{num_runs})")
    print(f"Average hops: {average_hops:.2f}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1-score: {f1:.2f}")

    return success_rate, average_hops, accuracy, precision, recall, f1




success_rate, average_hops, accuracy, precision, recall, f1 = evaluate_pathfinding(model, G, max_hops=20, num_runs=100)



Run 1/100: Source 11780 -> Target 28564
  Path: ['11780', '9984', '9935', '12233', '10648', '12823', '7943', '12144', '13352', '26396', '10175', '18102', '12832', '10849', '11071', '10994', '15110', '12396', '16399', '12350', '13006']
  Path failed after 20 steps.

Run 2/100: Source 2698 -> Target 15861
  Path: ['2698', '9917', '1668', '54', '528', '16637', '758', '541', '19495', '854', '953', '866', '5975', '8524', '4786', '12184', '6034', '27138', '3812', '6571', '15890']
  Path failed after 20 steps.

Run 3/100: Source 28524 -> Target 10373
  Path: ['28524', '19785', '7199', '541', '24306', '20962', '78', '10716', '9041', '12645', '534', '545', '23897', '12555', '6158', '26417', '11717', '28916', '26863', '23321', '1536']
  Path failed after 20 steps.

Run 4/100: Source 18431 -> Target 13072
  Path: ['18431', '8825', '258282', '8825', '18431', '316587', '530738', '112489', '1110031', '112489', '358514', '178698', '29967', '14571', '20902', '20163', '13708', '20163', '9657', '2907',