In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import networkx as nx
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import random
import pickle

print("Libraries imported successfully.")


Libraries imported successfully.


In [36]:
import pandas as pd
import json
import networkx as nx
from sklearn.preprocessing import MultiLabelBinarizer

# File paths
original_features_path = "../foursquare_data/features.json"
original_edges_path = "../foursquare_data/edges.csv"

# Load data
def load_data():
    print("Loading original features and edges...")
    with open(original_features_path, "r") as f:
        features = json.load(f)  # Node IDs as strings
    print(f"Loaded {len(features)} nodes with features.")
    
    edges = pd.read_csv(
        original_edges_path, names=["source", "target"], skiprows=1
    )  # Skip header row
    print(f"Loaded {len(edges)} edges.")
    return features, edges

# Create graph and extract largest connected component
def create_graph(features, edges):
    print("Creating graph from edges...")
    edges["source"] = edges["source"].astype(str)
    edges["target"] = edges["target"].astype(str)
    G = nx.from_pandas_edgelist(edges, source="source", target="target")
    print(f"Graph created with {len(G.nodes)} nodes and {len(G.edges)} edges.")

    print("Identifying the largest connected component...")
    largest_cc = max(nx.connected_components(G), key=len)
    G_lcc = G.subgraph(largest_cc).copy()
    print(f"Largest connected component has {len(G_lcc.nodes)} nodes and {len(G_lcc.edges)} edges.")

    print("Assigning features to nodes...")
    nodes_with_features = 0
    for node in G_lcc.nodes:
        if node in features:
            G_lcc.nodes[node]["features"] = features[node]
            nodes_with_features += 1
    print(f"Assigned features to {nodes_with_features} nodes out of {len(G_lcc.nodes)} in the largest connected component.")
    
    return G_lcc

def print_sample_nodes(G, sample_size=5):
    print("\nSample nodes with their features:")
    nodes_with_features = [
        (node, data["features"])
        for node, data in G.nodes(data=True)
        if "features" in data
    ]
    sample_nodes = nodes_with_features[:sample_size]
    for node, features in sample_nodes:
        print(f"Node ID: {node}")
        print(f"Features: {features}")
        print("-" * 40)

# Load, process, and print sample nodes
print("Starting graph preprocessing...")
features, edges = load_data()
G = create_graph(features, edges)

# Print sample nodes
print_sample_nodes(G, sample_size=5)

print(f"Graph preprocessing complete. Final graph has {len(G.nodes)} nodes and {len(G.edges)} edges.")




Starting graph preprocessing...
Loading original features and edges...
Loaded 29515 nodes with features.
Loaded 701311 edges.
Creating graph from edges...
Graph created with 114324 nodes and 701311 edges.
Identifying the largest connected component...
Largest connected component has 111251 nodes and 699461 edges.
Assigning features to nodes...
Assigned features to 2420 nodes out of 111251 in the largest connected component.

Sample nodes with their features:
Node ID: 78943
Features: [-22.912699, -43.221986, 47.0, 0.1378353376503237, 1.0, 16.0, 7.0, 7637.8296]
----------------------------------------
Node ID: 151365
Features: [30.267537, -97.745404, 29.0, 0.1699507389162561, 1.0, 589.0, 233.0, 945.9471]
----------------------------------------
Node ID: 12434
Features: [25.655705, -100.361285, 1367.0, 0.0054430890869384, 1.0, 16.0, 23.0, 2084.239]
----------------------------------------
Node ID: 239907
Features: [19.128165, -98.768466, 43.0, 0.2259136212624584, 1.0, 209.0, 534.0, 669.42

In [8]:
def check_connectivity_bfs(G):
    print("Performing BFS to ensure all nodes are connected...")
    start_node = next(iter(G.nodes))  # Get an arbitrary starting node
    visited = set()
    queue = [start_node]

    while queue:
        node = queue.pop(0)
        if node not in visited:
            visited.add(node)
            queue.extend(neighbor for neighbor in G.neighbors(node) if neighbor not in visited)

    if len(visited) == len(G.nodes):
        print("All nodes are connected. The graph is a single connected component.")
    else:
        print(f"Graph is not fully connected. Only {len(visited)} out of {len(G.nodes)} nodes are reachable.")


check_connectivity_bfs(G)  # Ensure the graph is a single connected component

Performing BFS to ensure all nodes are connected...
All nodes are connected. The graph is a single connected component.


In [33]:
# Section 3A: Create Feature Vectors

def create_feature_vectors(G, edges):
    print("Creating feature vectors for ML tasks...")
    X, y = [], []

    print("Processing positive samples (existing edges)...")
    for i, (_, row) in enumerate(edges.iterrows()):
        node1, node2 = str(row["source"]), str(row["target"])
        if (
            node1 in G.nodes 
            and node2 in G.nodes 
            and "features" in G.nodes[node1] 
            and "features" in G.nodes[node2]
        ):
            feature_vector = np.array(G.nodes[node1]["features"]) - np.array(G.nodes[node2]["features"])
            X.append(feature_vector)
            y.append(1)
            print(feature_vector)
        if i % 50000 == 0:
            print(f"Processed {i} positive samples.")

    print("Generating negative samples (random non-existing edges)...")
    all_nodes = [node for node in G.nodes if "features" in G.nodes[node]]  # Nodes with features
    for i in range(len(edges)):
        node1, node2 = np.random.choice(all_nodes, 2, replace=False)
        if not G.has_edge(node1, node2):
            feature_vector = np.array(G.nodes[node1]["features"]) - np.array(G.nodes[node2]["features"])
            X.append(feature_vector)
            y.append(0)
        if i % 50000 == 0:
            print(f"Generated {i} negative samples.")

    print(f"Feature vectors created. Total samples: {len(X)}")
    return np.array(X), np.array(y)


# Create and split feature vectors
print("Creating and splitting feature vectors...")
X, y = create_feature_vectors(G, edges)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

with open("feature_vectors.pkl", "wb") as f:
    pickle.dump((X, y, X_train, X_test, y_train, y_test), f)

print("Feature vectors and splits saved successfully.")


Creating and splitting feature vectors...
Creating feature vectors for ML tasks...
Processing positive samples (existing edges)...
Processed 0 positive samples.
[ 8.15757600e+00  8.33891200e+00 -2.00000000e+01  1.58182813e-01
  0.00000000e+00  3.79300000e+03  3.44800000e+03  4.96938030e+03]
[1.49236630e+01 6.18422100e+00 1.70000000e+01 7.50988142e-02
 0.00000000e+00 1.56200000e+03 1.68900000e+03 9.20347700e+02]
[ 6.30680000e-02 -2.55180000e-02 -1.80000000e+01  4.50089127e-02
 -1.00000000e+00 -1.27000000e+02 -3.40000000e+01  0.00000000e+00]
[-5.81505500e+00 -2.69791130e+01  3.90000000e+01 -3.09104401e-02
  0.00000000e+00  3.49000000e+02  6.28000000e+02  8.49855250e+02]
[-1.49730000e-02 -1.30093000e-01  1.20000000e+01  1.76470588e-02
 -1.00000000e+00  6.61000000e+02  3.11000000e+02  1.73555820e+03]
[-5.17830000e-02 -3.59840000e-02  8.00000000e+00 -1.06060606e-01
  0.00000000e+00 -5.30000000e+01 -1.20000000e+01 -3.72227350e+03]
[ 1.08646040e+01  7.36419200e+01 -1.18000000e+02  1.04277118e

KeyboardInterrupt: 

In [10]:
with open("feature_vectors.pkl", "rb") as f:
    X, y, X_train, X_test, y_train, y_test = pickle.load(f)

print("Feature vectors and splits loaded successfully.")
print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

Feature vectors and splits loaded successfully.
Training set size: 561872, Test set size: 140468


In [None]:
# Section 3B: Train the Neural Network

# Define the neural network
print("Defining the neural network model...")
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation="relu", input_shape=(X_train.shape[1],), name="Input_Layer"),
    tf.keras.layers.Dense(64, activation="relu", name="Hidden_Layer_1"),
    tf.keras.layers.Dense(32, activation="relu", name="Hidden_Layer_2"),
    tf.keras.layers.Dense(1, activation="sigmoid", name="Output_Layer"),
])
print("Model defined successfully.")
model.summary()

# Compile the model
print("Compiling the model...")
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
print("Model compiled successfully.")

# Define a custom callback for logging
class TrainingLogger(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print(f"\nEpoch {epoch + 1}:")
        print(
            f"  Training Loss: {logs['loss']:.4f}, Training Accuracy: {logs['accuracy']:.4f}"
        )
        print(
            f"  Validation Loss: {logs['val_loss']:.4f}, Validation Accuracy: {logs['val_accuracy']:.4f}"
        )

# Train the model
print("Starting model training...")
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2,
    callbacks=[TrainingLogger()],
    verbose=0  # Suppress default verbose to use custom logging
)
print("Model training complete.")

# Evaluate the model
model.save("trained_model.h5")
print("Model saved successfully to 'trained_model.h5'.")

# Save the training history
with open("training_history.pkl", "wb") as f:
    pickle.dump(history.history, f)
print("Training history saved successfully to 'training_history.pkl'.")

# Evaluate the model
print("Evaluating the model on the test set...")
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")


Defining the neural network model...
Model defined successfully.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Compiling the model...
Model compiled successfully.
Starting model training...

Epoch 1:
  Training Loss: 0.1968, Training Accuracy: 0.9972
  Validation Loss: 0.0338, Validation Accuracy: 0.9981

Epoch 2:
  Training Loss: 0.0238, Training Accuracy: 0.9980
  Validation Loss: 0.0139, Validation Accuracy: 0.9981

Epoch 3:
  Training Loss: 0.0188, Training Accuracy: 0.9980
  Validation Loss: 0.0139, Validation Accuracy: 0.9981

Epoch 4:
  Training Loss: 0.0189, Training Accuracy: 0.9980
  Validation Loss: 0.0140, Validation Accuracy: 0.9981

Epoch 5:
  Training Loss: 0.0176, Training Accuracy: 0.9980
  Validation Loss: 0.0157, Validation Accuracy: 0.9978

Epoch 6:
  Training Loss: 0.0182, Training Accuracy: 0.9980
  Validation Loss: 0.0139, Validation Accuracy: 0.9981

Epoch 7:
  Training Loss: 0.0179, Training Accuracy: 0.9979
  Validation Loss: 0.0139, Validation Accuracy: 0.9981

Epoch 8:
  Training Loss: 0.0162, Training Accuracy: 0.9980
  Validation Loss: 0.0137, Validation Accuracy: 




Epoch 20:
  Training Loss: 0.0188, Training Accuracy: 0.9980
  Validation Loss: 0.0148, Validation Accuracy: 0.9981
Model training complete.
Model saved successfully to 'trained_model.h5'.
Training history saved successfully to 'training_history.pkl'.
Evaluating the model on the test set...
[1m4390/4390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 255us/step - accuracy: 0.9981 - loss: 0.0187
Test Loss: 0.0162
Test Accuracy: 0.9982


In [12]:
# Load the trained model
model = tf.keras.models.load_model("trained_model.h5")
print("Model loaded successfully from 'trained_model.h5'.")
model.summary()

# Load the training history
with open("training_history.pkl", "rb") as f:
    history = pickle.load(f)
print("Training history loaded successfully from 'training_history.pkl'.")

# Print the loaded training history (optional)
print("Loaded Training History:")
for key, values in history.items():
    print(f"{key}: {values[:5]}...")  # Show first 5 values as a preview



Model loaded successfully from 'trained_model.h5'.


Training history loaded successfully from 'training_history.pkl'.
Loaded Training History:
accuracy: [0.9971612691879272, 0.9979555010795593, 0.9979621767997742, 0.9979755282402039, 0.9979955554008484]...
loss: [0.19679401814937592, 0.023762507364153862, 0.01882072351872921, 0.018871981650590897, 0.01762070134282112]...
val_accuracy: [0.9981134533882141, 0.9981134533882141, 0.9981134533882141, 0.9981134533882141, 0.9978464841842651]...
val_loss: [0.033759549260139465, 0.013948516920208931, 0.013933347538113594, 0.014005061239004135, 0.015745781362056732]...


In [20]:
# Helper function to find the best nodes
def find_best_nodes(G, neighbors, target_node):
    """
    Find the best nodes among neighbors based on shortest path distance to the target.
    Returns the list of tied 'best' nodes with the minimum distance.
    """
    distances = {
        neighbor: nx.shortest_path_length(G, source=neighbor, target=target_node)
        for neighbor in neighbors
    }
    min_distance = min(distances.values())
    print(min_distance)
    best_nodes = [node for node, dist in distances.items() if dist == min_distance]
    return best_nodes

In [46]:
def predict_next_node(model, G, current_node, target_node, visited, prediction_cache, debug=True):
    """
    Predict the next node from the current node and return step-by-step accuracy.
    """
    if debug:
        print(f"Predicting next node from current node: {current_node}")

    neighbors = [
        neighbor for neighbor in G.neighbors(current_node)
        if "features" in G.nodes[neighbor] and neighbor not in visited
    ]

    if not neighbors:
        random_choice = random.choice(list(G.neighbors(current_node))) if G.neighbors(current_node) else None
        if debug:
            print("Opting For Random Choice")
        return random_choice, False  # No "best choice" available, random node selected
    
    if target_node in neighbors:
        if debug:
            print(f"Target node {target_node} is a direct neighbor. Automatically selecting it.")
        return target_node, True  # Automatically move to the destination and mark as accurate

    # Find the best nodes
    best_nodes = find_best_nodes(G, neighbors, target_node)

    # Predict probabilities for neighbors
    predictions = []
    for neighbor in neighbors:
        feature_vector = np.array(G.nodes[neighbor]["features"]) - np.array(G.nodes[target_node]["features"])
        print(feature_vector)
        prob = model.predict(feature_vector.reshape(1, -1), verbose=0)[0][0]
        predictions.append((neighbor, prob))

    # Sort by probability
    predictions.sort(key=lambda x: x[1], reverse=True)
    print(predictions)

    if debug:
        print(f"Neighbors Were Available, Chance of Containing Edge:{predictions[0][1]}, Selected Node {predictions[0][0]}" )
    chosen_node = predictions[0][0]  # Node with the highest probability

    is_accurate = chosen_node in best_nodes
    return chosen_node, is_accurate

def find_path(model, G, source, target, max_hops=40, debug=True):
    """
    Find the path while tracking accuracy at each step.
    Outputs path and per-move accuracy data.
    """
    if debug:
        print(f"Starting pathfinding from source: {source} to target: {target}, with max hops: {max_hops}")

    current_node = source
    visited = set()
    prediction_cache = {}
    path = [source]  # Track path regardless of success
    hops = 0
    correct_choices = 0
    step_actuals = []  # Ground truth: 1 when a "best choice" exists
    step_predictions = []  # Prediction: 1 if chosen move is among the best

    while hops < max_hops:
        if current_node == target:  # Success condition
            if debug:
                print(f"Pathfinding succeeded: target {target} reached.")
            return path, correct_choices, hops, step_actuals, step_predictions

        visited.add(current_node)
        next_node, is_accurate = predict_next_node(model, G, current_node, target, visited, prediction_cache, debug=debug)

        if not next_node:  # No valid moves left
            if debug:
                print(f"Pathfinding failed: no valid neighbors from {current_node}.")
            return path, correct_choices, hops, step_actuals, step_predictions

        # Log step accuracy
        step_actuals.append(1)  # A "best node" always exists
        step_predictions.append(1 if is_accurate else 0)

        if is_accurate:
            correct_choices += 1

        # Update path and state
        path.append(next_node)
        current_node = next_node
        hops += 1

    # Failure due to exceeding max hops
    if debug:
        print(f"Pathfinding failed: exceeded max hops ({max_hops}).")
    return path, correct_choices, hops, step_actuals, step_predictions


In [49]:
dummy_input = np.array([[0, 0, 0, 0, 0, 0, 0, 0]], dtype=np.float32)  # A neutral input
test_prediction = model.predict(dummy_input)
print("Test Prediction for Neutral Input:", test_prediction)

print("Model Summary:")
model.summary()

print("Model Weights (First Layer):", model.get_weights()[0])  # Print weights


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Test Prediction for Neutral Input: [[0.00201973]]
Model Summary:


Model Weights (First Layer): [[-4.92191315e-02  5.43339312e-01 -1.11308599e+00 ... -1.53442729e+00
  -1.38920927e+00 -5.39898217e-01]
 [ 1.01262772e+00 -7.61828780e-01 -1.82980463e-01 ...  2.50758439e-01
  -6.13084316e-01 -1.98532060e-01]
 [ 3.60224843e-02 -3.28342229e-01 -2.88264215e-01 ... -1.28176168e-01
  -2.97008723e-01 -5.02876520e-01]
 ...
 [ 2.52495352e-02 -1.56229800e-02  1.17307985e-02 ...  4.22608964e-02
   1.30823790e-03  3.76720935e-01]
 [ 2.35208496e-03  5.72206117e-02  7.03009591e-02 ...  1.20768897e-01
   3.17262001e-02  2.48534098e-01]
 [-4.79526399e-03  2.70476282e-01  1.06114924e-01 ...  2.44188197e-02
   1.69041362e-02 -6.46801293e-01]]


In [47]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_pathfinding(model, G, max_hops=20, num_runs=20):
    """
    Evaluate the pathfinding algorithm, calculate metrics, and print paths.
    """
    total_hops = 0
    successful_runs = 0
    total_correct_choices = 0
    total_steps = 0
    all_actuals = []  # Tracks actual best node occurrences
    all_predictions = []  # Tracks predictions (1 if accurate, 0 otherwise)

    nodes_with_features = [node for node in G.nodes if "features" in G.nodes[node]]

    for run in range(num_runs):
        if len(nodes_with_features) < 2:
            print("Not enough nodes with features for evaluation.")
            break

        source_node, target_node = random.sample(nodes_with_features, 2)
        print(f"\nRun {run + 1}/{num_runs}: Source {source_node} -> Target {target_node}")

        # Run pathfinding
        path, correct_choices, steps, step_actuals, step_predictions = find_path(model, G, source_node, target_node, max_hops)

        # Print path regardless of success
        print(f"  Path: {path}")

        if path and path[-1] == target_node:
            print(f"  Path found in {steps} steps. Correct choices: {correct_choices}/{steps}")
            successful_runs += 1
            total_hops += steps
        else:
            print(f"  Path failed after {steps} steps.")

        # Update precision/recall data
        all_actuals.extend(step_actuals)
        all_predictions.extend(step_predictions)

        # Track total moves
        total_correct_choices += correct_choices
        total_steps += steps

    # Metrics
    success_rate = successful_runs / num_runs * 100
    average_hops = total_hops / successful_runs if successful_runs > 0 else float('inf')
    accuracy = total_correct_choices / total_steps if total_steps > 0 else 0.0
    f1 = f1_score(all_actuals, all_predictions, zero_division=0)

    # Print summary
    print(f"\n--- Summary ---")
    print(f"Success rate: {success_rate:.2f}% ({successful_runs}/{num_runs})")
    print(f"Average hops: {average_hops:.2f}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1-score: {f1:.2f}")

    return success_rate, average_hops, accuracy, precision, recall, f1




success_rate, average_hops, accuracy, precision, recall, f1 = evaluate_pathfinding(model, G, max_hops=20, num_runs=100)



Run 1/100: Source 207422 -> Target 90038
Starting pathfinding from source: 207422 to target: 90038, with max hops: 20
Predicting next node from current node: 207422
Opting For Random Choice
Predicting next node from current node: 240155
4
[ 5.53250000e-02 -3.85368200e+00  5.00000000e+00  7.87878788e-02
  0.00000000e+00  1.93000000e+02  2.44270000e+04 -4.38006232e+03]
[('104843', np.float32(0.0))]
Neighbors Were Available, Chance of Containing Edge:0.0, Selected Node 104843
Predicting next node from current node: 104843
3
[-7.84982000e-01 -1.16036400e+00  1.20000000e+01  1.83006536e-02
  0.00000000e+00  2.77000000e+02 -2.00000000e+01 -2.37980320e+03]
[('24697', np.float32(0.002019727))]
Neighbors Were Available, Chance of Containing Edge:0.0020197269041091204, Selected Node 24697
Predicting next node from current node: 24697
2
[-8.6299600e-01 -1.2366430e+00  2.5000000e+01 -4.7311828e-02
  0.0000000e+00  1.8750000e+03  1.4340000e+03 -2.3798032e+03]
[('6138', np.float32(0.002019727))]
Ne

KeyboardInterrupt: 