In [1]:
import networkx as nx
import torch
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix
from constants import (
    AGGR_GRAPH_PATH,
    COMMUNITY_PARTITIONS_FPATH,
    DISTANCE_NEGATIVE_THRESHOLD,
    DISTANCE_POSITIVE_THRESHOLD,
    FINAL_GRAPH_PATH,
)

In [2]:
with open(FINAL_GRAPH_PATH,"rb") as f:
    G = pickle.load(f)

# Community columns already exist
COMM_COLS = [f"comm_{i}" for i in range(153)]

FEATURE_COLS = (["street_count","population","distance_to_nearest_stop"]+ COMM_COLS+ ["is_transit_stop"])


In [3]:
node_features = []
labels = []
node_id_map = {}
for idx, node in enumerate(G.nodes()):
    node_id_map[node] = idx
    attr = G.nodes[node]

    features = []

    for col in FEATURE_COLS:
        if col == "is_transit_stop":
            features.append(int(attr[col]))
        else:
            features.append(float(attr[col]))

    node_features.append(features)
    label = attr["label"]
    if label is None or (isinstance(label, float) and np.isnan(label)):
        labels.append(-1)
    else:
        labels.append(int(label))


In [4]:
x = torch.tensor(node_features, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)
edge_index = []

for u, v in G.edges():
    edge_index.append([node_id_map[u], node_id_map[v]])
    edge_index.append([node_id_map[v], node_id_map[u]])  # undirected

# Convert to torch.LongTensor and transpose to shape [2, num_edges]
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# Get indices of labeled nodes
labeled_idx = (y >= 0).nonzero(as_tuple=True)[0].numpy()
print(f"Total labeled nodes: {len(labeled_idx)}")

# Split: 70% train, 20% val, 10% test
train_idx, temp_idx = train_test_split(labeled_idx, test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=1/3, random_state=42)  # 0.1/0.3 = 1/3
 


Total labeled nodes: 42161


In [5]:
from torch_geometric.data import Data
from torch_geometric.transforms import NormalizeFeatures

data = Data(x=x, edge_index=edge_index, y=y)

# Masks
num_nodes = data.num_nodes
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask   = torch.zeros(num_nodes, dtype=torch.bool)
test_mask  = torch.zeros(num_nodes, dtype=torch.bool)
unknown_mask = (y < 0)

train_mask[train_idx] = True
val_mask[val_idx] = True
test_mask[test_idx] = True

data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask
data.unknown_mask = unknown_mask

# Normalize
data = NormalizeFeatures()(data)


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class TransitGCN(nn.Module):
    def __init__(self, in_channels, hidden_channels=64):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.classifier = nn.Linear(hidden_channels, 2)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.3, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        return self.classifier(x)


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransitGCN(in_channels=data.num_features).to(device)
data = data.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()


In [11]:
best_val_f1 = 0
best_model_state = None

for epoch in range(1, 301):
    # === Training ===
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # === Validation ===
    model.eval()
    with torch.no_grad():
        val_logits = model(data.x, data.edge_index)  # recompute in eval mode
        val_preds = val_logits[data.val_mask].argmax(dim=1)
        val_labels = data.y[data.val_mask]
        val_f1 = f1_score(val_labels.cpu(), val_preds.cpu())

    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_model_state = model.state_dict()

    if epoch % 30 == 0:
        print(f"Epoch {epoch:03d} | Loss: {loss:.4f} | Val F1: {val_f1:.4f}")


Epoch 030 | Loss: 0.6011 | Val F1: 0.0232
Epoch 060 | Loss: 0.4878 | Val F1: 0.4516
Epoch 090 | Loss: 0.2900 | Val F1: 0.8723
Epoch 120 | Loss: 0.1348 | Val F1: 0.9711
Epoch 150 | Loss: 0.0677 | Val F1: 0.9921
Epoch 180 | Loss: 0.0394 | Val F1: 0.9966
Epoch 210 | Loss: 0.0260 | Val F1: 0.9980
Epoch 240 | Loss: 0.0186 | Val F1: 0.9995
Epoch 270 | Loss: 0.0139 | Val F1: 0.9995
Epoch 300 | Loss: 0.0117 | Val F1: 0.9997


In [12]:
model.load_state_dict(best_model_state)
model.eval()
with torch.no_grad():
    test_logits = model(data.x, data.edge_index)[data.test_mask]
    test_preds = test_logits.argmax(dim=1)
    test_labels = data.y[data.test_mask]
    
    acc = accuracy_score(test_labels.cpu(), test_preds.cpu())
    f1 = f1_score(test_labels.cpu(), test_preds.cpu())
    print(f"Test Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")


Test Accuracy: 0.9986, F1 Score: 0.9981


In [13]:
with torch.no_grad():
    logits = model(data.x, data.edge_index)
    probs = F.softmax(logits, dim=1)
    preds = probs.argmax(dim=1)

uncertain_nodes = data.unknown_mask.nonzero(as_tuple=True)[0]
predicted_labels = preds[uncertain_nodes]
served_prob = probs[uncertain_nodes, 1]


In [14]:
inv_map = {v: k for k, v in node_id_map.items()}

results = {}
for idx, pred, prob in zip(
    uncertain_nodes.cpu().numpy(),
    predicted_labels.cpu().numpy(),
    served_prob.cpu().numpy()
):
    results[inv_map[idx]] = {
        "predicted_label": int(pred),
        "served_probability": float(prob)
    }


In [15]:
print(results)

{'59634998': {'predicted_label': 1, 'served_probability': 0.9981726408004761}, '3582117819': {'predicted_label': 1, 'served_probability': 0.9984329342842102}, '10307335764': {'predicted_label': 1, 'served_probability': 0.9976465106010437}, '83339509': {'predicted_label': 1, 'served_probability': 0.999365508556366}, '566590276': {'predicted_label': 1, 'served_probability': 0.562284529209137}, '251311135': {'predicted_label': 1, 'served_probability': 0.9963557720184326}, '60782561': {'predicted_label': 1, 'served_probability': 0.9925376772880554}, '3658159625': {'predicted_label': 1, 'served_probability': 0.9775355458259583}, '3649737823': {'predicted_label': 1, 'served_probability': 0.8034713268280029}, '60782567': {'predicted_label': 0, 'served_probability': 0.0451454259455204}, '6606798568': {'predicted_label': 0, 'served_probability': 0.048577018082141876}, '60782574': {'predicted_label': 0, 'served_probability': 0.019115623086690903}, '7661673766': {'predicted_label': 0, 'served_pro

In [16]:
for node, res in results.items():
    G.nodes[node]["label"] = res["predicted_label"]  # 0 or 1
    #G.nodes[node]["served_prob"] = res["served_probability"]  # optional


In [17]:
with open("final.gpickl","wb") as f:
    pickle.dump(G,f)

## Save Model Weights

In [18]:
torch.save(best_model_state, "best_transit_gcn.pth")
print("Best model weights saved to best_transit_gcn.pth")


Best model weights saved to best_transit_gcn.pth
