## Loan Graph Construction

This notebook constructs PyTorch Geometric graph objects from cleaned loan origination data (2015–2016). Each node is an individual loan with engineered features, and two loans are linked when they share a geographic area, a loan provider, or both. Every edge carries a one-hot relation type plus an inverse-group-size weight. The resulting graphs—used for training, testing, and explanation—support GAT-based credit risk explainability pipelines by capturing borrower interconnectivity through typed edges and temporal filtering on `d_timer`.

Note: This notebook is intended for academic reference only.

In [None]:
import pandas as pd
import numpy as np

import torch
from torch_geometric.data import Data

from tqdm.auto import tqdm 

import os

# --- Edge schema -------------------------------------------------------------
# Integer ids of the three relation types. Each id doubles as the column that
# is set to 1.0 in an edge's one-hot relation vector.
EDGE_TYPE_AREA, EDGE_TYPE_PROVIDER, EDGE_TYPE_AREA_PROVIDER = range(3)
NUM_EDGE_TYPES = 3
# One-hot relation columns followed by a single group-size column.
TOTAL_EDGE_DIM = NUM_EDGE_TYPES + 1

# --- Graph-construction knobs ------------------------------------------------
# Minimum `d_timer` a loan needs in order to be kept in the train/test graphs.
GRAPH_D_TIMER_THRESHOLD = 1
# Number of groups consumed per progress-bar step while sampling edges.
EDGE_BATCH_SIZE = 50

# --- Filesystem layout -------------------------------------------------------
# Inputs (cleaned CSVs) and outputs (serialized graphs) sit side by side under
# one data root, resolved relative to the current working directory.
DATA_ROOT_DIR = '../data'
CLEANED_DATA_DIR, OUTPUT_GRAPH_DIR = (
    os.path.join(DATA_ROOT_DIR, subdir)
    for subdir in ('cleaned_data', 'graph_data')
)

# Make sure the output directory exists before any graph is written.
os.makedirs(OUTPUT_GRAPH_DIR, exist_ok=True)

# All tensors in this notebook are created on the CPU.
device = torch.device("cpu")
print(f"Using device: {device}")

def _sample_group_pairs(idx_tensor, max_pairs):
    """Return at most ``max_pairs`` distinct unordered node pairs of one group.

    Small groups (no more pairs than ``max_pairs``) get every pair via
    ``torch.combinations``. Larger groups are sampled directly, so the full
    C(n, 2) pair table -- quadratic in the group size -- is never built.

    Args:
        idx_tensor: 1-D long tensor holding the node indices of the group.
        max_pairs: Upper bound on the number of pairs returned.

    Returns:
        Long tensor of shape ``(num_pairs, 2)``; within each row the first
        node precedes the second one in ``idx_tensor``.
    """
    n = idx_tensor.numel()
    if n * (n - 1) // 2 <= max_pairs:
        return torch.combinations(idx_tensor, r=2)

    dev = idx_tensor.device
    draw_size = 2 * max_pairs
    # Each candidate pair (lo, hi) with lo < hi is encoded as lo * n + hi so
    # that duplicates can be removed with a 1-D torch.unique.
    keys = torch.empty(0, dtype=torch.long, device=dev)
    while keys.numel() < max_pairs:
        first = torch.randint(n, (draw_size,), device=dev)
        second = torch.randint(n - 1, (draw_size,), device=dev)
        # Shift past `first` so the two positions always differ while each
        # ordered pair stays equally likely.
        second = second + (second >= first).long()
        lo = torch.minimum(first, second)
        hi = torch.maximum(first, second)
        keys = torch.unique(torch.cat([keys, lo * n + hi]))

    # torch.unique returns sorted keys; shuffle before truncating so the kept
    # subset is not biased towards low positions.
    keep = torch.randperm(keys.numel(), device=dev)[:max_pairs]
    keys = keys[keep]
    lo = torch.div(keys, n, rounding_mode='floor')
    hi = keys - lo * n
    return torch.stack([idx_tensor[lo], idx_tensor[hi]], dim=1)


def create_edges(group_indices_list, edge_type_id, num_edge_types_one_hot, debug_prefix="", max_edges_per_group=20):
    """Build bidirectional, typed edges between nodes that share a group.

    For every group with more than one member, up to ``max_edges_per_group``
    node pairs are chosen at random and emitted in both directions. Each edge
    gets a one-hot relation vector with column ``edge_type_id`` set, followed
    by one extra column holding ``1 / group_size``.

    Args:
        group_indices_list: List of groups, each a list of node indices.
        edge_type_id: Column of the one-hot relation vector to set to 1.0.
        num_edge_types_one_hot: Width of the one-hot relation vector.
        debug_prefix: Text prepended to the progress bar and summary prints.
        max_edges_per_group: Maximum number of undirected pairs per group
            (so at most twice as many directed edges).

    Returns:
        Tuple ``(edge_index, edge_attr)``: a long tensor of shape
        ``(2, num_edges)`` and a float32 tensor of shape
        ``(num_edges, num_edge_types_one_hot + 1)``.
    """
    # Derived from the argument (not a module constant) so the empty and
    # non-empty results always have the same attribute width.
    edge_attr_dim = num_edge_types_one_hot + 1

    all_edges_tensors = []
    all_edge_type_and_feature_tensors = []
    total_groups_processed = 0
    total_edges_added = 0

    # Groups are consumed in slices purely to keep the progress bar coarse.
    for start in tqdm(range(0, len(group_indices_list), EDGE_BATCH_SIZE), desc=f"{debug_prefix} Sampling edge batches"):
        for indices in group_indices_list[start:start + EDGE_BATCH_SIZE]:
            if len(indices) <= 1:
                # A singleton (or empty) group has nobody to connect to.
                continue

            idx_tensor = torch.tensor(indices, dtype=torch.long, device=device)
            sampled = _sample_group_pairs(idx_tensor, max_edges_per_group)

            # Emit every sampled pair in both directions: shape (2, 2 * pairs).
            bidir = torch.cat([sampled, sampled.flip(dims=[1])], dim=0).T
            num_new_edges = bidir.shape[1]

            edge_attr_one_hot = torch.zeros(
                (num_new_edges, num_edge_types_one_hot), dtype=torch.float32, device=device
            )
            edge_attr_one_hot[:, edge_type_id] = 1.0
            # Inverse group size, shared by every edge of this group.
            extra_feat = torch.full(
                (num_new_edges, 1), 1.0 / len(indices), dtype=torch.float32, device=device
            )

            all_edges_tensors.append(bidir)
            all_edge_type_and_feature_tensors.append(torch.cat([edge_attr_one_hot, extra_feat], dim=1))
            total_groups_processed += 1
            total_edges_added += num_new_edges

    if all_edges_tensors:
        edge_index = torch.cat(all_edges_tensors, dim=1)
        edge_attr = torch.cat(all_edge_type_and_feature_tensors, dim=0)
    else:
        edge_index = torch.empty((2, 0), dtype=torch.long, device=device)
        edge_attr = torch.empty((0, edge_attr_dim), dtype=torch.float32, device=device)

    print(f"{debug_prefix} Processed groups: {total_groups_processed:,}")
    print(f"{debug_prefix} Total edges added: {total_edges_added:,}")
    return edge_index, edge_attr

def create_loan_graph(origination_df, d_timer_threshold, graph_type):
    """Build a PyTorch Geometric ``Data`` graph from a loan DataFrame.

    Rows whose numeric ``d_timer`` is at least ``d_timer_threshold`` become
    nodes. Loans sharing an ``area``, a ``provider``, or both are connected
    through ``create_edges``, one call per relation type. The input DataFrame
    is left untouched.

    Args:
        origination_df: DataFrame with one row per loan. Optional columns used
            here: ``d_timer``, the feature columns listed below, ``area``,
            ``provider`` and ``default``.
        d_timer_threshold: Minimum ``d_timer`` for a row to be kept. When the
            column is absent every row is kept.
        graph_type: Label used only in the progress messages.

    Returns:
        A ``Data`` object with ``x``, ``edge_index`` (long, shape
        ``(2, num_edges)``), ``edge_attr`` (float32, ``TOTAL_EDGE_DIM``
        columns) and ``y``; or ``None`` when no row passes the filter.
    """
    print(f"\n### Constructing {graph_type} graph (d_timer >= {d_timer_threshold}) ###")

    # Work on a separate Series so the caller's DataFrame is never modified.
    if 'd_timer' in origination_df.columns:
        d_timer = pd.to_numeric(origination_df['d_timer'], errors='coerce')
        # Unparseable/missing values take the column median (0 if all missing).
        fill_value = 0 if d_timer.isnull().all() else d_timer.median()
        d_timer = d_timer.fillna(fill_value)
        print(f"- d_timer min: {d_timer.min():.2f}, max: {d_timer.max():.2f}")
    else:
        print("'d_timer' column not found in DataFrame. Cannot filter by d_timer.")
        # A value above the threshold keeps every row.
        d_timer = pd.Series(d_timer_threshold + 1, index=origination_df.index)

    # Positional (NumPy) mask and values: no index alignment is involved.
    keep_mask = (d_timer >= d_timer_threshold).to_numpy()
    filtered_df = origination_df.loc[keep_mask].copy()
    filtered_df['d_timer'] = d_timer.to_numpy()[keep_mask]
    print(f"Nodes after d_timer filtering: {len(filtered_df):,}")

    if filtered_df.empty:
        print(f"No nodes found for {graph_type} graph with d_timer >= {d_timer_threshold}")
        return None

    # Node ids are the row positions after filtering.
    final_nodes = filtered_df.reset_index(drop=True)
    final_nodes['node_idx'] = final_nodes.index

    feature_cols = ['fico','if_fthb','cnt_borr','cnt_units','dti','ltv',
                    'orig_upb','loan_term','int_rt','if_prim_res','if_corr','if_sf','if_purc']
    if 'current_upb' in final_nodes.columns:
        feature_cols.append('current_upb')

    # Only columns actually present contribute to the feature matrix.
    existing_features = [f for f in feature_cols if f in final_nodes.columns]

    for col in existing_features:
        final_nodes[col] = pd.to_numeric(final_nodes[col], errors='coerce').fillna(0)

    x = torch.tensor(final_nodes[existing_features].values, dtype=torch.float, device=device)
    print(f"Node features shape: {x.shape} | {x.device}")

    print("Preparing edge groups...")
    if 'area' not in final_nodes.columns or 'provider' not in final_nodes.columns:
        print("'area' or 'provider' column missing. Cannot create grouping edges. Proceeding with no edges.")
        area_groups = []
        provider_groups = []
        area_provider_groups = []
    else:
        area_groups = [g['node_idx'].tolist() for _, g in final_nodes.groupby('area')]
        provider_groups = [g['node_idx'].tolist() for _, g in final_nodes.groupby('provider')]
        area_provider_groups = [g['node_idx'].tolist() for _, g in final_nodes.groupby(['area', 'provider'])]

    print("Building edges...")

    # (heading, summary label, debug prefix, groups, relation id) per relation.
    edge_specs = (
        ("AREA", "Area", "[AREA]", area_groups, EDGE_TYPE_AREA),
        ("PROVIDER", "Provider", "[PROVIDER]", provider_groups, EDGE_TYPE_PROVIDER),
        ("AREA+PROVIDER", "Area+Provider", "[A+P]", area_provider_groups, EDGE_TYPE_AREA_PROVIDER),
    )

    edges_to_concat = []
    for heading, label, prefix, groups, relation_id in edge_specs:
        print(f"\n### Creating {heading} edges ###")
        relation_edges, relation_attrs = create_edges(
            groups, relation_id, NUM_EDGE_TYPES, debug_prefix=prefix, max_edges_per_group=20
        )
        print(f"{label} edges created: {relation_edges.shape[1]:,}")
        if relation_edges.shape[1] > 0:
            # Stack (src, dst) on top of the attribute columns for the dedup
            # below. float64 represents integer ids exactly up to 2**53 and
            # every float32 attribute exactly; a plain long/float32 concat
            # would promote the ids to float32 and round them above 2**24.
            edges_to_concat.append(torch.cat(
                [relation_edges.to(torch.float64), relation_attrs.T.to(torch.float64)], dim=0
            ))

    if not edges_to_concat:
        print("No edges created across all types. Returning graph with no edges.")
        edge_index = torch.empty((2,0), dtype=torch.long, device=device)
        edge_attr = torch.empty((0, TOTAL_EDGE_DIM), dtype=torch.float32, device=device)
    else:
        all_combined_edges_pre_dedup = torch.cat(edges_to_concat, dim=1).T
        print(f"Total edges (before unique_with_type): {all_combined_edges_pre_dedup.shape[0]:,}")

        # Rows are equal only if source, target, relation type and group-size
        # feature all match.
        unique_combined_edges = torch.unique(all_combined_edges_pre_dedup, dim=0)
        # Restore the dtypes produced by create_edges.
        edge_index = unique_combined_edges[:, :2].T.long().contiguous()
        edge_attr = unique_combined_edges[:, 2:].to(torch.float32)

        print(f"Final unique edges (with type and feature distinction): {edge_index.shape[1]:,}")

    if 'default' in final_nodes.columns:
        y = torch.tensor(final_nodes['default'].values, dtype=torch.long, device=device)
    else:
        print("'default' column not found for target. Using dummy target of zeros.")
        y = torch.zeros(len(final_nodes), dtype=torch.long, device=device)

    graph_data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    # Set explicitly so the node count is recorded on the graph object itself.
    graph_data.num_nodes = len(final_nodes)

    print(f"Graph constructed | Nodes: {graph_data.num_nodes:,} | Edges: {graph_data.edge_index.shape[1]:,}")
    print(f"Edge attributes shape: {graph_data.edge_attr.shape} (Expected: (num_edges, {TOTAL_EDGE_DIM}))")

    return graph_data

def _save_graph(graph, filename, label, not_built_message):
    """Save ``graph`` under OUTPUT_GRAPH_DIR and return its path, or None.

    When ``graph`` is None nothing is written and ``not_built_message`` is
    printed instead.
    """
    if graph is None:
        print(not_built_message)
        return None
    path = os.path.join(OUTPUT_GRAPH_DIR, filename)
    torch.save(graph, path)
    print(f"Saved {label} to {path}")
    return path


# Script entry point: build and save the train, test and July explanation graphs.
if __name__ == "__main__":
    print("\nLoading data for graph construction...")

    train_graph_path = os.path.join(CLEANED_DATA_DIR, 'df_origination_train_graph_scaled.csv')
    test_graph_path = os.path.join(CLEANED_DATA_DIR, 'df_origination_test_graph_scaled.csv')

    try:
        df_train_graph = pd.read_csv(train_graph_path)
        df_test_graph = pd.read_csv(test_graph_path)
        print(f"Train graph data: {df_train_graph.shape} | Test graph data: {df_test_graph.shape}")
    except FileNotFoundError as e:
        print("Error loading graph data CSVs.")
        print(f"Details: {e}")
        # Nothing can be built without the inputs; stop with a failure status.
        raise SystemExit(1)

    saved_paths = []

    print("\nBuilding training graph (2015 Jan-Jun)...")
    train_graph = create_loan_graph(df_train_graph, GRAPH_D_TIMER_THRESHOLD, 'train')
    saved_paths.append(_save_graph(
        train_graph, 'train_graph.pt', 'training graph',
        "Training graph was not built (likely due to no nodes).",
    ))

    print("\nBuilding standard test graph (2016 Jan-Jun)...")
    test_graph = create_loan_graph(df_test_graph, GRAPH_D_TIMER_THRESHOLD, 'test')
    saved_paths.append(_save_graph(
        test_graph, 'test_graph.pt', 'standard test graph',
        "Standard test graph was not built.",
    ))

    print("\nLoading data for July explanation graph construction...")

    explanation_data_path = os.path.join(CLEANED_DATA_DIR, 'df_origination_test_explanation_scaled.csv')
    try:
        df_explanation_raw = pd.read_csv(explanation_data_path)
        print(f"Raw explanation data loaded: {df_explanation_raw.shape}")
    except FileNotFoundError as e:
        print("Error loading explanation data CSV.")
        print(f"Details: {e}")
        # Best effort: the train/test graphs are still valid without this one.
        df_explanation_raw = None

    if df_explanation_raw is None:
        print("Skipping July explanation graph: explanation data could not be loaded.")
    elif 'month' in df_explanation_raw.columns and 'year' in df_explanation_raw.columns:
        df_explanation_raw['month'] = pd.to_numeric(df_explanation_raw['month'], errors='coerce')
        df_explanation_raw['year'] = pd.to_numeric(df_explanation_raw['year'], errors='coerce')
        df_july_originations = df_explanation_raw[
            (df_explanation_raw['year'] == 2016) &
            (df_explanation_raw['month'] == 7)
        ].copy()

        print(f"Filtered for 2016 July originations: {df_july_originations.shape}")

        if df_july_originations.empty:
            print("No 2016 July originated loans found in explanation dataset. Cannot build July explanation graph.")
        else:
            print("\nBuilding July explanation graph (2016 July Originations)...")
            # This graph is built with a d_timer threshold of 0 rather than
            # GRAPH_D_TIMER_THRESHOLD.
            july_explanation_graph = create_loan_graph(df_july_originations,
                                                       d_timer_threshold=0,
                                                       graph_type='july_explanation')
            saved_paths.append(_save_graph(
                july_explanation_graph, 'july_explanation_graph.pt', 'July explanation graph',
                "July explanation graph was not built (likely due to no nodes).",
            ))
    else:
        print("'month' or 'year' columns not found in df_origination_test_explanation_scaled.csv. Cannot filter for July originations.")

    num_saved = sum(path is not None for path in saved_paths)
    print(f"\nDone! {num_saved} graph(s) saved to: {OUTPUT_GRAPH_DIR}")