In [2]:
from LendingClubAutoencoder import preprocessing, autoencoders, training, testing

import torch

from datetime import datetime, timedelta

import os
import shutil

import json
import pickle

import polars as pl
import numpy as np
import scipy.stats

import plotly.graph_objects as go

# Model Training & Evaluation

In [3]:
# Train and evaluate the final VAE. The metrics are cached as JSON so that a
# re-run of the notebook does not repeat cross-validation and training.
evaluation_results_path = 'evaluation_results.json'

if os.path.exists(evaluation_results_path):
    with open(evaluation_results_path, 'r') as f:
        evaluation_results = json.load(f)
    print('Loaded Evaluation Results')

else:
    evaluation_results = {}

    learning_rate = 1e-3

    #Data
    lending_club_data_handler = preprocessing.DataHandler(csv_path='local_data/all_lending_club_loan_data_2007-2018.csv')
    # Report success only after the handler has actually been constructed.
    print('Loaded Data Handler')

    #Date Ranges
    train_start = datetime(2013, 1, 1)
    train_end = datetime(2017, 5, 31)

    validation_start = datetime(2017, 6, 1)
    validation_end = datetime(2017, 12, 31)

    test_year = 2018

    #Cross Validation
    print('Running Cross Validation')
    evaluation_results['cross-validation'] = testing.cross_validate_vae(lending_club_data_handler, train_start, train_end, learning_rate=learning_rate)

    print('Preparing for Final Model')
    # Remove any existing trained_models directory before the final training run.
    if os.path.exists('trained_models'):
        shutil.rmtree('trained_models')

    evaluation_results['evaluation_results'] = {}

    #Full evaluation
    train_data, train_mask = lending_club_data_handler.get_train_data(train_start, train_end)
    validation_data, validation_mask = lending_club_data_handler.get_test_data(validation_start, validation_end)

    train_loader = preprocessing.to_torch_dataloader(train_data,train_mask)
    validation_loader = preprocessing.to_torch_dataloader(validation_data,validation_mask)

    sigmoid_mask = lending_club_data_handler.get_sigmoid_feature_mask(as_torch=True)
    binary_mask = lending_club_data_handler.get_binary_feature_mask(as_torch=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Instantiate model and optimiser
    input_size = len(train_data[0])
    model = autoencoders.VariationalAutoencoder(input_size=input_size, sigmoid_mask=sigmoid_mask)
    optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)#original is 1e-3

    # Train model
    print('Training Final Model')
    training.train_variational_autoencoder(model, optimiser, train_loader, validation_loader, binary_mask=binary_mask, device=device)

    print('Evaluating Final Model')
    # The checkpoint path does not depend on the quarter, so build it once.
    # NOTE(review): presumably written by train_variational_autoencoder - confirm.
    model_file_name = f'trained_models/vae_best-input_size:{input_size}.pt'

    for quarter_counter, (start_month, end_month) in enumerate(zip([1,4,7,10], [3,6,9,12]), 1):
        test_start = datetime(test_year, start_month, 1)
        # Last day of the quarter = first day of the following month minus one
        # day (December rolls over into January of the next year).
        test_end = datetime(test_year + end_month // 12, end_month % 12 + 1, 1) - timedelta(days=1)

        test_data, test_mask = lending_club_data_handler.get_test_data(test_start, test_end)
        test_loader = preprocessing.to_torch_dataloader(test_data,test_mask)

        total_loss, rmse_loss, bce_loss = testing.test_vae(model_file_name, test_loader, sigmoid_mask, binary_mask, device)

        evaluation_results['evaluation_results'][f'q{quarter_counter}'] = {
            'total_loss': total_loss,
            'rmse_loss': rmse_loss,
            'bce_loss': bce_loss
        }

    # Use a context manager so the results file is flushed and closed.
    with open(evaluation_results_path, 'w') as f:
        json.dump(evaluation_results, f)

    # Make sure the target directory exists before persisting the handler.
    os.makedirs('trained_models', exist_ok=True)
    with open('trained_models/lending_club_data_handler.pkl', 'wb') as f:
        pickle.dump(lending_club_data_handler, f)

Loaded Evaluation Results


In [5]:
# Encode every 2018 test quarter with the trained VAE and cache the latent
# means as a CSV so later cells can analyse them without re-running the model.
vector_representations_path = 'vector_representations.csv'

# Check the vectors CSV itself (not the evaluation JSON) before reading it.
if os.path.exists(vector_representations_path):
    vectors_df = pl.read_csv(vector_representations_path)
    print('Vectors Loaded Successfuly')
else:
    # Load data handler and set up model
    try:
        # NOTE: pickle.load executes arbitrary code; only load a file that the
        # training cell of this notebook produced.
        with open('trained_models/lending_club_data_handler.pkl', 'rb') as file:
            lending_club_data_handler = pickle.load(file)
    except FileNotFoundError as err:
        raise FileNotFoundError('Data handler file not found. Please run the model training cell first.') from err

    sigmoid_mask = lending_club_data_handler.get_sigmoid_feature_mask(as_torch=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = training.get_best_model(autoencoders.VariationalAutoencoder, sigmoid_mask)

    # Batches are moved to `device` below, so the model has to live there too.
    # NOTE(review): assumes get_best_model returns a torch.nn.Module - confirm.
    model = model.to(device)
    model.eval()

    all_quarters_dfs = []
    test_year = 2018

    # Process each quarter
    for quarter_counter, (start_month, end_month) in enumerate(zip([1, 4, 7, 10], [3, 6, 9, 12]), 1):
        print(f'Processing Q{quarter_counter}...')
        test_start = datetime(test_year, start_month, 1)
        # Last day of the quarter = first day of the following month minus one
        # day (December rolls over into January of the next year).
        test_end = datetime(test_year + end_month // 12, end_month % 12 + 1, 1) - timedelta(days=1)

        test_data, test_mask = lending_club_data_handler.get_test_data(test_start, test_end)
        test_loader = preprocessing.to_torch_dataloader(test_data, test_mask)

        quarter_means = []
        with torch.no_grad():
            for data_batch, mask_batch in test_loader:
                data_batch = data_batch.to(device)
                # Only the second output of the forward pass is stored as the vector.
                _, mean = model(data_batch)
                quarter_means.append(mean.cpu().numpy())

        means_array = np.vstack(quarter_means)
        num_features = means_array.shape[1]
        feature_columns = [f'feature_{i+1}' for i in range(num_features)]

        df = pl.DataFrame(means_array, schema=feature_columns)
        df = df.with_columns(pl.lit(f'q{quarter_counter}').alias('quarter'))

        # Reorder columns to have 'quarter' first
        df = df.select(['quarter'] + feature_columns)

        all_quarters_dfs.append(df)

    # Concatenate all quarterly dataframes
    vectors_df = pl.concat(all_quarters_dfs)

    vectors_df.write_csv(vector_representations_path)
    print('Successfully saved to CSV.')


Vectors Loaded Successfuly


In [18]:

def plot_evaluation_results(evaluation_results: dict, metric_name: str):
    """Show per-quarter test scores as bars against the cross-validation mean.

    Args:
        evaluation_results: Mapping with a 'cross-validation' entry (one metric
            dict per fold) and an 'evaluation_results' entry (one metric dict
            per test quarter).
        metric_name: Key of the metric to read from every metric dict,
            e.g. 'rmse_loss'.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    # Reference level: average of the metric over all cross-validation folds.
    fold_scores = [
        fold_metrics[metric_name]
        for fold_metrics in evaluation_results['cross-validation'].values()
    ]
    cv_mean = sum(fold_scores) / len(fold_scores)

    # Final-model scores, in the order the quarters were stored.
    per_quarter = evaluation_results['evaluation_results']
    quarter_labels = list(per_quarter)
    quarter_scores = [metrics[metric_name] for metrics in per_quarter.values()]

    # Pad both ends with an empty category so the mean line extends past the
    # outermost bars; the padding is cropped out of view further down.
    padded_labels = ['buffer1', *quarter_labels, 'buffer2']
    padded_scores = [0, *quarter_scores, 0]

    readable_metric = metric_name.replace("_", " ")

    fig = go.Figure(data=[
        go.Bar(
            x=padded_labels,
            y=padded_scores,
            marker_color='black',
            name='Evaluation Results',
        ),
        go.Scatter(
            x=padded_labels,
            y=[cv_mean] * len(padded_labels),
            mode='lines',
            line=dict(color='firebrick', width=2),
            name=f'CV Mean {cv_mean:.3f}',
        ),
    ])
    fig.update_layout(
        plot_bgcolor='white',
        xaxis_title_text='Test Quarter',
        yaxis_title_text=readable_metric,
        title=f'Final Model {readable_metric.upper()}',
    )

    # Category positions are 0-based: index 0 and the last index are the
    # padding entries, so show only positions 1 .. len(quarter_labels).
    fig.update_xaxes(range=[0.5, len(quarter_labels) + 0.5])

    fig.show()

# Render one chart per reported loss component.
for metric in ('rmse_loss', 'bce_loss'):
    plot_evaluation_results(evaluation_results, metric)

In [9]:
# Wasserstein Distance Calculation and Heatmap
#
# For every pair of quarters, compute the 1-D Wasserstein distance per latent
# feature and average those distances over all features.

# get unique quarters
quarters = sorted(vectors_df['quarter'].unique())
# Take the latent columns from the frame itself rather than assuming a fixed
# latent width, so the cell keeps working if the encoder size changes.
features = [col for col in vectors_df.columns if col.startswith('feature_')]
if not features:
    raise ValueError("vectors_df contains no 'feature_*' columns")
n = len(quarters)

# collect feature arrays for each quarter
quarter_feature_arrays = [
    vectors_df.filter(pl.col('quarter') == q).select(features).to_numpy()
    for q in quarters
]

# compute pairwise Wasserstein distances (average over features)
# The distance is symmetric and zero between a sample and itself, so only the
# upper triangle is evaluated and then mirrored.
wasserstein_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        dists = [
            scipy.stats.wasserstein_distance(
                quarter_feature_arrays[i][:, f],
                quarter_feature_arrays[j][:, f],
            )
            for f in range(len(features))
        ]
        wasserstein_matrix[i, j] = wasserstein_matrix[j, i] = np.mean(dists)

# plot heatmap
fig = go.Figure(
    data=go.Heatmap(
        z=wasserstein_matrix,
        x=quarters,
        y=quarters,
        colorscale='RdBu_r',  # blue=low, red=high
        text=np.round(wasserstein_matrix, 3),  # round for readability
        texttemplate='%{text}',
        zmin=0,
        zmax=np.max(wasserstein_matrix)
    )
)
fig.update_layout(
    title='Wasserstein Distance Between Quarters',
    xaxis_title='Quarter',
    yaxis_title='Quarter'
)
fig.show()

In [11]:
# PCA visualisation of quarters in 2D
#
# Projects all latent vectors onto their first two principal components and
# draws, per quarter, the mean point plus a one-standard-deviation outline.

# import PCA if not already imported
from sklearn.decomposition import PCA

# prepare data for PCA
# Pick the latent columns by name instead of assuming a fixed latent width.
# The prefix filter also skips the 'pca1'/'pca2' columns this cell adds, so
# re-running the cell fits PCA on the same inputs.
features = [col for col in vectors_df.columns if col.startswith('feature_')]
if not features:
    raise ValueError("vectors_df contains no 'feature_*' columns")
quarters = sorted(vectors_df['quarter'].unique())

# stack all feature vectors for PCA fit
all_vectors = vectors_df.select(features).to_numpy()

# fit PCA and transform
pca = PCA(n_components=2)
all_vectors_2d = pca.fit_transform(all_vectors)

# add PCA results to DataFrame
vectors_df = vectors_df.with_columns([
    pl.Series('pca1', all_vectors_2d[:, 0]),
    pl.Series('pca2', all_vectors_2d[:, 1])
])

# compute means and standard deviations for each quarter
quarter_stats = {}
for q in quarters:
    q_df = vectors_df.filter(pl.col('quarter') == q)
    pca1 = q_df['pca1'].to_numpy()
    pca2 = q_df['pca2'].to_numpy()
    mean = np.array([pca1.mean(), pca2.mean()])
    # numpy's default std (ddof=0), i.e. the population standard deviation
    std = np.array([pca1.std(), pca2.std()])
    quarter_stats[q] = {'mean': mean, 'std': std}

# plot each quarter as dot and one-std outline
colour_map = {'q1': 'red', 'q2': 'green', 'q3': 'blue', 'q4': 'orange'}
fig = go.Figure()

# The outline parameterisation is the same for every quarter, so build it once.
theta = np.linspace(0, 2 * np.pi, 100)

for q in quarters:
    mean = quarter_stats[q]['mean']
    std = quarter_stats[q]['std']
    colour = colour_map.get(q, 'black')
    # dot for mean
    fig.add_trace(go.Scatter(
        x=[mean[0]], y=[mean[1]],
        mode='markers',
        marker=dict(size=10, color=colour),
        name=f'{q} mean',
        showlegend=True
    ))
    # 1 std dev outline: an axis-aligned ellipse with semi-axes std[0] and
    # std[1] (a circle only when both components have the same spread)
    circle_x = mean[0] + std[0] * np.cos(theta)
    circle_y = mean[1] + std[1] * np.sin(theta)
    fig.add_trace(go.Scatter(
        x=circle_x, y=circle_y,
        mode='lines',
        line=dict(color=colour, width=2, dash='dot'),
        name=f'{q} 1 std dev',
        showlegend=True
    ))

# final plot layout
fig.update_layout(
    title='PCA of Quarter Feature Vectors',
    xaxis_title='PCA 1',
    yaxis_title='PCA 2',
    plot_bgcolor='white',
    legend_title='Quarter'
)
fig.show()