# SHAP interpretation

In [5]:
import shap
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import pickle

from sklearn.metrics import (
    r2_score, max_error, mean_absolute_error, mean_squared_error,
    mean_absolute_percentage_error, make_scorer
)
from sklearn.model_selection import (
    TimeSeriesSplit, train_test_split, cross_val_score)

from glob import glob
import os
import optuna
import joblib
from tqdm import tqdm
import json

import warnings
warnings.filterwarnings('ignore')

from datetime import datetime

import psutil
    
# Date stamp used to build the dated experiment / dataset folder paths below.
src_date = "2023-06-17"

# psutil reports total physical memory in bytes; dividing by 2**30 gives GiB.
total_ram_bytes = psutil.virtual_memory().total
ram_gb = total_ram_bytes / 2**30
print(f"RAM: {ram_gb:.2f} GB")

def _change_bf_name(name):
    rename_map = {
        'all': '50-1000',
        '50-50' : '50',
        '100-100': '100',
        '150-150': '150',
        '200-200': '200',
        '250-250': '250',
        '500-500': '500',
        '1000-1000': '1000',
    }
    return rename_map.get(name, name)


# preprocess utils

# compress dtypes
# Target dtype per column; prepare_df applies this mapping with DataFrame.astype.
dtypes_compressed = {
    'datetime': np.object_,
    'hour': np.int8,
    'month': np.int8,
    'dayofweek': np.int8,
    'number_of_lanes': np.float32,

    'speed_limit_kph': np.float32,

    'lon': np.float64,
    'lat': np.float64,
    'elevation': np.int16,
    'agg_speed': np.float64,

    # Land-use `pix_*` columns: one int16 entry per (buffer size, land-use class).
    # They are generated so that no combination can be dropped by hand: the
    # hand-written literal covered 27 of the 28 and left out
    # 'pix_institutional_4x4_1000', a column the SHAP cell selects as a feature.
    # NOTE(review): assumes that column fits in int16 like its siblings - confirm.
    **{
        f'pix_{landuse}_4x4_{size}': np.int16
        for size in (200, 50, 100, 150, 250, 500, 1000)
        for landuse in ('business', 'residential', 'industrial', 'institutional')
    },
}

def correct_service_rd_kphlimit(temp_df):
    """Replace the 36.7 kph speed-limit value with 20 kph.

    NOTE(review): going by the function name, 36.7 is presumably the value
    carried by service roads - confirm against the dataset build.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Frame with a numeric 'speed_limit_kph' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # prepare_df casts this column to float32, in which 36.7 has no exact
    # representation. Matching with a tolerance avoids depending on how an
    # exact `==` would promote the Python float. NaN never matches.
    speed_limits = temp_df['speed_limit_kph'].to_numpy(dtype=float)
    is_placeholder = np.isclose(speed_limits, 36.7)
    temp_df.loc[is_placeholder, 'speed_limit_kph'] = 20
    return temp_df

def impute_lanes(df):
    """Fill missing 'number_of_lanes' values with 2.

    The column is written back into `df`, and that same object is returned.
    """
    lanes = df['number_of_lanes']
    df['number_of_lanes'] = lanes.replace(np.nan, value=2)
    return df

def filter_brgy(df):
    """Drop rows that have no barangay or are labelled 'Out-of-town'.

    Missing barangay values are first labelled 'Out-of-town'; this write goes
    into the frame that was passed in. The filtered rows are then returned.
    """
    df['barangay'] = df['barangay'].fillna('Out-of-town')

    # not in cauayan boundary
    inside_boundary = df['barangay'].ne('Out-of-town')
    return df.loc[inside_boundary]

def filter_num_periods(df, thresh):
    """Return the rows of `df` whose 'num_periods' is at least `thresh`."""
    meets_threshold = df['num_periods'].ge(thresh)
    return df.loc[meets_threshold]

def filter_vehicles(df, remove_list=None):
    """Drop the rows whose `vehicle_id` appears in `remove_list`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `vehicle_id` column.
    remove_list : list-like, optional
        Vehicle ids to exclude. None (the default) removes nothing.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose vehicle id is not in `remove_list`.
    """
    if remove_list is None:
        # Series.isin(None) raises TypeError, so the declared default used to
        # crash; treat it as an empty exclusion list instead.
        remove_list = []
    is_removed = df.vehicle_id.isin(remove_list)
    return df.loc[~is_removed]

def prepare_df(df):
    """Cast, filter and clean a raw split so it is ready for modelling.

    Steps, in order: cast columns per `dtypes_compressed`, drop rows outside
    the barangay boundary, drop the excluded vehicles, keep rows with at least
    12 periods, impute missing lane counts, correct the 36.7 kph speed limit,
    and remove duplicate rows. The cast returns a new frame, so the frame
    passed in is not the one the later steps write to.
    """
    # vehicles with erratic sending of data; stopped sending data before collection date
    erratic_vehicle_ids = [
        123200872653, 123200872678,
        123200872713, 123200872727, 123200872819,
    ]
    return (
        df.astype(dtype=dtypes_compressed)
        .pipe(filter_brgy)
        .pipe(filter_vehicles, remove_list=erratic_vehicle_ids)
        .pipe(filter_num_periods, thresh=12)
        .pipe(impute_lanes)
        .pipe(correct_service_rd_kphlimit)
        .drop_duplicates()
    )

RAM: 251.54 GB


In [6]:
# SANITY CHECK
# Load the best Optuna hyperparameters for every (feature set, buffer ablation)
# pair into `best_param_dict`, keyed '<feature set>-<buffer label>', and report
# any pair whose study database is missing.
from itertools import product

buffer_ablation = [
    'all',
    [50, 100, 150, 200, 250, 500],
    [50, 100, 150, 200, 250],
    [50, 100, 150, 200],
    [50, 100, 150],
    [50, 100],
    [1000],
    [500],
    [250],
    [200],
    [150],
    [100],
    [50],
]


etypes = [
    "all_features",
    "landuse_and_time", # queue this with the ablation
    "landuse_only",
]

exp_combis = product(etypes, buffer_ablation)

# Settings applied on top of (and overriding) each study's best parameters.
fixed_params = {
    'tree_learner': 'data',
    'seed':11,
    'verbose': -1,
    'boosting_type': 'goss',
}

best_param_dict = {}

for experiment_type, buffers in exp_combis:
    # Folder label: 'all' is used as-is, a list becomes '<first>-<last>'
    # (so a single-size list such as [250] gives '250-250').
    if buffers != 'all':
        buffers = "-".join([str(buffers[0]), str(buffers[-1])])

    # The study is stored under, and named after, the experiment type.
    study_name = experiment_type
    optuna_dir = f'../experiments/post-review/{buffers}/{src_date}'
    optuna_path = os.path.join(optuna_dir, f"{study_name}.db")

    if not os.path.exists(optuna_path):
        print(f"NO OPTUNA STUDY: {buffers}-{experiment_type}")
        # Without the file there is no study to load. Skip it so the remaining
        # combinations are still checked, rather than falling through to
        # load_study on a path that does not exist.
        continue

    study = optuna.load_study(
        study_name=study_name,
        storage=f"sqlite:///{optuna_path}"
    )
    params = study.best_params
    params.update(fixed_params)

    feature_set = experiment_type
    buffer_range = _change_bf_name(buffers)

    best_param_dict[f"{feature_set}-{buffer_range}"] = params

In [7]:
# load data
# Read each split from the dated folder and run it through prepare_df.
src_dir = f"../datasets/processed/post-review-tt-splits/{src_date}"
train, test = (
    prepare_df(pd.read_csv(os.path.join(src_dir, f"{split}.csv")))
    for split in ("train", "test")
)

In [8]:
# feature
feature_sets = ['all_features', 'landuse_only', 'landuse_and_time']
buffer_ranges = ['50-1000']

# target
target = 'agg_speed'

# features: column groups shared by every run
time_cols = ['hour', 'dayofweek']
road_cols = [
    'number_of_lanes', 'speed_limit_kph', 'elevation',
]
cat_cols = time_cols
landuse_classes = ['residential', 'institutional', 'industrial', 'business']
available_buffer_sizes = [50, 100, 150, 200, 250, 500, 1000]

# For each (feature set, buffer range): fit LightGBM with the tuned parameters,
# predict on the test split, compute SHAP values, pickle the results, and keep
# them in `shap_explanations`.
shap_explanations = []
for feature_set, buffer_range in tqdm(product(feature_sets, buffer_ranges)):
    # A label is either a single size ('250') or an inclusive 'min-max' range.
    range_parts = buffer_range.split('-')
    if len(range_parts) == 1:
        buffer_sizes = [int(range_parts[0])]
    else:
        min_size_index = available_buffer_sizes.index(int(range_parts[0]))
        max_size_index = available_buffer_sizes.index(int(range_parts[1]))
        buffer_sizes = available_buffer_sizes[min_size_index:max_size_index + 1]
    print(buffer_sizes)

    # Four land-use columns per selected buffer size.
    landuse_cols = [
        f'pix_{landuse}_4x4_{size}'
        for size in buffer_sizes
        for landuse in landuse_classes
    ]

    if feature_set == "all_features":
        features = time_cols + road_cols + landuse_cols
    elif feature_set == "landuse_and_time":
        features = time_cols + landuse_cols
    else:
        features = landuse_cols

    # Positions of the categorical (time) columns within `features`;
    # None for the land-use-only set, which has no time columns.
    if feature_set in ("all_features", "landuse_and_time"):
        cat_cols_index = [
            position for position, col in enumerate(features) if col in cat_cols
        ]
    else:
        cat_cols_index = None

    params = best_param_dict[f"{feature_set}-{buffer_range}"]

    X_train, y_train = train[features], train[target]
    X_test, y_test = test[features], test[target]

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train, categorical_feature=cat_cols_index)

    preds = model.predict(X_test)
    true = y_test.values

    explainer = shap.TreeExplainer(model)
    shap_values = explainer(X_test)
    print("SHAP res shape: ", shap_values.shape)

    final_results = {
        "y_true": list(true),
        "y_pred": list(preds),
        "feature_list": X_test.columns.tolist(),
        "shap_values": shap_values,
    }

    save_dir = f"../experiments/post-review/shap/{src_date}"
    os.makedirs(save_dir, exist_ok=True)

    save_fname = os.path.join(save_dir, f"{feature_set}-{buffer_range}-shap.pickle")
    with open(save_fname, 'wb') as handle:
        pickle.dump(final_results, handle)

    shap_explanations.append(final_results) # for plotting in this notebook

0it [00:00, ?it/s]

[50, 100, 150, 200, 250, 500, 1000]
SHAP res shape:  (266751, 33)


1it [08:16, 496.40s/it]

[50, 100, 150, 200, 250, 500, 1000]
SHAP res shape:  (266751, 28)


2it [27:30, 883.04s/it]

[50, 100, 150, 200, 250, 500, 1000]
SHAP res shape:  (266751, 30)


3it [31:40, 633.39s/it]


## End