# ML: Tuning LightGBM for Aggregated Speed Prediction

In [1]:
import shap
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, make_scorer


from glob import glob
import os
import optuna
import joblib
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from datetime import datetime

In [2]:
import psutil
    
ram_gb = psutil.virtual_memory().total / 2**30 # total physical memory in bytes
print(f"RAM: {ram_gb:.2f} GB")

RAM: 251.79 GB


In [3]:
# read data: mean
agg = 'mean'
lookback_window = 60
source_date = '2022-06-01'

glob_path = f'../datasets/per-vehicle-moving-average/{agg}-window-{lookback_window}-{source_date}/*.csv'
filepaths = glob(glob_path)

In [4]:
def correct_service_rd_kphlimit(temp_df):
    temp_df.loc[temp_df['speed_limit_kph'] == 36.7, 'speed_limit_kph'] = 20
    return temp_df

In [5]:
# get dtypes
dtypes_dict = pd.read_csv(filepaths[0]).dtypes.to_dict()

# set low_memory=False to ensure no mixed types
df = pd.concat([pd.read_csv(path, dtype=dtypes_dict) for path in filepaths])
df = correct_service_rd_kphlimit(df)

df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33817601 entries, 0 to 132806
Data columns (total 43 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   datetime               object 
 1   altitude               int64  
 2   angle                  int64  
 3   instant_speed          int64  
 4   accel                  float64
 5   ai1                    int64  
 6   ai2                    int64  
 7   ai3                    int64  
 8   ai4                    int64  
 9   alarm_code             int64  
 10  bats                   int64  
 11  decel                  float64
 12  di1                    int64  
 13  di2                    int64  
 14  di3                    int64  
 15  di4                    int64  
 16  di5                    int64  
 17  do1                    int64  
 18  do2                    int64  
 19  do3                    int64  
 20  do4                    int64  
 21  gpslev                 int64  
 22  hdop              

Unnamed: 0,datetime,altitude,angle,instant_speed,accel,ai1,ai2,ai3,ai4,alarm_code,...,hour,dayofweek,month,elevation,pix_business_4x4,pix_residential_4x4,pix_industrial_4x4,pix_institutional_4x4,num_periods,agg_speed
0,2021-02-19 09:42:48+08:00,79,127,0,0.0,0,1023,307,628,0,...,9,4,2,58,222,1111,14,503,1,0.0
1,2021-02-19 09:43:19+08:00,79,93,0,0.0,0,1023,303,628,0,...,9,4,2,58,222,1111,14,503,2,0.0
2,2021-02-19 09:43:49+08:00,79,189,0,0.0,0,1023,305,629,0,...,9,4,2,58,222,1111,14,503,2,0.0
3,2021-02-19 09:44:19+08:00,79,115,0,0.0,0,1023,306,629,0,...,9,4,2,58,222,1111,14,503,2,0.0
4,2021-02-19 09:44:47+08:00,79,13,0,0.0,0,1023,300,629,33,...,9,4,2,58,222,1111,14,503,3,0.0


In [6]:
# quick eda on some cols
def filter_data(df, thresh):
    if thresh:
        df = df.loc[df['num_periods'] >= thresh]
        return df
    else:
        return df

In [7]:
# filtered
thresh = 12
original_size = df.shape[0]
df = filter_data(df, thresh=thresh)
new_size = df.shape[0]

above_thresh_percentage = (new_size / original_size) * 100
print(f"Percentage of samples with period over {thresh}: ", above_thresh_percentage)

Percentage of samples with period over 12:  2535.5239204889663


In [8]:
# use filtered
test_size = 0.20 # 20% of data set used for evaluation, 80% as train
tuning_size = 0.20 # 20% of the training set is used for tuning


retrain, test = train_test_split(df, test_size=test_size, random_state=11)
_, tune = train_test_split(retrain, test_size=tuning_size, random_state=11)

run_date = str(datetime.now().date()) # now date
print(run_date)

save_dir = f'../datasets/inference/window-{lookback_window}-filter-{thresh}'

if not os.path.exists(save_dir):
    os.makedirs(save_dir)

test.to_csv(f'{save_dir}/{agg}-window-{lookback_window}-test-{run_date}.csv', index=False) # 20%
retrain.to_csv(f'{save_dir}/{agg}-window-{lookback_window}-retrain-{run_date}.csv', index=False) # 80% uncluding tuning set

2022-06-02


In [9]:
print("Retrain shape: ", retrain.shape)
print("Test shape: ", test.shape)
print("Tune shape: ", tune.shape)

Retrain shape:  (1067001, 43)
Test shape:  (266751, 43)
Tune shape:  (213401, 43)


In [10]:
# usecols for ML
discrete_num_cols = ['number_of_lanes', 'speed_limit_kph',]
building_types = ['residential', 'business', 'industrial', 'institutional']
pixel_size = '4x4' # can change
landuse_cols = [f'pix_{type_}_{pixel_size}' for type_ in building_types]
num_cols = landuse_cols + discrete_num_cols + ['elevation']

# cat cols
cat_cols = ['hour', 'dayofweek']
features = sorted(cat_cols + num_cols) # sort for reference in feature order
cat_cols_index = [index for (index, col) in enumerate(features) if col in cat_cols]
cat_cols_index

[0, 2]

In [11]:
# prepare data: features, target
if agg == 'instant':
    target = 'instant_speed'
else:
    target = 'agg_speed'
    
X, y = tune[features], tune[target]

# prepare kfold
kf = KFold(n_splits=10)

# prepare rmse scorer
def rmse(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse
rmse_scorer = make_scorer(rmse) # rmse scoring metric for cross_val

In [12]:
def objective(trial):
    """Objective function for optuna tuning. Uses LGBM sklearn API."""
    lgb_params = {
        'tree_learner': 'data',
        'seed':11,
        'verbose': -1,
        'boosting_type': trial.suggest_categorical(
            'boosting_type', ['goss', 'dart']),
        'objective': trial.suggest_categorical(
            'objective', ['regression', 'tweedie', 'regression_l1', 'mape']),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 100, 10000, log=True),
        'max_bin': trial.suggest_int('max_bin', 100, 300),
        'num_iterations': trial.suggest_int("num_iterations", 100, 1000),
        'learning_rate' : trial.suggest_float("learning_rate", 0.001, 1, log=True),
    }
    if lgb_params['boosting_type']=='goss':
        top_rate =  trial.suggest_uniform("top_rate", 0, 1.0)
        other_rate = trial.suggest_uniform("other_rate", 0, (1.0 - top_rate)) 

        # corrects the top_rate/other_rate values to only sum to 1
        if (top_rate + other_rate) > 1.0:
            lgb_params['other_rate'] = other_rate / (top_rate + other_rate)
            lgb_params['top_rate'] = top_rate / (top_rate + other_rate)
        

    model = lgb.LGBMRegressor(**lgb_params)
    scores = cross_val_score(estimator=model, X=X, y=y, scoring=rmse_scorer, cv=kf, n_jobs=-1, 
                             fit_params={'categorical_feature': cat_cols_index},
                             error_score='raise')
    return np.mean(scores)

In [17]:
study_name = f'{agg}-window-{lookback_window}-{run_date}'

save_dir = f'../experiments/window-{lookback_window}-filter-{thresh}/{agg}/'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
study = optuna.create_study(study_name=study_name,
                            direction='minimize',
                            storage=f'sqlite:///{save_dir}/{study_name}.db',
                            load_if_exists=True)

[32m[I 2022-06-02 18:50:45,016][0m Using an existing study with name 'mean-window-60-2022-06-02' instead of creating a new one.[0m


In [19]:
study.optimize(objective, n_trials=20)

[32m[I 2022-06-02 23:17:44,301][0m Trial 50 finished with value: 6.148724191904195 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 195, 'min_data_in_leaf': 122, 'max_bin': 132, 'num_iterations': 676, 'learning_rate': 0.03528242509791282, 'top_rate': 0.9169961941970033, 'other_rate': 0.07091695599992787}. Best is trial 0 with value: 6.146534582023514.[0m


[32m[I 2022-06-02 23:18:48,292][0m Trial 51 finished with value: 6.150076781032701 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 162, 'min_data_in_leaf': 118, 'max_bin': 142, 'num_iterations': 665, 'learning_rate': 0.03423904431337078, 'top_rate': 0.8564610933289047, 'other_rate': 0.11606691924579354}. Best is trial 0 with value: 6.146534582023514.[0m
[32m[I 2022-06-02 23:19:38,174][0m Trial 52 finished with value: 6.156528796663756 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 157, 'min_data_in_leaf': 116, 'max_bin': 142, 'num_iterations': 514, 'learning_rate': 0.0680343370522847, 'top_rate': 0.9148323668049132, 'other_rate': 0.07254748473486217}. Best is trial 0 with value: 6.146534582023514.[0m


[32m[I 2022-06-02 23:20:17,159][0m Trial 53 finished with value: 6.166322902154059 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 148, 'min_data_in_leaf': 112, 'max_bin': 130, 'num_iterations': 539, 'learning_rate': 0.08601376848679507, 'top_rate': 0.9231841881098511, 'other_rate': 0.06580522762808681}. Best is trial 0 with value: 6.146534582023514.[0m
[32m[I 2022-06-02 23:21:01,004][0m Trial 54 finished with value: 6.168928666220085 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 151, 'min_data_in_leaf': 115, 'max_bin': 112, 'num_iterations': 484, 'learning_rate': 0.09229386350341405, 'top_rate': 0.8943009563992785, 'other_rate': 0.057274193780274214}. Best is trial 0 with value: 6.146534582023514.[0m


[32m[I 2022-06-02 23:21:26,709][0m Trial 55 finished with value: 6.344079891621131 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 41, 'min_data_in_leaf': 244, 'max_bin': 144, 'num_iterations': 400, 'learning_rate': 0.06798434129373956, 'top_rate': 0.910897474822151, 'other_rate': 0.07524444275398887}. Best is trial 0 with value: 6.146534582023514.[0m




[32m[I 2022-06-02 23:26:40,158][0m Trial 56 finished with value: 7.187429184032759 and parameters: {'boosting_type': 'dart', 'objective': 'mape', 'num_leaves': 94, 'min_data_in_leaf': 161, 'max_bin': 131, 'num_iterations': 498, 'learning_rate': 0.17195853562113225}. Best is trial 0 with value: 6.146534582023514.[0m
[32m[I 2022-06-02 23:27:22,360][0m Trial 57 finished with value: 6.145078099725658 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 157, 'min_data_in_leaf': 102, 'max_bin': 116, 'num_iterations': 543, 'learning_rate': 0.05364215507648957, 'top_rate': 0.8404036603505203, 'other_rate': 0.0019159712783337751}. Best is trial 57 with value: 6.145078099725658.[0m


[32m[I 2022-06-02 23:28:12,216][0m Trial 58 finished with value: 6.157409713102524 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 108, 'min_data_in_leaf': 101, 'max_bin': 103, 'num_iterations': 667, 'learning_rate': 0.0532890478765734, 'top_rate': 0.8346372866474117, 'other_rate': 0.0815720277476892}. Best is trial 57 with value: 6.145078099725658.[0m
[32m[I 2022-06-02 23:28:54,319][0m Trial 59 finished with value: 6.667769045439508 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 66, 'min_data_in_leaf': 2245, 'max_bin': 101, 'num_iterations': 665, 'learning_rate': 0.05484324673173297, 'top_rate': 0.7328911052103765, 'other_rate': 0.054860990504618436}. Best is trial 57 with value: 6.145078099725658.[0m


[32m[I 2022-06-02 23:29:07,169][0m Trial 60 finished with value: 7.847249913741955 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 2, 'min_data_in_leaf': 3919, 'max_bin': 107, 'num_iterations': 443, 'learning_rate': 0.03505451609960645, 'top_rate': 0.827796283743702, 'other_rate': 0.023549543449573475}. Best is trial 57 with value: 6.145078099725658.[0m
[32m[I 2022-06-02 23:29:53,330][0m Trial 61 finished with value: 6.172127745656365 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 251, 'min_data_in_leaf': 157, 'max_bin': 117, 'num_iterations': 516, 'learning_rate': 0.05237009063269441, 'top_rate': 0.8399437647569737, 'other_rate': 0.15272464574136707}. Best is trial 57 with value: 6.145078099725658.[0m


[32m[I 2022-06-02 23:30:34,356][0m Trial 62 finished with value: 6.183978676608802 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 104, 'min_data_in_leaf': 102, 'max_bin': 298, 'num_iterations': 564, 'learning_rate': 0.03833735849454481, 'top_rate': 0.9549807271280363, 'other_rate': 0.02952292991655045}. Best is trial 57 with value: 6.145078099725658.[0m
[32m[I 2022-06-02 23:31:25,995][0m Trial 63 finished with value: 6.15645042869999 and parameters: {'boosting_type': 'goss', 'objective': 'tweedie', 'num_leaves': 168, 'min_data_in_leaf': 114, 'max_bin': 120, 'num_iterations': 680, 'learning_rate': 0.06595835784313814, 'top_rate': 0.8591855948719381, 'other_rate': 0.07605761913333606}. Best is trial 57 with value: 6.145078099725658.[0m


[32m[I 2022-06-02 23:32:09,460][0m Trial 64 finished with value: 6.144427720489359 and parameters: {'boosting_type': 'goss', 'objective': 'regression', 'num_leaves': 147, 'min_data_in_leaf': 206, 'max_bin': 117, 'num_iterations': 678, 'learning_rate': 0.07426186933742891, 'top_rate': 0.7342286982032082, 'other_rate': 0.050850992533279016}. Best is trial 64 with value: 6.144427720489359.[0m
[32m[I 2022-06-02 23:33:04,344][0m Trial 65 finished with value: 6.140158532947716 and parameters: {'boosting_type': 'goss', 'objective': 'regression', 'num_leaves': 166, 'min_data_in_leaf': 212, 'max_bin': 121, 'num_iterations': 689, 'learning_rate': 0.07126925567800038, 'top_rate': 0.7295747182221559, 'other_rate': 0.1278329577209684}. Best is trial 65 with value: 6.140158532947716.[0m


[32m[I 2022-06-02 23:33:40,963][0m Trial 66 finished with value: 6.181544101782207 and parameters: {'boosting_type': 'goss', 'objective': 'regression', 'num_leaves': 81, 'min_data_in_leaf': 236, 'max_bin': 118, 'num_iterations': 647, 'learning_rate': 0.10297143479823932, 'top_rate': 0.7197725992138957, 'other_rate': 0.036759083481447165}. Best is trial 65 with value: 6.140158532947716.[0m
[32m[I 2022-06-02 23:34:31,705][0m Trial 67 finished with value: 6.1756247838997735 and parameters: {'boosting_type': 'goss', 'objective': 'regression', 'num_leaves': 171, 'min_data_in_leaf': 329, 'max_bin': 123, 'num_iterations': 752, 'learning_rate': 0.07423005703969968, 'top_rate': 0.7908308437203216, 'other_rate': 0.021735645715970642}. Best is trial 65 with value: 6.140158532947716.[0m


[32m[I 2022-06-02 23:35:33,831][0m Trial 68 finished with value: 6.193876985881198 and parameters: {'boosting_type': 'goss', 'objective': 'regression', 'num_leaves': 233, 'min_data_in_leaf': 194, 'max_bin': 115, 'num_iterations': 698, 'learning_rate': 0.14233069688663197, 'top_rate': 0.7478241025312683, 'other_rate': 0.1333464150239656}. Best is trial 65 with value: 6.140158532947716.[0m






[32m[I 2022-06-02 23:51:00,653][0m Trial 69 finished with value: 6.892799538921314 and parameters: {'boosting_type': 'dart', 'objective': 'regression', 'num_leaves': 289, 'min_data_in_leaf': 479, 'max_bin': 148, 'num_iterations': 688, 'learning_rate': 0.028481629015029004}. Best is trial 65 with value: 6.140158532947716.[0m






# End