# ML: Tuning LightGBM for Aggregated Speed Prediction

In [1]:
import shap
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, make_scorer


from glob import glob
import os
import optuna
import joblib
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from datetime import datetime

In [2]:
import psutil
    
ram_gb = psutil.virtual_memory().total / 2**30 # total physical memory in bytes
print(f"RAM: {ram_gb:.2f} GB")

RAM: 251.79 GB


In [3]:
# read data: mean
agg = 'mean'
lookback_window = 60
source_date = '2022-06-01'

glob_path = f'../datasets/per-vehicle-moving-average/{agg}-window-{lookback_window}-{source_date}/*.csv'
filepaths = glob(glob_path)

In [4]:
def correct_service_rd_kphlimit(temp_df):
    temp_df.loc[temp_df['speed_limit_kph'] == 36.7, 'speed_limit_kph'] = 20
    return temp_df

In [5]:
# get dtypes
dtypes_dict = pd.read_csv(filepaths[0]).dtypes.to_dict()

# set low_memory=False to ensure no mixed types
df = pd.concat([pd.read_csv(path, dtype=dtypes_dict) for path in filepaths])
df = correct_service_rd_kphlimit(df)

df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33817601 entries, 0 to 132806
Data columns (total 43 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   datetime               object 
 1   altitude               int64  
 2   angle                  int64  
 3   instant_speed          int64  
 4   accel                  float64
 5   ai1                    int64  
 6   ai2                    int64  
 7   ai3                    int64  
 8   ai4                    int64  
 9   alarm_code             int64  
 10  bats                   int64  
 11  decel                  float64
 12  di1                    int64  
 13  di2                    int64  
 14  di3                    int64  
 15  di4                    int64  
 16  di5                    int64  
 17  do1                    int64  
 18  do2                    int64  
 19  do3                    int64  
 20  do4                    int64  
 21  gpslev                 int64  
 22  hdop              

Unnamed: 0,datetime,altitude,angle,instant_speed,accel,ai1,ai2,ai3,ai4,alarm_code,...,hour,dayofweek,month,elevation,pix_business_4x4,pix_residential_4x4,pix_industrial_4x4,pix_institutional_4x4,num_periods,agg_speed
0,2021-02-19 09:42:48+08:00,79,127,0,0.0,0,1023,307,628,0,...,9,4,2,58,222,1111,14,503,1,0.0
1,2021-02-19 09:43:19+08:00,79,93,0,0.0,0,1023,303,628,0,...,9,4,2,58,222,1111,14,503,2,0.0
2,2021-02-19 09:43:49+08:00,79,189,0,0.0,0,1023,305,629,0,...,9,4,2,58,222,1111,14,503,2,0.0
3,2021-02-19 09:44:19+08:00,79,115,0,0.0,0,1023,306,629,0,...,9,4,2,58,222,1111,14,503,2,0.0
4,2021-02-19 09:44:47+08:00,79,13,0,0.0,0,1023,300,629,33,...,9,4,2,58,222,1111,14,503,3,0.0


In [6]:
# quick eda on some cols
def filter_data(df, thresh):
    df = df.loc[df['num_periods'] >= thresh]
    return df

In [7]:
# filtered
df_filtered = filter_data(df, thresh=12)

In [8]:
above_thresh_percentage =((df_filtered.shape[0]) / df.shape[0]) * 100
print("Percentage of samples with period over 12: ", above_thresh_percentage)

Percentage of samples with period over 12:  3.943958059000105


In [9]:
# use filtered
test_size = 0.20 # 20% of data set used for evaluation, 80% as train
tuning_size = 0.05 # 5% of the training set is used for tuning

retrain, test = train_test_split(df, test_size=test_size, random_state=11)
_, tune = train_test_split(retrain, test_size=tuning_size, random_state=11)

run_date = str(datetime.now().date()) # now date
print(run_date)

save_dir = f'../datasets/inference'

if not os.path.exists(save_dir):
    os.makedirs(save_dir)

test.to_csv(f'{save_dir}/{agg}-window-{lookback_window}-test-{run_date}.csv', index=False) # 20%
retrain.to_csv(f'{save_dir}/{agg}-window-{lookback_window}-retrain-{run_date}.csv', index=False) # 80% uncluding tuning set

2022-06-02


In [10]:
print("Retrain shape: ", retrain.shape)
print("Test shape: ", test.shape)
print("Tune shape: ", tune.shape)

Retrain shape:  (27054080, 43)
Test shape:  (6763521, 43)
Tune shape:  (1352704, 43)


In [11]:
# usecols for ML
discrete_num_cols = ['number_of_lanes', 'speed_limit_kph',]
building_types = ['residential', 'business', 'industrial', 'institutional']
pixel_size = '4x4'
landuse_cols = [f'pix_{type_}_{pixel_size}' for type_ in building_types]
num_cols = landuse_cols + discrete_num_cols + ['elevation']

# cat cols
cat_cols = ['hour', 'dayofweek']
features = sorted(cat_cols + num_cols) # sort for reference in feature order
cat_cols_index = [index for (index, col) in enumerate(features) if col in cat_cols]
cat_cols_index

[0, 2]

In [12]:
# prepare data: features, target
X, y = tune[features], tune['agg_speed']

# prepare kfold
kf = KFold(n_splits=10)

# prepare rmse scorer
def rmse(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse
rmse_scorer = make_scorer(rmse) # rmse scoring metric for cross_val

In [13]:
def objective(trial):
    lgb_params = {
        'tree_learner': 'data',
        'seed':11,
        'verbose': -1,
        'boosting_type': trial.suggest_categorical(
            'boosting_type', ['goss', 'dart']),
        'objective': trial.suggest_categorical(
            'objective', ['regression', 'tweedie', 'regression_l1', 'mape']),
        'num_leaves': trial.suggest_int('num_leaves', 2, 512, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 100, 10000, log=True),
        'max_bin': trial.suggest_int('max_bin', 100, 300),
        'num_iterations': trial.suggest_int("num_iterations", 100, 1000),
        'learning_rate' : trial.suggest_float("learning_rate", 0.001, 1, log=True),
    }
    if lgb_params['boosting_type']=='goss':
        top_rate =  trial.suggest_uniform("top_rate", 0, 1.0)
        other_rate = trial.suggest_uniform("other_rate", 0, (1.0 - top_rate)) 

        # corrects the top_rate/other_rate values to only sum to 1
        if (top_rate + other_rate) > 1.0:
            lgb_params['other_rate'] = other_rate / (top_rate + other_rate)
            lgb_params['top_rate'] = top_rate / (top_rate + other_rate)
        

    model = lgb.LGBMRegressor(**lgb_params)
    scores = cross_val_score(estimator=model, X=X, y=y, scoring=rmse_scorer, cv=kf, n_jobs=-1, 
                             fit_params={'categorical_feature': cat_cols_index},
                             error_score='raise')
    return np.mean(scores)

In [14]:
study_name = f'{agg}-window-{lookback_window}-{run_date}'

save_dir = f'../experiments/{agg}/'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
study = optuna.create_study(study_name=study_name,
                            direction='minimize',
                            storage=f'sqlite:///{save_dir}/{study_name}.db',
                            load_if_exists=True)

[32m[I 2022-06-02 12:33:07,548][0m A new study created in RDB with name: mean-window-60-2022-06-02[0m


In [None]:
study.optimize(objective, n_trials=20)

[32m[I 2022-06-02 12:35:33,547][0m Trial 0 finished with value: 7.8622583003588655 and parameters: {'boosting_type': 'goss', 'objective': 'regression_l1', 'num_leaves': 14, 'min_data_in_leaf': 415, 'max_bin': 261, 'num_iterations': 652, 'learning_rate': 0.29363920164598656, 'top_rate': 0.8411746438549087, 'other_rate': 0.12884792906031803}. Best is trial 0 with value: 7.8622583003588655.[0m


# End