# Tuning: Moving average inference approach

In [1]:
import shap
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, make_scorer


from glob import glob
import os
import optuna
import joblib
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from datetime import datetime

date = str(datetime.date(datetime.now()))
print(date)

2022-05-23


In [2]:
import psutil
    
ram_gb = psutil.virtual_memory().total / 2**30 # total physical memory in bytes
print(f"RAM: {ram_gb:.2f} GB")

RAM: 251.79 GB


In [3]:
# import
window = 60
csv_paths = glob(f'dataset/per-vehicle-moving-average/window-{window}/*.csv')

df = pd.concat([pd.read_csv(path) for path in csv_paths])

In [4]:
# quick eda on some cols
inspect_cols = ['accel', 'decel', 'speed_kph', 'elevation']
df[inspect_cols].describe()

Unnamed: 0,accel,decel,speed_kph,elevation
count,30906320.0,30906320.0,30906320.0,30906320.0
mean,1.272281,1.446541,41.7595,58.6081
std,1.630303,1.749992,18.85087,5.345039
min,0.0,0.0,20.0,36.0
25%,0.0,0.0,30.0,55.0
50%,1.0,1.0,30.0,59.0
75%,2.0,2.0,60.0,62.0
max,20.0,20.0,80.0,207.0


In [5]:
def clean_data(df):
    df['barangay'] = df['barangay'].fillna('Out-of-town')
    df = df.loc[df['barangay'] != 'Out-of-town']
    df['lanes'] = df['lanes'].fillna(1.0)
    return df

def filter_data(df, thresh):
    df = df.loc[df['num_periods'] >= thresh]
    return df

In [6]:
df = clean_data(df)

In [7]:
above_thresh_percentage =((df['num_periods'] >= 12).sum() / df.shape[0]) * 100
print("Percentage of samples with period over 12: ", above_thresh_percentage)

Percentage of samples with period over 12:  3.27421963137833


In [8]:
df = filter_data(df, thresh=12)

In [9]:
%%time
tuning_size = 0.30 # 30% of the training set is used for tuning

retrain, tuning_sample = train_test_split(df, test_size=tuning_size, random_state=11)
train, test = train_test_split(tuning_sample, test_size=0.1, random_state=11)

save_dir = f'dataset/per-vehicle-moving-average/inference/{date}'

if not os.path.exists(save_dir):
    os.makedirs(save_dir)

test.to_csv(f'{save_dir}/window-{window}-test.csv', index=False)
train.to_csv(f'{save_dir}/window-{window}-train.csv', index=False)
retrain.to_csv(f'{save_dir}/window-{window}-retrain.csv', index=False)

CPU times: user 13.1 s, sys: 211 ms, total: 13.3 s
Wall time: 13.7 s


In [10]:
print("Test shape: ", test.shape)
print("Train shape: ", train.shape)
print("Retrain shape: ", retrain.shape)

Test shape:  (30357, 27)
Train shape:  (273213, 27)
Retrain shape:  (708330, 27)


In [11]:
# use for ML
cat_cols = ['hour', 'dayofweek']
cat_cols_index = [i for i in range(len(cat_cols))]
num_cols = ['pix_business', 'pix_residential', 'pix_industrial', 
            'lanes', 'speed_kph', 'elevation']

In [12]:
# prepare data from train: previously used old vehicle speed, 
# use recomputed speed starting 2022-05-23
X, y = train[cat_cols + num_cols], train['recomputed_speed']

# prepare kfold
kf = KFold(n_splits=10)

# prepare rmse scorer
def rmse(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse
rmse_scorer = make_scorer(rmse) # rmse scoring metric for cross_val

In [13]:
def objective(trial):
    lgb_params = {
        'boosting_type': 'goss',
        'objective':'tweedie',
        'metric' : 'rmse',
        'seed':11,
        'verbose': -1,
        'num_leaves': trial.suggest_int('num_leaves', 2, 512),
        'n_estimators': trial.suggest_int("n_estimators", 100, 1000),
        'learning_rate' : trial.suggest_float("learning_rate", 0.001, 1, log=True)
    }
    
    top_rate =  trial.suggest_uniform("top_rate", 0, 1.0)
    other_rate = trial.suggest_uniform("other_rate", 0, (1.0 - top_rate)) 
    
    # corrects the top_rate/other_rate values to only sum to 1
    if (top_rate + other_rate) > 1.0:
        lgb_params['other_rate'] = other_rate / (top_rate + other_rate)
        lgb_params['top_rate'] = top_rate / (top_rate + other_rate)
        

    model = lgb.LGBMRegressor(**lgb_params)
    scores = cross_val_score(estimator=model, X=X, y=y, scoring=rmse_scorer, cv=kf, n_jobs=-1, 
                             fit_params={'categorical_feature': cat_cols_index},
                             error_score='raise')
    return np.mean(scores)

In [15]:
study_name = f'moving-average-inference-window-{window}-{date}'

save_dir = f'per-vehicle-moving-average-experiments/inference/{date}'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
study = optuna.create_study(study_name=study_name,
                                direction='minimize',
                                storage=f'sqlite:///{save_dir}/window-{window}.db',
                                load_if_exists=True)

[32m[I 2022-05-23 16:34:34,315][0m A new study created in RDB with name: moving-average-inference-window-60-2022-05-23[0m


In [16]:
%%time
study.optimize(objective, n_trials=30)

[32m[I 2022-05-23 16:35:03,435][0m Trial 0 finished with value: 7.99574873115382 and parameters: {'num_leaves': 223, 'n_estimators': 187, 'learning_rate': 0.00310708146118461, 'top_rate': 0.35186343333523074, 'other_rate': 0.3400138860081786}. Best is trial 0 with value: 7.99574873115382.[0m
[32m[I 2022-05-23 16:35:59,366][0m Trial 1 finished with value: 6.590409393612956 and parameters: {'num_leaves': 247, 'n_estimators': 537, 'learning_rate': 0.00301407982851219, 'top_rate': 0.825893214874046, 'other_rate': 0.047036524517581495}. Best is trial 1 with value: 6.590409393612956.[0m
[32m[I 2022-05-23 16:36:16,224][0m Trial 2 finished with value: 6.238903129369373 and parameters: {'num_leaves': 261, 'n_estimators': 145, 'learning_rate': 0.12369953814778169, 'top_rate': 0.08389226230883473, 'other_rate': 0.13567619404885303}. Best is trial 2 with value: 6.238903129369373.[0m
[32m[I 2022-05-23 16:36:27,097][0m Trial 3 finished with value: 6.624398353945762 and parameters: {'num_l

[32m[I 2022-05-23 16:37:04,540][0m Trial 4 finished with value: 6.4413456510350695 and parameters: {'num_leaves': 414, 'n_estimators': 354, 'learning_rate': 0.16406575404796647, 'top_rate': 0.880538229005749, 'other_rate': 0.09870105811370543}. Best is trial 2 with value: 6.238903129369373.[0m
[32m[I 2022-05-23 16:37:29,836][0m Trial 5 finished with value: 8.360557850305927 and parameters: {'num_leaves': 416, 'n_estimators': 233, 'learning_rate': 0.001978319513962644, 'top_rate': 0.09074570740765997, 'other_rate': 0.6042139026107844}. Best is trial 2 with value: 6.238903129369373.[0m
[32m[I 2022-05-23 16:39:09,763][0m Trial 6 finished with value: 6.329576339190508 and parameters: {'num_leaves': 373, 'n_estimators': 897, 'learning_rate': 0.08532190237836396, 'top_rate': 0.295716306344311, 'other_rate': 0.46072569288050286}. Best is trial 2 with value: 6.238903129369373.[0m
[32m[I 2022-05-23 16:39:35,405][0m Trial 7 finished with value: 6.177239985133684 and parameters: {'num_

[32m[I 2022-05-23 16:40:02,452][0m Trial 8 finished with value: 7.324981911228072 and parameters: {'num_leaves': 41, 'n_estimators': 417, 'learning_rate': 0.0027898880617945585, 'top_rate': 0.6493758875345564, 'other_rate': 0.02630126457775068}. Best is trial 7 with value: 6.177239985133684.[0m
[32m[I 2022-05-23 16:41:06,171][0m Trial 9 finished with value: 6.407193258324135 and parameters: {'num_leaves': 210, 'n_estimators': 793, 'learning_rate': 0.14679739818866877, 'top_rate': 0.9615186693418317, 'other_rate': 0.0345713447176307}. Best is trial 7 with value: 6.177239985133684.[0m
[32m[I 2022-05-23 16:42:19,416][0m Trial 10 finished with value: 6.138205177088809 and parameters: {'num_leaves': 486, 'n_estimators': 642, 'learning_rate': 0.01796841812616916, 'top_rate': 0.5545783562852231, 'other_rate': 0.25278704275640446}. Best is trial 10 with value: 6.138205177088809.[0m
[32m[I 2022-05-23 16:43:42,630][0m Trial 11 finished with value: 6.133588340453553 and parameters: {'n

[32m[I 2022-05-23 16:45:05,907][0m Trial 12 finished with value: 6.135239546041646 and parameters: {'num_leaves': 512, 'n_estimators': 699, 'learning_rate': 0.01598924445782866, 'top_rate': 0.5684218372587501, 'other_rate': 0.23983828425963655}. Best is trial 11 with value: 6.133588340453553.[0m
[32m[I 2022-05-23 16:46:29,280][0m Trial 13 finished with value: 6.141124885499159 and parameters: {'num_leaves': 493, 'n_estimators': 716, 'learning_rate': 0.014499655279838144, 'top_rate': 0.6605889369072147, 'other_rate': 0.20169084927265113}. Best is trial 11 with value: 6.133588340453553.[0m
[32m[I 2022-05-23 16:48:30,605][0m Trial 14 finished with value: 6.142520576632847 and parameters: {'num_leaves': 502, 'n_estimators': 985, 'learning_rate': 0.009335112574654142, 'top_rate': 0.46211177740547826, 'other_rate': 0.3154385549002356}. Best is trial 11 with value: 6.133588340453553.[0m
[32m[I 2022-05-23 16:48:59,633][0m Trial 15 finished with value: 6.403245642007567 and parameter

[32m[I 2022-05-23 16:49:56,951][0m Trial 16 finished with value: 6.357588043210969 and parameters: {'num_leaves': 132, 'n_estimators': 785, 'learning_rate': 0.007187258622975977, 'top_rate': 0.4967699584922641, 'other_rate': 0.27368355806908123}. Best is trial 11 with value: 6.133588340453553.[0m
[32m[I 2022-05-23 16:51:01,785][0m Trial 17 finished with value: 7.718791572046657 and parameters: {'num_leaves': 321, 'n_estimators': 621, 'learning_rate': 0.0010612610675358885, 'top_rate': 0.20624198810239935, 'other_rate': 0.46832060877898535}. Best is trial 11 with value: 6.133588340453553.[0m
[32m[I 2022-05-23 16:51:51,736][0m Trial 18 finished with value: 26.789094189836693 and parameters: {'num_leaves': 460, 'n_estimators': 454, 'learning_rate': 0.738663201640586, 'top_rate': 0.5641206907194664, 'other_rate': 0.14599034366529504}. Best is trial 11 with value: 6.133588340453553.[0m
[32m[I 2022-05-23 16:53:29,949][0m Trial 19 finished with value: 6.141798016665581 and paramete

[32m[I 2022-05-23 16:54:37,070][0m Trial 20 finished with value: 6.207412906428023 and parameters: {'num_leaves': 324, 'n_estimators': 705, 'learning_rate': 0.010329149388672304, 'top_rate': 0.7542954971757253, 'other_rate': 0.16863000830552366}. Best is trial 11 with value: 6.133588340453553.[0m
[32m[I 2022-05-23 16:55:54,868][0m Trial 21 finished with value: 6.132498123038543 and parameters: {'num_leaves': 502, 'n_estimators': 653, 'learning_rate': 0.021815764111155836, 'top_rate': 0.5785556168542206, 'other_rate': 0.2543492124504017}. Best is trial 21 with value: 6.132498123038543.[0m
[32m[I 2022-05-23 16:57:17,256][0m Trial 22 finished with value: 6.148144897324218 and parameters: {'num_leaves': 510, 'n_estimators': 670, 'learning_rate': 0.03390773411879177, 'top_rate': 0.6200318303281036, 'other_rate': 0.2512663048117055}. Best is trial 21 with value: 6.132498123038543.[0m
[32m[I 2022-05-23 16:58:17,207][0m Trial 23 finished with value: 6.235787021963625 and parameters:

[32m[I 2022-05-23 17:00:57,499][0m Trial 24 finished with value: 6.208421590079649 and parameters: {'num_leaves': 439, 'n_estimators': 864, 'learning_rate': 0.0499300528624749, 'top_rate': 0.5517128236775103, 'other_rate': 0.2939751396938719}. Best is trial 21 with value: 6.132498123038543.[0m
[32m[I 2022-05-23 17:04:30,572][0m Trial 25 finished with value: 6.1467967019935825 and parameters: {'num_leaves': 373, 'n_estimators': 734, 'learning_rate': 0.021519327471146116, 'top_rate': 0.7667025597446101, 'other_rate': 0.1858038165642294}. Best is trial 21 with value: 6.132498123038543.[0m
[32m[I 2022-05-23 17:05:38,006][0m Trial 26 finished with value: 6.254534202026561 and parameters: {'num_leaves': 510, 'n_estimators': 575, 'learning_rate': 0.005305099332568694, 'top_rate': 0.22098351583015885, 'other_rate': 0.7242535135793309}. Best is trial 21 with value: 6.132498123038543.[0m
[32m[I 2022-05-23 17:06:10,145][0m Trial 27 finished with value: 6.368692695320467 and parameters:

[32m[I 2022-05-23 17:08:01,009][0m Trial 28 finished with value: 6.208176061186297 and parameters: {'num_leaves': 466, 'n_estimators': 939, 'learning_rate': 0.004901901930798224, 'top_rate': 0.6867142848708374, 'other_rate': 0.240838468884082}. Best is trial 21 with value: 6.132498123038543.[0m
[32m[I 2022-05-23 17:09:21,278][0m Trial 29 finished with value: 6.144919634521927 and parameters: {'num_leaves': 379, 'n_estimators': 749, 'learning_rate': 0.028661340178704944, 'top_rate': 0.37924507391967544, 'other_rate': 0.3263991457996601}. Best is trial 21 with value: 6.132498123038543.[0m


CPU times: user 6.7 s, sys: 6.57 s, total: 13.3 s
Wall time: 34min 43s
