# Vehicle Speed Aggregation: Moving Average Strategy

Specifics:
* lookback `window` in seconds can be tweaked
* speed aggregation is chosen via `agg` (`median` or `mean`); the cell below is currently set to `mean`
* ETA: jojie-portless ~7 hours
* new columns on csv: 
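    - `num_periods` - the number of points that fall within the chosen lookback `window` (in seconds) before a reference point, including the reference point itself
    - `recomputed_speed` - the aggregated (`mean` or `median`) speed of the points counted in `num_periods`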
* save directory: `dataset/per-vehicle-moving-average/<agg>/window-<some window choice>`

In [14]:
import shap
import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from glob import glob
import os
import optuna
import joblib
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [15]:
import psutil

# Report the machine's total physical memory, converted from bytes to GiB.
ram_gb = psutil.virtual_memory().total / (1024 ** 3)
print(f"RAM: {ram_gb:.2f} GB")

RAM: 251.79 GB


In [16]:
# Tag embedded in the dataset file names; selects which export to load
# (presumably a month+day stamp, MMDD -- TODO confirm).
data_monthdate = '0328' # new data
# Load the test / train / retrain CSVs for that tag from the `dataset/` folder.
test = pd.read_csv(f'dataset/ncc_{data_monthdate}_lgb_test.csv')
train = pd.read_csv(f'dataset/ncc_{data_monthdate}_lgb_train.csv')
retrain = pd.read_csv(f'dataset/ncc_{data_monthdate}_lgb_retrain.csv')

In [17]:
# Stack the three splits into one frame, parse `time` into datetimes, order the
# rows chronologically, and use `time` as the index.
df = (
    pd.concat([test, train, retrain])
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .sort_values(by='time')
    .set_index('time')
)

In [18]:
# Group the time-indexed rows by `vehicle_id` so each vehicle can be processed
# on its own.
per_vehicle_grouper = df.groupby('vehicle_id')

In [20]:
def get_periods(row, ref_index, thresh):
    """Count the timestamps in `ref_index` inside the lookback window of `row`.

    Parameters
    ----------
    row : object whose `.name` is the reference timestamp (e.g. a DataFrame
        row taken from a datetime-indexed frame).
    ref_index : datetime index the reference timestamp is compared against.
    thresh : lookback window length in seconds.

    Returns
    -------
    Number of entries of `ref_index` that are at most `thresh` seconds before
    `row.name`, or equal to it.
    """
    elapsed = (row.name - ref_index).total_seconds()
    # A zero offset is the reference point itself, so it is counted as well.
    in_window = (elapsed >= 0) & (elapsed <= thresh)
    return np.sum(in_window)

def get_recomp_speeds(row, group, thresh, agg='mean'):
    """Aggregate `vehicle_speed` over the lookback window ending at `row`.

    Parameters
    ----------
    row : object whose `.name` is the reference timestamp (e.g. a row of
        `group`).
    group : DataFrame with a datetime index and a `vehicle_speed` column.
    thresh : lookback window length in seconds.
    agg : aggregation to apply, either 'mean' or 'median'.

    Returns
    -------
    The mean or median of `vehicle_speed` over the rows of `group` whose
    timestamp is at most `thresh` seconds before `row.name`, or equal to it.

    Raises
    ------
    ValueError
        If `agg` is neither 'mean' nor 'median'.
    """
    # Reject an unknown aggregation up front with a real exception; raising a
    # bare string is itself a TypeError in Python 3.
    if agg not in ('mean', 'median'):
        raise ValueError(
            f"Enter valid agg. Expected 'mean' or 'median', got {agg!r}.")

    timedeltas = (row.name - group.index).total_seconds()
    # >= 0 includes self
    mask = (timedeltas <= thresh) & (timedeltas >= 0)
    # Select rows and column in one step instead of chained indexing.
    window_speeds = group.loc[mask, 'vehicle_speed']
    if agg == 'mean':
        return window_speeds.mean()
    return window_speeds.median()
    

In [None]:
## use this for progress_apply method:
# tqdm.pandas()

window = 60 # seconds, can be changed
agg = 'mean' # aggregation applied to the speeds in the window: 'mean' or 'median'

# The output folder depends only on `agg` and `window`, so prepare it once
# before the loop.
save_dir = f'dataset/per-vehicle-moving-average/{agg}/window-{window}'
os.makedirs(save_dir, exist_ok=True)

# Recompute speed through a "moving average" method: for every row, aggregate
# (per `agg`) the vehicle's speeds over the preceding `window` seconds, the row
# itself included. Saves one csv per vehicle with added columns for the number
# of periods and the aggregated vehicle speed.
# NOTE: every row is compared against all rows of its vehicle, so the work per
# vehicle grows quadratically with that vehicle's row count.
for name, group in tqdm(per_vehicle_grouper):
    # Work on a copy so the new columns are not assigned onto a slice of `df`.
    group = group.copy()
    ref_index = group.index
    group['num_periods'] = group.apply(
        lambda row: get_periods(row, ref_index, thresh=window), axis=1)
    # Forward the configured `agg` so the saved values match the `agg` folder
    # they are written to.
    group['recomputed_speed'] = group.apply(
        lambda row: get_recomp_speeds(row, group, thresh=window, agg=agg), axis=1)

    # save: `time` goes back to being a regular column, one file per vehicle
    save_file = os.path.join(save_dir, f"{name}.csv")
    group = group.reset_index()
    group.to_csv(save_file, index=False)

 10%|█         | 20/199 [5:17:42<37:21:23, 751.30s/it] 

# Space-time Agg Strategy (Not done)

In [None]:
# Tag embedded in the dataset file names; selects which export to load
# (presumably a month+day stamp, MMDD -- TODO confirm).
data_monthdate = '0328' # new data
# Load the test / train / retrain CSVs for that tag from the `dataset/` folder.
test = pd.read_csv(f'dataset/ncc_{data_monthdate}_lgb_test.csv')
train = pd.read_csv(f'dataset/ncc_{data_monthdate}_lgb_train.csv')
retrain = pd.read_csv(f'dataset/ncc_{data_monthdate}_lgb_retrain.csv')

In [None]:
# Stack the three splits into one frame, parse `time` into datetimes, order the
# rows chronologically, and use `time` as the index.
df = (
    pd.concat([test, train, retrain])
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .sort_values(by='time')
    .set_index('time')
)

## Code Dump

In [54]:
!ls dataset

lgb_retrain.csv		  ncc_0325_lgb_train.csv    ncc_1118_lgb_test.csv
lgb_test.csv		  ncc_0328_lgb_retrain.csv  ncc_1118_lgb_train.csv
lgb_train.csv		  ncc_0328_lgb_test.csv     ncc_1120_lgb_retrain.csv
ncc_0325_lgb_retrain.csv  ncc_0328_lgb_train.csv    ncc_1120_lgb_test.csv
ncc_0325_lgb_test.csv	  ncc_1118_lgb_retrain.csv  ncc_1120_lgb_train.csv


In [None]:
# Scratch check of the window logic on the first vehicle group only.
window = 120 # seconds
for name, group in per_vehicle_grouper:
    # Use the group's third row as the reference point.
    timedeltas = (group.iloc[2].name - group.index).total_seconds()
    # Rows at most `window` seconds before the reference point (itself included).
    mask = (timedeltas >= 0) & (timedeltas <= window)

    periods = np.sum(mask)
    print("Periods: ", periods)

    speeds_in_window = group[mask]['vehicle_speed']
    print("Agg speed: ", speeds_in_window.median())
    break

In [None]:
# filtering prior to ML
def filter_data(df):
    """Drop out-of-range rows and fill missing categorical/lane values.

    Keeps only rows where `dist_to_edge`, `accel` and `decel` are at most 20,
    `vehicle_speed` is at most 60 and `elevation` is at most 148. Missing
    `barangay` values become 'Out-of-town' and missing `lanes` values become
    1.0. Returns a new DataFrame; the input is not modified.
    """
    within_limits = (
        (df['dist_to_edge'] <= 20)
        & (df['accel'] <= 20)
        & (df['decel'] <= 20)
        # WATCH THIS: 60 kph cap (check which percentile of the speed
        # distribution 60 kph is); considered reasonable for a tricycle.
        & (df['vehicle_speed'] <= 60)
        # Based on the highest elevation of Cauayan City in meters
        # (32 to 148 range).
        & (df['elevation'] <= 148)
    )
    kept = df.loc[within_limits]
    return kept.assign(
        barangay=kept['barangay'].fillna('Out-of-town'),
        lanes=kept['lanes'].fillna(1.0),
    )