# Vehicle Speed Aggregation: Moving Average Strategy

* Save directory: `../datasets/per-vehicle-moving-average/<aggregation type>/window-<window length in seconds>-<date string>` (one CSV per vehicle)

In [11]:
import shap
import pandas as pd
import numpy as np

import time

import seaborn as sns

import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

from glob import glob
import os
import optuna
import joblib
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import multiprocessing as mp
from datetime import datetime
# Today's date as an ISO-style string; used to tag output directories.
today = datetime.now().date()
date = str(today)
print(date)

2022-05-24


In [4]:
import psutil

# virtual_memory().total is the total physical memory in bytes;
# dividing by 2**30 expresses it in GiB for the printout.
total_bytes = psutil.virtual_memory().total
ram_gb = total_bytes / 2**30
print(f"RAM: {ram_gb:.2f} GB")

RAM: 251.79 GB


## Read from source

In [6]:
# Load the test / train / retrain CSVs for one data drop and stack them
# into a single frame.
data_monthdate = '0328' # new data
test_path = f'../datasets/ncc_{data_monthdate}_lgb_test.csv'
train_path = f'../datasets/ncc_{data_monthdate}_lgb_train.csv'
retrain_path = f'../datasets/ncc_{data_monthdate}_lgb_retrain.csv'

test = pd.read_csv(test_path)
train = pd.read_csv(train_path)
retrain = pd.read_csv(retrain_path)

# Row order here is test, train, retrain; the concatenated index is not reset.
df = pd.concat([test, train, retrain])

In [7]:
# Parse 'time' as datetimes, order the rows chronologically, and use
# 'time' as the index so timestamps can be subtracted directly.
df = (
    df.assign(time=pd.to_datetime(df['time']))
    .sort_values(by='time')
    .set_index('time')
)

In [8]:
def get_periods(row, ref_index, thresh):
    """Count the timestamps of ``ref_index`` inside a row's look-back window.

    The window covers every timestamp that is at most ``thresh`` seconds
    before ``row.name``. The row's own timestamp (a difference of zero)
    is counted as well.

    Parameters
    ----------
    row : row object whose ``name`` is its timestamp, as passed by
        ``DataFrame.apply(..., axis=1)`` on a time-indexed frame.
    ref_index : index of timestamps to compare against.
    thresh : window length in seconds.

    Returns
    -------
    Number of timestamps in the window.
    """
    elapsed = (row.name - ref_index).total_seconds()
    # elapsed >= 0 keeps the row itself and drops later observations.
    in_window = (elapsed <= thresh) & (elapsed >= 0)
    return np.sum(in_window)

def get_agg_speed(row, group, thresh, agg='mean'):
    """Return the aggregated ``vehicle_speed`` over a row's look-back window.

    The window holds every row of ``group`` whose index timestamp is at most
    ``thresh`` seconds before ``row.name``; the row itself is included.

    Parameters
    ----------
    row : row object whose ``name`` is its timestamp, as passed by
        ``DataFrame.apply(..., axis=1)`` on a time-indexed frame.
    group : time-indexed DataFrame with a ``vehicle_speed`` column.
    thresh : window length in seconds.
    agg : ``'mean'`` (default) or ``'median'``.

    Returns
    -------
    The mean or median of ``vehicle_speed`` over the window.

    Raises
    ------
    ValueError
        If ``agg`` is neither ``'mean'`` nor ``'median'``.
    """
    # Validate first. Raising a bare string is not allowed in Python 3 and
    # would surface as an unrelated TypeError instead of this message.
    if agg not in ('mean', 'median'):
        raise ValueError("Enter valid agg.")

    timedeltas = (row.name - group.index).total_seconds()
    # >= 0 includes self
    mask = (timedeltas <= thresh) & (timedeltas >= 0)
    window_speeds = group[mask]['vehicle_speed']
    if agg == 'mean':
        return window_speeds.mean()
    return window_speeds.median()

## Parallelize

In [9]:
def mp_get_periods(chunk, ref_index, thresh):
    """Add a ``num_periods`` column to ``chunk`` and return the chunk.

    Each row's value is ``get_periods`` evaluated against ``ref_index`` with
    a window of ``thresh`` seconds. The column is written onto ``chunk``
    itself, and that same object is returned.
    """
    def _count_in_window(row):
        return get_periods(row, ref_index, thresh=thresh)

    chunk['num_periods'] = chunk.apply(_count_in_window, axis=1)
    return chunk

def mp_agg_speed(chunk, full_df, thresh, agg):
    """Add an ``agg_speed`` column to ``chunk`` and return the chunk.

    Each row's value is ``get_agg_speed`` evaluated against ``full_df`` (not
    just the chunk), with a window of ``thresh`` seconds and aggregation
    ``agg``. The column is written onto ``chunk`` itself.
    """
    def _speed_in_window(row):
        return get_agg_speed(row, full_df, thresh=thresh, agg=agg)

    chunk['agg_speed'] = chunk.apply(_speed_in_window, axis=1)
    return chunk

# parallelize agg function
def parallelize_get_periods(df, func, thresh):
    """Apply ``func`` to row-chunks of ``df`` in a process pool and re-join them.

    Parameters
    ----------
    df : DataFrame to process; its index is handed to every worker as the
        reference index.
    func : picklable callable invoked as ``func(chunk, ref_index, thresh)``
        that returns a DataFrame (e.g. ``mp_get_periods``).
    thresh : forwarded unchanged to ``func``.

    Returns
    -------
    The per-chunk results concatenated in the original row order.
    """
    ref_index = df.index
    # Never ask for more chunks (or workers) than there are rows: a small
    # frame would otherwise produce empty chunks and idle processes.
    n_workers = max(1, min(mp.cpu_count(), len(df)))
    # Split row positions, not the frame itself: np.array_split applied
    # directly to a DataFrame goes through the deprecated DataFrame.swapaxes.
    positions = np.array_split(np.arange(len(df)), n_workers)
    tasks = [(df.iloc[pos], ref_index, thresh) for pos in positions]
    # The context manager shuts the pool down once the results are in, so
    # repeated calls (one per vehicle) do not accumulate worker processes.
    with mp.Pool(processes=n_workers) as pool:
        df_results = pool.starmap(func, tasks)
    return pd.concat(df_results)

def parallelize_agg_speed(df, func, thresh, agg):
    """Apply ``func`` to row-chunks of ``df`` in a process pool and re-join them.

    Every worker receives its chunk plus the full ``df``, so each row can be
    compared against rows outside its own chunk.

    Parameters
    ----------
    df : DataFrame to process.
    func : picklable callable invoked as ``func(chunk, df, thresh, agg)``
        that returns a DataFrame (e.g. ``mp_agg_speed``).
    thresh : forwarded unchanged to ``func``.
    agg : forwarded unchanged to ``func``.

    Returns
    -------
    The per-chunk results concatenated in the original row order.
    """
    # Never ask for more chunks (or workers) than there are rows: a small
    # frame would otherwise produce empty chunks and idle processes.
    n_workers = max(1, min(mp.cpu_count(), len(df)))
    # Split row positions, not the frame itself: np.array_split applied
    # directly to a DataFrame goes through the deprecated DataFrame.swapaxes.
    positions = np.array_split(np.arange(len(df)), n_workers)
    tasks = [(df.iloc[pos], df, thresh, agg) for pos in positions]
    # The context manager shuts the pool down once the results are in, so
    # repeated calls (one per vehicle) do not accumulate worker processes.
    with mp.Pool(processes=n_workers) as pool:
        df_results = pool.starmap(func, tasks)
    return pd.concat(df_results)


In [None]:
# One group per distinct 'vehicle_id'; iterating yields (vehicle_id, rows) pairs.
per_vehicle_grouper = df.groupby('vehicle_id')

In [12]:
## use this for progress_apply method:
# tqdm.pandas()

lookback_window = 60 # seconds, can be changed
agg = 'mean'
# recompute speed through "moving average" method, get median/mean speed in window
# saves a new csv with columns for number of periods (num_periods) and agg vehicle speed (agg_speed)

# Output directory. The original f-string referenced an undefined name
# `window`; this cell's variable is `lookback_window`. The '../' prefix
# matches the save directory documented at the top of the notebook and the
# location the input CSVs are read from.
save_dir = f'../datasets/per-vehicle-moving-average/{agg}/window-{lookback_window}-{date}'

for name, group in tqdm(per_vehicle_grouper):

    # count of observations inside each row's look-back window
    group = parallelize_get_periods(
        df=group,
        func=mp_get_periods,
        thresh=lookback_window)

    # aggregated speed over the same window
    group = parallelize_agg_speed(
        df=group,
        func=mp_agg_speed,
        thresh=lookback_window,
        agg=agg)

    group = group.reset_index()
    # info() prints its report and returns None, so it is called directly
    # instead of being wrapped in display().
    group.info()

    # Stops after the first vehicle (single-group trial run); remove this
    # `break` to process and save every vehicle.
    break

    # prep paths
    os.makedirs(save_dir, exist_ok=True)
    save_file = os.path.join(save_dir, f"{name}.csv")

    # save
    group.to_csv(save_file, index=False)

  0%|          | 0/199 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60335 entries, 0 to 60334
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   time             60335 non-null  datetime64[ns]
 1   altitude         60335 non-null  float64       
 2   vehicle_speed    60335 non-null  float64       
 3   accel            60335 non-null  float64       
 4   decel            60335 non-null  float64       
 5   lon              60335 non-null  float64       
 6   lat              60335 non-null  float64       
 7   vehicle_id       60335 non-null  int64         
 8   osmid            60335 non-null  object        
 9   lanes            60335 non-null  float64       
 10  speed_kph        60335 non-null  float64       
 11  length           60335 non-null  float64       
 12  dist_to_edge     60335 non-null  float64       
 13  barangay         60335 non-null  object        
 14  hour             60335 non-null  int64

None

  0%|          | 0/199 [00:40<?, ?it/s]


## Serial Run (refactored)

In [None]:
## use this for progress_apply method:
# tqdm.pandas()

window = 60 # seconds, can be changed
agg = 'mean'

per_vehicle_grouper = df.groupby('vehicle_id')

# prep paths (loop-invariant, so done once up front)
# NOTE(review): this writes under '../dataset/' (singular) with no date
# suffix, unlike the save directory documented at the top of the notebook
# ('../datasets/...-<datestring>') -- confirm which location is intended.
save_dir = f'../dataset/per-vehicle-moving-average/{agg}/window-{window}'
os.makedirs(save_dir, exist_ok=True)

# recompute speed through "moving average" method, get mean/median speed in window
# saves a new csv with columns for number of periods and agg vehicle speed
for name, group in tqdm(per_vehicle_grouper):
    # Work on an explicit copy: the frame yielded by groupby is a slice of
    # df, and adding columns to it is a chained assignment.
    group = group.copy()
    ref_index = group.index
    group['num_periods'] = group.apply(
        lambda row: get_periods(row, ref_index, thresh=window), axis=1)
    # Use the configured `agg` so the values match the directory they are
    # saved under (the aggregation was previously hard-coded to 'mean').
    group['recomputed_speed'] = group.apply(
        lambda row: get_agg_speed(row, group, thresh=window, agg=agg), axis=1)

    # save
    save_file = os.path.join(save_dir, f"{name}.csv")
    group = group.reset_index()
    group.to_csv(save_file, index=False)

 10%|█         | 20/199 [5:17:42<37:21:23, 751.30s/it] 

# Space-time Aggregation Strategy (not done)

In [None]:
# Load the same three splits, this time from the local 'dataset/' directory.
data_monthdate = '0328' # new data
split_paths = {
    'test': f'dataset/ncc_{data_monthdate}_lgb_test.csv',
    'train': f'dataset/ncc_{data_monthdate}_lgb_train.csv',
    'retrain': f'dataset/ncc_{data_monthdate}_lgb_retrain.csv',
}
test = pd.read_csv(split_paths['test'])
train = pd.read_csv(split_paths['train'])
retrain = pd.read_csv(split_paths['retrain'])

In [None]:
# Stack the three splits, parse 'time', then sort chronologically and
# promote 'time' to the index.
splits = [test, train, retrain]
df = pd.concat(splits)
df['time'] = pd.to_datetime(df['time'])
df = df.sort_values(by='time').set_index('time')

## Code Dump

In [54]:
!ls dataset

lgb_retrain.csv		  ncc_0325_lgb_train.csv    ncc_1118_lgb_test.csv
lgb_test.csv		  ncc_0328_lgb_retrain.csv  ncc_1118_lgb_train.csv
lgb_train.csv		  ncc_0328_lgb_test.csv     ncc_1120_lgb_retrain.csv
ncc_0325_lgb_retrain.csv  ncc_0328_lgb_train.csv    ncc_1120_lgb_test.csv
ncc_0325_lgb_test.csv	  ncc_1118_lgb_retrain.csv  ncc_1120_lgb_train.csv


In [None]:
# Try the window logic by hand on the third row of the first vehicle group.
# Note: this rebinds the notebook-level `window` to 120 seconds.
window = 120 # seconds
for name, group in per_vehicle_grouper:
    probe_time = group.iloc[2].name
    timedeltas = (probe_time - group.index).total_seconds()

    # rows at most `window` seconds before the probe row, probe included
    mask = (timedeltas <= window) & (timedeltas >= 0)

    periods = np.sum(mask)
    print("Periods: ", periods)
    print("Agg speed: ", group[mask]['vehicle_speed'].median())
    break

In [None]:
# filtering prior to ML
def filter_data(df):
    """Return a filtered copy of ``df`` with missing categorical values filled.

    Rows are kept only when all of the following hold (rows where any of
    these columns is missing are dropped, since the comparison is False):

    * ``dist_to_edge`` <= 20
    * ``accel`` <= 20
    * ``decel`` <= 20
    * ``vehicle_speed`` <= 60
    * ``elevation`` <= 148

    In the kept rows, missing ``barangay`` becomes ``'Out-of-town'`` and
    missing ``lanes`` becomes ``1.0``. The input frame is not modified.
    """
    keep = (
        (df['dist_to_edge'] <= 20)
        & (df['accel'] <= 20)
        & (df['decel'] <= 20)
        # WATCH THIS: 60 kph is taken as a reasonable top speed for a trike.
        # TODO: determine which percentile 60 kph is in the speed distribution.
        & (df['vehicle_speed'] <= 60)
        # based on Cauayan City's highest elevation in meters (32 to 148 range)
        & (df['elevation'] <= 148)
    )
    # Explicit copy so the fills below write to an independent frame rather
    # than to a slice of the caller's data (chained assignment).
    filtered = df.loc[keep].copy()
    filtered['barangay'] = filtered['barangay'].fillna('Out-of-town')
    filtered['lanes'] = filtered['lanes'].fillna(1.0)
    return filtered