# Naive baseline lookup

In [1]:
import os
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob
from tqdm import tqdm
import os
import json

src_date = "2023-06-17"

In [2]:
import psutil
    
ram_gb = psutil.virtual_memory().total / 2**30 # total physical memory in bytes
print(f"RAM: {ram_gb:.2f} GB")

RAM: 251.54 GB


In [3]:
# preprocess utils

# compress dtypes
dtypes_compressed = {
    'datetime': np.object_,    
    'hour': np.int8,
    'month': np.int8,
    'dayofweek': np.int8,
    'number_of_lanes': np.float32,
    
    'speed_limit_kph': np.float32,    
    
    'lon': np.float64,
    'lat': np.float64,
    'elevation': np.int16,    
    'agg_speed': np.float64,  
    
    'pix_business_4x4_200': np.int16,
    'pix_residential_4x4_200': np.int16,
    'pix_industrial_4x4_200': np.int16,
    'pix_institutional_4x4_200': np.int16,
    'pix_business_4x4_50': np.int16,
    'pix_residential_4x4_50': np.int16,
    'pix_industrial_4x4_50': np.int16,
    'pix_institutional_4x4_50': np.int16,
    'pix_business_4x4_100': np.int16,
    'pix_residential_4x4_100': np.int16,
    'pix_industrial_4x4_100': np.int16,
    'pix_institutional_4x4_100': np.int16,
    'pix_business_4x4_150': np.int16,
    'pix_residential_4x4_150': np.int16,
    'pix_industrial_4x4_150': np.int16,
    'pix_institutional_4x4_150': np.int16,
    'pix_business_4x4_250': np.int16,
    'pix_residential_4x4_250': np.int16,
    'pix_industrial_4x4_250': np.int16,
    'pix_institutional_4x4_250': np.int16,
    'pix_business_4x4_500': np.int16,
    'pix_residential_4x4_500': np.int16,
    'pix_industrial_4x4_500': np.int16,
    'pix_institutional_4x4_500': np.int16,
    'pix_business_4x4_1000': np.int16,
    'pix_residential_4x4_1000': np.int16,
    'pix_industrial_4x4_1000': np.int16,
}

def correct_service_rd_kphlimit(temp_df):
    temp_df.loc[temp_df['speed_limit_kph'] == 36.7, 'speed_limit_kph'] = 20
    return temp_df

def impute_lanes(df):
    df['number_of_lanes'] = df['number_of_lanes'].replace(np.nan, value=2)
    return df

def filter_brgy(df):
    df['barangay'] = df['barangay'].fillna('Out-of-town')
    
    # not in cauayan boundary
    df = df.loc[df['barangay'] != 'Out-of-town'] 
    return df

def filter_num_periods(df, thresh):
    df = df.loc[df['num_periods'] >= thresh]
    return df

def filter_vehicles(df, remove_list=None):
    df = df.loc[~df.vehicle_id.isin(remove_list)]
    return df

def prepare_df(df):
    df = df.astype(dtype=dtypes_compressed)
    
    # vehicles with erratic sending of data; stopped sending data before collection date
    remove_list = [
        123200872653, 123200872678, 123200872713, 123200872727, 
        123200872819,
    ] 
    df = filter_brgy(df)
    df = filter_vehicles(df, remove_list=remove_list)
    df = filter_num_periods(df, thresh=12)
    df = impute_lanes(df)
    df = correct_service_rd_kphlimit(df)
    df = df.drop_duplicates()
    return df

## Load data

In [4]:
src_dir = f"../datasets/processed/post-review-tt-splits/{src_date}"
train = pd.read_csv(os.path.join(src_dir, "train.csv"))
train = prepare_df(train)

In [5]:
def _clean_osmid(osmid_data_col):
    return osmid_data_col.map(
        lambda x: x[0] if isinstance(x, list) else x)

def get_mean_lookup(df, target='agg_speed'):
    """Return lookup dictionary"""
    df['road_osmid'] = _clean_osmid(df['road_osmid'])
    history_lookup = df.groupby(['road_osmid', 'dayofweek', 'hour']).apply(
        lambda group: group[target].mean()).to_dict()
    return history_lookup

def _lookup_mean(history_lookup, road_osmid, dayofweek, hour):
    # transform osmid to str in cases where osmid is list of ids_
    return history_lookup.get(
        (str(road_osmid), dayofweek, hour), np.nan
    )

In [6]:
# test
history_lookup = get_mean_lookup(train.sample(1000))
_lookup_mean(history_lookup, 195152024, 2, 10)

nan

## Get baseline preds

In [7]:
target = 'agg_speed'
history_lookup = get_mean_lookup(train, target=target)

In [8]:
def get_baseline_preds(history_lookup, df):
    """Return baseline preds"""
    preds = df[['road_osmid', 'dayofweek', 'hour']].apply(
        lambda row : _lookup_mean(history_lookup, *row) , axis=1
    )
    return preds

In [9]:
test = pd.read_csv(os.path.join(src_dir, "test.csv"))
test = prepare_df(test)

In [10]:
preds = get_baseline_preds(history_lookup, test)

In [11]:
true = test['agg_speed']

## Evaluate

In [12]:
def _nan_counter(preds):
    """Return count of nan preds"""
    return preds.isna().sum()

def percent_nan(preds):
    nan_counts = _nan_counter(preds)
    return (nan_counts / preds.size) * 100 

def _clean_nans(true, preds):
    mask = ~preds.isna()
    return true.loc[mask], preds.loc[mask]

In [13]:
from sklearn.metrics import (
    r2_score, max_error, mean_absolute_error, mean_squared_error,
    mean_absolute_percentage_error, make_scorer
)

from sklearn.model_selection import TimeSeriesSplit, train_test_split

In [14]:
def rmse(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse

# rmse_scorer = make_scorer(rmse) # rmse scoring metric for cross_val

In [15]:
metrics_dict = {
    "r2" : r2_score,
    "mean_absolute_error" : mean_absolute_error, 
    "mean_absolute_percentage_error" : mean_absolute_percentage_error,     
    "rmse" : rmse,
    "mean_squared_error" : mean_squared_error,    
    "max_error": max_error,
}

In [16]:
percent_nan(preds)

5.588357681883104

In [17]:
for metric, func in metrics_dict.items():
    print(f"{metric} : {func(*_clean_nans(true, preds))}")

r2 : 0.5779677483537582
mean_absolute_error : 5.187424342219826
mean_absolute_percentage_error : 299020436521425.6
rmse : 7.212273728132856
mean_squared_error : 52.01689232951541
max_error : 55.30050021423367


## Baseline TSCV

In [18]:
target = 'agg_speed'

tscv = TimeSeriesSplit(n_splits=5)

results = {}

for i, (train_index, test_index) in enumerate(tscv.split(train)):
    
    fold_train = train.iloc[train_index]
    fold_test = train.iloc[test_index]
    
    history_lookup = get_mean_lookup(fold_train, target=target)
    
    preds = get_baseline_preds(history_lookup, fold_test)
    true = fold_test['agg_speed']
    
    
    # log per fold
    results[i] = {
        "y_true": list(true),
        "y_pred": list(preds),
        "percent_nan" : percent_nan(preds),
    }
    
    for metric, func in metrics_dict.items():
        results[i][metric] = func(*_clean_nans(true, preds))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['road_osmid'] = _clean_osmid(df['road_osmid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['road_osmid'] = _clean_osmid(df['road_osmid'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['road_osmid'] = _clean_osmid(df['road_osmid'])
A value is trying to be set on a copy of a slice from a

In [19]:
for fold, result in results.items():
    print(f"Fold: {fold}")
    for metric, score in result.items():
        if metric in ['percent_nan', 'rmse', 'r2']:
            print(f"{metric} : {score}")

Fold: 0
percent_nan : 14.24538752649958
r2 : 0.31588827547144605
rmse : 7.1521089462208245
Fold: 1
percent_nan : 14.063756445654068
r2 : 0.39374854493602274
rmse : 7.446328408059908
Fold: 2
percent_nan : 8.458497579189464
r2 : 0.6190359486720262
rmse : 7.27415598953839
Fold: 3
percent_nan : 6.310977152721936
r2 : 0.6584720245217135
rmse : 6.761758808982925
Fold: 4
percent_nan : 4.694291835598567
r2 : 0.6125717739960981
rmse : 6.644796972450685


In [20]:
tscv_results_dir = f"../datasets/results/post-review/tscv/baseline/{src_date}"
os.makedirs(tscv_results_dir, exist_ok=True)

fname = f"baseline.json"
with open(os.path.join(tscv_results_dir, fname), 'w') as fp:
    json.dump(results, fp)

In [21]:
!ls ../datasets/results/post-review/tscv/

1000-1000  150-150  250-250  50-100  50-200  50-50   all
100-100    200-200  500-500  50-150  50-250  50-500  baseline


## Evaluate on test

In [22]:
target = 'agg_speed'
history_lookup = get_mean_lookup(train, target=target)

preds = get_baseline_preds(history_lookup, test)
true = test['agg_speed']


# log per fold
final_results = {
    "percent_nan" : percent_nan(preds),
    "y_pred": list(preds),
    "y_true": list(true),

}

for metric, func in metrics_dict.items():
    final_results[metric] = func(*_clean_nans(true, preds))

In [23]:
final_results_dir = f"../datasets/results/post-review/final_results/baseline/{src_date}"
os.makedirs(final_results_dir, exist_ok=True)

fname = f"baseline.json"
with open(os.path.join(final_results_dir, fname), 'w') as fp:
    json.dump(final_results, fp)

In [24]:
!ls ../datasets/results/post-review/final_results/

1000-1000  150-150  250-250  50-100  50-200  50-50   all
100-100    200-200  500-500  50-150  50-250  50-500  baseline


In [28]:
for key, value in final_results.items():
    if key in ['percent_nan', 'rmse', 'r2']:
        print(f"{key} : {value}")

percent_nan : 5.588357681883104
r2 : 0.5779677483537582
rmse : 7.212273728132856


## End