# Data splits for cross-validation

In [4]:
import os
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob
from tqdm import tqdm
import os

# Date-like tag of the processed dataset snapshot; it selects the input folder
# and is reused further down to name the output folders for the splits.
src_date = "2023-06-17"
# Relative path of the folder whose *.csv files are loaded in the "Load data" step.
src_dir = f"../datasets/processed/post-review/{src_date}"

In [5]:
import psutil

# psutil reports total physical memory in bytes; dividing by 2**30 converts
# that figure to GiB before it is printed.
total_bytes = psutil.virtual_memory().total
ram_gb = total_bytes / 2**30
print(f"RAM: {ram_gb:.2f} GB")

RAM: 251.54 GB


## Load data

In [6]:
# # Column selection (kept for reference only): it is deliberately NOT applied, so every column is loaded for ease of evaluation.
# time_cols = ['datetime','hour', 'dayofweek']

# road_cols = [
#     'number_of_lanes', 'speed_limit_kph', 'elevation',
# ]

# vehicle_cols = [
#     'agg_speed', 
# #     'lat', 'lon', 'vehicle_id', 'num_periods', 'barangay',
# ]

# landuse_cols = []
# buffer_sizes = [50, 100, 150, 200, 250, 500, 1000]
# for size in buffer_sizes:
#     landuse_cols.extend(
#         [
#             f'pix_residential_4x4_{size}',
#             f'pix_institutional_4x4_{size}',
#             f'pix_industrial_4x4_{size}',
#             f'pix_business_4x4_{size}',
#         ]
#     )
# usecols = time_cols + road_cols + vehicle_cols + landuse_cols

# Collect every processed CSV. Sorting makes the load order (and therefore the
# row order of the combined frame) independent of the filesystem listing order.
paths = sorted(glob(os.path.join(src_dir, "*.csv")))
if not paths:
    # Fail here with a clear message instead of a KeyError on "datetime" later.
    raise FileNotFoundError(f"No CSV files found in {src_dir!r}")

# Read each file once and concatenate in a single call. Growing a DataFrame
# with pd.concat inside the loop re-copies all previously loaded rows on every
# iteration, which is quadratic in the number of files.
df = pd.concat([pd.read_csv(path) for path in tqdm(paths)])

100%|██████████| 195/195 [01:02<00:00,  3.12it/s]


In [7]:
# Parse the raw timestamp column, then order the rows chronologically so that
# the positional (unshuffled) splits below are splits in time.
df = df.assign(datetime=pd.to_datetime(df["datetime"]))
df = df.sort_values(by="datetime")

## Split according to time

In [8]:
from sklearn.model_selection import TimeSeriesSplit, train_test_split

In [9]:
# Chronological hold-out: with shuffle=False the rows keep their (time-sorted)
# order, so the final 20% of rows become the test set.
train, test = train_test_split(df, shuffle=False, test_size=0.20, random_state=42)

In [10]:
# Report (rows, columns) for each side of the hold-out split.
for caption, split in (("Train shape: ", train), ("Test shape: ", test)):
    print(caption, split.shape)

Train shape:  (1067001, 67)
Test shape:  (266751, 67)


In [11]:
# Earliest and latest timestamp of the train split, followed by the same pair
# for the test split (to check that the two ranges do not interleave).
tuple(
    bound
    for split in (train, test)
    for bound in (split["datetime"].min(), split["datetime"].max())
)

(Timestamp('2021-02-16 12:31:05+0800', tz='pytz.FixedOffset(480)'),
 Timestamp('2021-06-27 06:48:13+0800', tz='pytz.FixedOffset(480)'),
 Timestamp('2021-06-27 06:48:16+0800', tz='pytz.FixedOffset(480)'),
 Timestamp('2021-11-17 07:57:14+0800', tz='pytz.FixedOffset(480)'))

In [12]:
# Persist the hold-out split; the output folder is created on first use.
save_dir = f"../datasets/processed/post-review-tt-splits/{src_date}"
os.makedirs(save_dir, exist_ok=True)

for file_name, split in (("train.csv", train), ("test.csv", test)):
    split.to_csv(os.path.join(save_dir, file_name), index=False)

In [13]:
# Output folders, resolved once up front:
#   ts_save_dir - per-fold train/test files of the time-series cross-validation
#   save_dir    - the train/test split folder, which also receives tune.csv
# Both are created here so this cell does not rely on an earlier cell having
# made them. `save_dir` ends up pointing at the tt-splits folder, as before.
ts_save_dir = f"../datasets/processed/post-review-ts-splits/{src_date}"
save_dir = f"../datasets/processed/post-review-tt-splits/{src_date}"
os.makedirs(ts_save_dir, exist_ok=True)
os.makedirs(save_dir, exist_ok=True)

# Expanding-window CV on the training portion only: each fold trains on all
# rows before its test window, so the test window is always later in time.
tscv = TimeSeriesSplit(n_splits=3)

for i, (train_index, test_index) in enumerate(tscv.split(train)):
    # Indices are positional, hence .iloc (the concatenated index is not unique).
    fold_train = train.iloc[train_index]
    fold_test = train.iloc[test_index]

    print(f"Fold {i} train shape: ", fold_train.shape)
    print(f"Fold {i} test shape: ", fold_test.shape)

    fold_train.to_csv(
        os.path.join(ts_save_dir, f"fold-{i}-train.csv"), index=False)
    fold_test.to_csv(
        os.path.join(ts_save_dir, f"fold-{i}-test.csv"), index=False)

    # The first fold's training part doubles as the tuning set. With
    # n_splits=3 this is the earliest ~1/4 of `train` (266751 of 1067001 rows
    # in the recorded run), not 1/3.
    if i == 0:
        tune = fold_train
        tune.to_csv(os.path.join(save_dir, "tune.csv"), index=False)

Fold 0 train shape:  (266751, 67)
Fold 0 test shape:  (266750, 67)
Fold 1 train shape:  (533501, 67)
Fold 1 test shape:  (266750, 67)


KeyboardInterrupt: 

In [15]:
# Inspect the time span covered by the tuning set, oldest timestamp first.
tune["datetime"].sort_values(ascending=True)

1223   2021-02-16 12:31:05+08:00
1163   2021-02-16 12:31:11+08:00
335    2021-02-16 12:31:13+08:00
2452   2021-02-16 12:31:19+08:00
2858   2021-02-16 12:31:25+08:00
                  ...           
4298   2021-03-19 21:30:39+08:00
8481   2021-03-19 21:30:45+08:00
5477   2021-03-19 21:30:49+08:00
9511   2021-03-19 21:30:53+08:00
5819   2021-03-19 21:30:58+08:00
Name: datetime, Length: 266751, dtype: datetime64[ns, pytz.FixedOffset(480)]