In [None]:
# ===============================================================
# Data Preprocessing
# Outputs (under ./data):
#   - train_preprocessed.csv
#   - test_preprocessed.csv
# ===============================================================

import pandas as pd
import numpy as np
from pathlib import Path

# Folder that holds both the raw inputs and the preprocessed outputs.
DATA_DIR = Path("./data")

# Raw input files.
TRAIN_PATH = DATA_DIR.joinpath("train.csv")
TEST_PATH = DATA_DIR.joinpath("test.csv")

# Destinations for the preprocessed frames.
TRAIN_PREPROCESSED_PATH = DATA_DIR.joinpath("train_preprocessed.csv")
TEST_PREPROCESSED_PATH = DATA_DIR.joinpath("test_preprocessed.csv")

# Display settings: never truncate columns, show at most 50 rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 50)

In [6]:
def preprocess(dataset: pd.DataFrame, train: bool = True) -> None:
    """Add engineered feature columns to ``dataset`` in place.

    Reads the columns ``DateTimeOfAccident``, ``DateReported``,
    ``WeeklyWages``, ``HoursWorkedPerWeek``, ``DaysWorkedPerWeek``,
    ``InitialIncurredClaimsCost`` and ``MaritalStatus`` (plus
    ``UltimateIncurredClaimCost`` when ``train`` is true).

    Columns written:
      * ``TDateTimeOfAccident`` / ``TDateReported``: the two date columns
        parsed as UTC timestamps.
      * ``accident_year``, ``accident_month``, ``accident_dow``,
        ``accident_hour``: calendar parts of the (UTC) accident timestamp.
      * ``is_weekend``: 1 when ``accident_dow`` is 5 or 6, else 0.
      * ``report_delay_days``: whole days between accident and report,
        with negative values clamped to 0.
      * ``hourly_wage``: ``WeeklyWages / HoursWorkedPerWeek`` with 0 and
        +/-inf replaced by NaN.
      * ``iicc_is_one_flag``: 1 when ``InitialIncurredClaimsCost == 1``.
      * ``inconsistent_wages_flag``: 1 when ``WeeklyWages > 0`` while
        ``HoursWorkedPerWeek == 0``.
      * ``invalid_exposure_flag``: 1 when ``DaysWorkedPerWeek > 0`` while
        ``HoursWorkedPerWeek == 0``.
      * ``iicc_small_flag``: 1 when ``InitialIncurredClaimsCost < 100``.
      * ``logIICC``: ``log1p`` of ``InitialIncurredClaimsCost``.
      * ``MaritalStatus``: missing values replaced by ``'U'``.
      * ``logUICC`` (only when ``train``): ``log1p`` of
        ``UltimateIncurredClaimCost``.

    Args:
        dataset: Frame to enrich; it is mutated, nothing is returned.
        train: When true, also derive the ``logUICC`` target column.
    """
    # --- Timestamps and calendar features -------------------------------
    dataset['TDateTimeOfAccident'] = pd.to_datetime(dataset['DateTimeOfAccident'], utc=True)
    dataset['TDateReported'] = pd.to_datetime(dataset['DateReported'], utc=True)

    accident_ts = dataset['TDateTimeOfAccident']
    dataset['accident_year'] = accident_ts.dt.year
    dataset['accident_month'] = accident_ts.dt.month
    dataset['accident_dow'] = accident_ts.dt.dayofweek
    dataset['accident_hour'] = accident_ts.dt.hour
    # pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    dataset['is_weekend'] = dataset['accident_dow'].isin([5, 6]).astype(int)

    # A report dated before the accident yields a negative delay; clamp it to 0.
    # NaN delays (unparseable/missing dates) pass through unchanged.
    delay_days = (dataset['TDateReported'] - accident_ts).dt.days
    dataset['report_delay_days'] = delay_days.clip(lower=0)

    # --- Wage features --------------------------------------------------
    # Zero hours produce +/-inf and zero wages produce 0; both become NaN.
    hourly_wage = dataset['WeeklyWages'] / dataset['HoursWorkedPerWeek']
    dataset['hourly_wage'] = hourly_wage.replace([0, np.inf, -np.inf], np.nan)

    # --- Data-quality flags ---------------------------------------------
    # The flags are built from row-aligned boolean masks so they always have
    # one value per row, whatever subset of rows satisfies each condition.
    zero_hours = dataset['HoursWorkedPerWeek'] == 0
    dataset['iicc_is_one_flag'] = (dataset['InitialIncurredClaimsCost'] == 1).astype(int)
    dataset['inconsistent_wages_flag'] = ((dataset['WeeklyWages'] > 0) & zero_hours).astype(int)
    dataset['invalid_exposure_flag'] = ((dataset['DaysWorkedPerWeek'] > 0) & zero_hours).astype(int)
    dataset['iicc_small_flag'] = (dataset['InitialIncurredClaimsCost'] < 100).astype(int)

    dataset['logIICC'] = np.log1p(dataset['InitialIncurredClaimsCost'])

    # --- Missing values -------------------------------------------------
    dataset['MaritalStatus'] = dataset['MaritalStatus'].fillna('U')

    # --- Target (training data only) ------------------------------------
    if train:
        dataset['logUICC'] = np.log1p(dataset['UltimateIncurredClaimCost'])

In [7]:
# Read the raw train and test CSV files (train first, then test).
df_train, df_test = (
    pd.read_csv(raw_csv, delimiter=",") for raw_csv in (TRAIN_PATH, TEST_PATH)
)

# Enrich both frames in place; only the training frame gets the target column.
preprocess(df_train, train=True)
preprocess(df_test, train=False)

# Persist the enriched frames without writing the index column.
df_train.to_csv(path_or_buf=TRAIN_PREPROCESSED_PATH, index=False)
df_test.to_csv(path_or_buf=TEST_PREPROCESSED_PATH, index=False)