In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from utils.Bhushan_utils import *
from utils.Temp_utils import *

In [2]:
# Load the raw training and test CSVs, then report how many rows each holds.
orig_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (r"./dataset/train.csv", r"./dataset/test.csv")
)
print(orig_df.shape[0], test_df.shape[0])

25000 10000


In [3]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# shuffled split reproducible from one run to the next.
train_df, val_df = train_test_split(
    orig_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
print(train_df.shape[0], val_df.shape[0])

20000 5000


In [4]:
# Make / model imputation (original note: "No need to merge to main").
#
# Gather every distinct non-null value of the 'make' column, sorted, and hand
# that vocabulary to compile_make_pattern.
# NOTE(review): the vocabulary is taken from orig_df, so it also covers rows
# that ended up in val_df -- confirm that is intended.
unique_makes = orig_df['make'].unique()
unique_makes_list = sorted(filter(pd.notnull, unique_makes))
makes_regex = compile_make_pattern(unique_makes_list)

# Apply the same compiled pattern to both splits.
# Presumably this recovers a missing 'make' from other columns -- verify in utils.
train_df = apply_make_extraction(train_df, makes_regex)
val_df = apply_make_extraction(val_df, makes_regex)

In [5]:
# Depreciation

# Imputation using make, model and car age. Car age depends on 'manufactured',
# so calc_vehicle_age runs on both splits before the imputer is applied.
depreciation_imputer = DepreciationImputer()
train_df = calc_vehicle_age(train_df)
val_df = calc_vehicle_age(val_df)

# Fit the imputer on the training split only and reuse it for validation.
train_df = depreciation_imputer.fit_transform(train_df)
val_df = depreciation_imputer.transform(val_df)

# Min-max normalisation to [0, 1], fitted on the training split only.
# The fitted scaler is kept under its own name: the generic name `scaler` is
# rebound by the COE cell further down, which would otherwise discard these
# fitted parameters before they can be applied to test_df.
depreciation_scaler = MinMaxScaler(feature_range=(0, 1))
train_df['depreciation'] = depreciation_scaler.fit_transform(train_df[['depreciation']])
val_df['depreciation'] = depreciation_scaler.transform(val_df[['depreciation']])

In [6]:
# Transmission

# Binary encoding through a one-hot encoder that drops the first category.
# NOTE(review): assigning the result to a single column assumes 'transmission'
# has exactly two categories (one remaining indicator column) -- TODO confirm.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoder.fit(train_df[['transmission']])

# Categories are learned from the training split and reused for validation.
train_df['transmission'] = encoder.transform(train_df[['transmission']])
val_df['transmission'] = encoder.transform(val_df[['transmission']])

In [7]:
# COE

# Cap outliers in each split.
# NOTE(review): cap_coe_outliers is called on each frame separately; if it
# derives its thresholds from the frame it receives, the validation caps may
# differ from the training ones -- confirm against the helper.
train_df = cap_coe_outliers(train_df)
val_df = cap_coe_outliers(val_df)

# Min-max normalisation to [0, 1]; the range is learned from the training
# split and then applied unchanged to both splits.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train_df[['coe']])
train_df['coe'] = scaler.transform(train_df[['coe']])
val_df['coe'] = scaler.transform(val_df[['coe']])