In [49]:
from utils.utils import *
import pandas as pd
%load_ext autoreload
%autoreload 2
from IPython.display import display, HTML
from utils.lta_omv_scraper import *
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import time
from datetime import datetime

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Read and clean dataset

In [58]:
train_df = pd.read_csv(r"./dataset/train.csv")

# clean model
train_df['model'] = train_df['model'].apply(lambda x:x.replace('(', ''))
train_df['reg_date_dt'] = train_df['reg_date'].apply(lambda x: datetime.strptime(x, "%d-%b-%Y"))
train_df['reg_date_year'] = train_df['reg_date_dt'].apply(lambda x:x.year)
train_df['reg_date_month'] = train_df['reg_date_dt'].apply(lambda x:x.month)

### Fill missing 'make' info from title

In [59]:
make_ls = train_df['make'].dropna().unique()
compiled_regex = compile_make_pattern(make_ls)

train_df = apply_make_extraction(train_df, compiled_regex)

In [60]:
train_df['make'] = train_df['make'].fillna(train_df['extracted_make'])


In [61]:
train_df["model_make"] = train_df.model + "_" + train_df.make

### type_of_vehicle

In [62]:
# train_df.groupby(ref_col)[feature].agg(["mean"]).rename(columns={"mean": feature})[feature]

### curb_weight

In [63]:
# display(HTML(train_df.head(50).to_html()))

In [66]:
def fill_column_b_with_mean(df, column_a, column_b):
    # Calculate the mean of column B grouped by column A
    values = df.groupby(column_a)[column_b].agg(["mean"]).rename(columns={"mean": column_b})[column_b]
    
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        # If column B is NaN, fill it with the mean value of column B for the corresponding value in column A
        if pd.isna(row[column_b]):
            df.at[index, column_b] = values[row[column_a]]
    return df

In [67]:
feature = "curb_weight"
ref_col = "model_make"
new_df = fill_column_b_with_mean(train_df,ref_col,feature)
train_df['curb_weight_impute'] = new_df[feature]

### power 

In [68]:
feature = "power"
ref_col = "model_make"
new_df = fill_column_b_with_mean(train_df,ref_col,feature)
train_df['power_impute'] = new_df[feature]

### engine_cap 

In [69]:
feature = "engine_cap"
ref_col = "model_make"
new_df = fill_column_b_with_mean(train_df,ref_col,feature)
train_df['engine_cap_impute'] = new_df[feature]

### no_of_owners

### depreciation 

### dereg_value

### mileage   

### omv 

In [70]:
result_ls = get_lta_omv_data(2002, 2025)
df_lta_car_data = preprocess_lta_omv_data(result_ls)

In [71]:
def impute_using_lta_data(row):
    # print(row)
    if not np.isnan(row['omv']):
        return row['omv']
    else:
        make = row['make']
        model = row['model']
        year = row['reg_date_year']
        lookup_by_make_model_year = df_lta_car_data[(df_lta_car_data['make_clean'] == make) & (df_lta_car_data['model_split'].str.contains(model)) & (df_lta_car_data['year'] == year)]
        
        lookup_by_make_model = df_lta_car_data[(df_lta_car_data['make_clean'] == make) & (df_lta_car_data['model_split'].str.contains(model))]

        lookup_by_make = df_lta_car_data[(df_lta_car_data['make_clean'] == make)]
        
        if not lookup_by_make_model_year.empty:
            # print('lookup_by_make_model_year')
            return lookup_by_make_model_year['omv_clean'].mean()
        elif not lookup_by_make_model.empty:
            # print('lookup_by_make_model')
            return lookup_by_make_model['omv_clean'].mean()
        elif not lookup_by_make.empty:
            # print('lookup_by_make')
            return lookup_by_make['omv_clean'].mean()
        else: 
            return None

In [72]:
train_df['omv_new'] = train_df['omv']
train_df['omv_new'] = train_df.apply(impute_using_lta_data, axis=1)
train_df['omv_new'] = train_df['omv_new'].fillna(train_df['omv_new'].mean())

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 39 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   listing_id          25000 non-null  int64         
 1   title               25000 non-null  object        
 2   make                25000 non-null  object        
 3   model               25000 non-null  object        
 4   description         24320 non-null  object        
 5   manufactured        24993 non-null  float64       
 6   original_reg_date   255 non-null    object        
 7   reg_date            25000 non-null  object        
 8   type_of_vehicle     25000 non-null  object        
 9   category            25000 non-null  object        
 10  transmission        25000 non-null  object        
 11  curb_weight         24906 non-null  float64       
 12  power               23113 non-null  float64       
 13  fuel_type           5879 non-null   object    

### features 

In [73]:
train_df['cylinder_cnt'] = train_df['features'].apply(lambda x: extract_cylinder_from_features(x) if pd.notnull(x) else None)
cylinder_count_dict = extract_cylinder_by_model_make(train_df)
train_df['cylinder_cnt'] = train_df.apply(lambda x:impute_row_by_make_model(x, cylinder_count_dict, 'cylinder_cnt'), axis=1)
# fill remaining values with median of 4 cylinder (most common engine)
median_cylinder_cnt = train_df['cylinder_cnt'].median(axis=0)
train_df['cylinder_cnt'] = train_df['cylinder_cnt'].fillna(median_cylinder_cnt)

In [74]:
train_df.columns

Index(['listing_id', 'title', 'make', 'model', 'description', 'manufactured',
       'original_reg_date', 'reg_date', 'type_of_vehicle', 'category',
       'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category',
       'features', 'accessories', 'indicative_price', 'price', 'reg_date_dt',
       'reg_date_year', 'reg_date_month', 'extracted_make', 'model_make',
       'curb_weight_impute', 'power_impute', 'engine_cap_impute', 'omv_new',
       'cylinder_cnt'],
      dtype='object')

## Training models

In [75]:
# display(HTML(train_df.head(50).to_html()))

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from utils.models import Classification

# Assuming the Classification class is already imported as described
features = ['curb_weight_impute', 'power_impute', 'cylinder_cnt', 'omv_new',
       'engine_cap_impute']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train_df[features], train_df['price'], test_size=0.3, random_state=42)

# 2. Initialize the Classification class
classifier = Classification()

# 3. Define parameters for hyperparameter tuning
params = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Example for Random Forest with accuracy scoring and KFold validation
tuner, best_params, tuner_results = classifier.hyperparameter_tuning(
    df_tuple=(X_train, y_train),   # The training data (X, y)
    score_metric='accuracy',       # Scoring metric
    kfold_type='kfold',            # Type of fold (KFold used here)
    n_splits=5,                    # Number of splits for KFold
    params=params,                 # Hyperparameter grid to search
    model_name='xgb'      # Model to use (Random Forest in this case)
)

# 4. View the best hyperparameters and fit the model
print(f"Best Parameters: {best_params}")

# 5. Use the best model to predict on the test set
best_model = tuner.best_estimator_
y_pred = best_model.predict(X_test)

# 6. Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Test Set MSE: {mse}")
print(f"Test Set R2 Score: {r2}")

Model Tuning Starttime :: 2024-10-06 23:21:50.479017
Model Tuning Completed :: 2024-10-06 23:22:06.749140
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 10}
Test Set MSE: 3222797125.3751683
Test Set R2 Score: 0.8590261288206882


 nan nan nan nan nan nan nan nan nan]
Parameters: { "min_samples_split" } are not used.

