In [1]:
from utils.kanhon_utils import *
import pandas as pd
%load_ext autoreload
%autoreload 2
from IPython.display import display, HTML
from utils.lta_omv_scraper import *
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import time
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import os

In [2]:
# Load the labelled training listings and the unlabelled test listings.
orig_df = pd.read_csv(r"./dataset/train.csv")
test_df = pd.read_csv(r"./dataset/test.csv")

# Clean model: drop stray "(" characters. The vectorised .str accessor leaves
# missing values as NaN instead of raising on a non-string cell.
orig_df['model'] = orig_df['model'].str.replace('(', '', regex=False)

# Parse the registration date (e.g. "01-Jul-2020") once, then derive year and
# month with the .dt accessor rather than a Python-level call per row.
# A missing reg_date becomes NaT here instead of raising.
orig_df['reg_date_dt'] = pd.to_datetime(orig_df['reg_date'], format="%d-%b-%Y")
orig_df['reg_date_year'] = orig_df['reg_date_dt'].dt.year
orig_df['reg_date_month'] = orig_df['reg_date_dt'].dt.month

# NOTE(review): test_df does not receive the same model clean-up or reg_date
# features in this cell — confirm whether the test pipeline handles that elsewhere.

In [3]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# shuffled split reproducible across runs.
train_df, val_df = train_test_split(
    orig_df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

### Fill missing 'make' info from title

In [4]:
# idx_val = test_df[test_df['make'].isna()].index
# test_df.loc[idx_val]

List of makes obtained from:
1. https://www.carlogos.org/car-brands-a-z/
2. https://www.kaggle.com/datasets/jahaidulislam/car-specification-dataset-1945-2020

If `make` is still missing after matching against this list, fall back to the first word of the title.

In [5]:
# Reference list of known car makes, lower-cased for matching.
make_df = pd.read_csv(r"./dataset/make.csv")
unique_makes = make_df['Make List'].unique()
make_ls = [name.lower() for name in unique_makes]

In [6]:
# Run the same make/model imputer over every split, in train -> val -> test order.
make_model_imputer = MakeModelImputer(make_ls)
train_df, val_df, test_df = [
    make_model_imputer.transform(frame)
    for frame in (train_df, val_df, test_df)
]

### Cylinder count extractor

In [7]:
# Apply the cylinder-count extractor to the train and validation splits.
cylinder_count_extractor = CylinderExtractor()
train_df, val_df = [
    cylinder_count_extractor.transform(frame)
    for frame in (train_df, val_df)
]

In [8]:
# Preview the first rows to check the columns added so far (e.g. cylinder_cnt).
train_df.head()

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,lifespan,eco_category,features,accessories,indicative_price,price,reg_date_dt,reg_date_year,reg_date_month,cylinder_cnt
23311,1260733,Isuzu NPR85,isuzu,npr85,cheapest in the market! 14 feet full canopy wi...,2019.0,,01-jul-2020,truck,premium ad car,...,30-jun-2040,uncategorized,view specs of the isuzu n series diesel,,,89800.0,2020-07-01,2020,7,
23623,1289505,Honda Fit 1.3A G (COE till 06/2024),honda,fit,200,2009.0,,12-aug-2009,hatchback,coe car,...,,uncategorized,view specs of the honda fit,,,700.0,2009-08-12,2009,8,
1020,1316182,Toyota Sienta 1.5A,toyota,sienta,non phv. excellent condition. 100% accident fr...,2018.0,,28-may-2018,mpv,"parf car, premium ad car",...,,uncategorized,"1.5l dohc 16v vvt-i engine, 7 speed cvt-i auto...",factory touchscreen audio system with reverse ...,,64800.0,2018-05-28,2018,5,
12645,1310514,Volvo V40 T4 Momentum,volvo,v40,"well maintained! the safety, reliable and dura...",2018.0,,30-aug-2019,hatchback,parf car,...,,uncategorized,powerful 2.0l 4 cylinder inline 16 valve turbo...,"keyless entry/start, cruise control, auto head...",,92800.0,2019-08-30,2019,8,4.0
1533,1247893,Kia Carens 2.0A GDI,kia,carens,rare 1 owner unit! low km done! fully maintain...,2015.0,,18-nov-2015,mpv,parf car,...,,uncategorized,view specs of the kia carens,upgraded headunit with reverse camera! factory...,,32888.0,2015-11-18,2015,11,


### Impute cylinder data using make, model

In [9]:
# Step 1: project imputer, fitted on train only and then applied to validation.
# Rows still missing cylinder_cnt at this point, per the original run notes:
# train 1479, val 334.
cylinder_imputer = CylinderImputer()
train_df = cylinder_imputer.fit_transform(train_df)
val_df = cylinder_imputer.transform(val_df)

# Step 2: many rows are still missing, so fill the remainder with the median
# learned from the training split.
imputer = SimpleImputer(strategy='median').fit(train_df[['cylinder_cnt']])
for frame in (train_df, val_df):
    frame['cylinder_cnt'] = imputer.transform(frame[['cylinder_cnt']])


In [10]:
# idx_val = train_df[train_df['cylinder_cnt'].isna()].index
# train_df.loc[idx_val]

In [11]:
# val_df.loc[idx_val]

### Impute using LTA OMV data

In [12]:
# Location of the cached LTA OMV table.
csv_filename = './dataset/lta_omv_data.csv'

In [13]:
# Reuse the cached LTA OMV table when it exists; otherwise scrape it and cache it.
if os.path.exists(csv_filename):
    df_lta_car_data = pd.read_csv(csv_filename)
else:
    result_ls = get_lta_omv_data(2002, 2025)
    df_lta_car_data = preprocess_lta_omv_data(result_ls)
    # index=False keeps the cached file's columns identical to the in-memory
    # frame. Writing the index would make later runs load an extra
    # "Unnamed: 0" column that the freshly scraped frame does not have.
    # NOTE(review): a cache file written by the previous version of this cell
    # still carries that extra column — delete it once to regenerate.
    df_lta_car_data.to_csv(csv_filename, index=False)


In [14]:
# Build the LTA-based imputer from the OMV table held in df_lta_car_data.
lta_data_imputer = LTADataImputer(df_lta_car_data)

In [15]:
# Collect the index labels of validation rows whose OMV is missing, then show them.
missing_omv_mask = val_df['omv'].isna()
idx_val = val_df.index[missing_omv_mask.to_numpy()]
val_df.loc[idx_val]

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,lifespan,eco_category,features,accessories,indicative_price,price,reg_date_dt,reg_date_year,reg_date_month,cylinder_cnt
23948,1226793,Mercedes-Benz 280S (COE till 08/2033),mercedes-benz,280,super rare. selling with special 'q' plate. ni...,1978.0,,31-aug-1978,luxury sedan,"coe car, rare & exotic, vintage cars",...,,uncategorized,very rare collectable w116 model 280s. early m...,"upgraded 20"" sports rim. facelifted edition. c...",,88800.0,1978-08-31,1978,8,6.0
19762,948629,Mercedes-Benz 280S (COE till 04/2025),mercedes-benz,280,10,1969.0,,02-may-1969,luxury sedan,"coe car, rare & exotic, low mileage car, vinta...",...,,uncategorized,,,,88888.0,1969-05-02,1969,5,6.0
20975,1317873,Suzuki LJ80 (COE till 10/2030),suzuki,lj80,immaculate condition. under-utilised classic v...,1981.0,,21-oct-1981,suv,"coe car, direct owner sale, rare & exotic, low...",...,,uncategorized,"spare tire feature, jerrycan feature, recaro s...",spare parts are easily available in malaysia a...,,135000.0,1981-10-21,1981,10,4.0
5474,1304526,Austin Mini 1000 (COE till 04/2029),austin,mini,"well sorted resto-mod mini, with many parts re...",1975.0,,03-apr-1975,others,"coe car, premium ad car, vintage cars",...,,uncategorized,,"restored bodywork , refreshed engine, brand ne...",,70888.0,1975-04-03,1975,4,4.0
3277,1302459,Porsche 911SC (COE till 11/2028),porsche,911,0,1982.0,,10-mar-1983,sports car,"coe car, rare & exotic, premium ad car, vintag...",...,,uncategorized,et brake light. removable targa soft top.,"original indoor cover, porsche classic communi...",,290800.0,1983-03-10,1983,3,6.0
4579,1331211,Mercedes-Benz 200 (COE till 06/2030),mercedes-benz,200,just arrived! direct sale! collection unit for...,1978.0,,26-dec-1978,luxury sedan,"coe car, rare & exotic, vintage cars",...,,uncategorized,,,,55000.0,1978-12-26,1978,12,4.0
13010,1258034,Morris Minor Convertible (COE till 04/2030),morris,minor,morris minor convertible with original su seri...,1961.0,,26-apr-1961,others,"coe car, rare & exotic, premium ad car, vintag...",...,,uncategorized,,aircon. radio.,,55000.0,1961-04-26,1961,4,4.0
16325,1273676,Austin Mini Club (COE till 11/2027),austin,mini,another classic has turned up in our studio. d...,1980.0,,27-dec-1980,others,"coe car, premium ad car, low mileage car, vint...",...,,uncategorized,1.0l 3 cylinder engine. 4 speed manual. it's a...,added air conditioning. tape deck player. it's...,,48800.0,1980-12-27,1980,12,3.0
4982,1292166,Volkswagen Beetle 1200 (COE till 06/2030),volkswagen,beetle,"enthusiasts will know the time, effort maintai...",1972.0,,06-apr-1972,others,"coe car, rare & exotic, consignment car, low m...",...,,uncategorized,"powered by a 1,200 cc petrol engine. transmiss...",original type 3 vw steering and accessories.,,86800.0,1972-04-06,1972,4,4.0
10042,1186350,Mercedes-Benz 280S (COE till 02/2029),mercedes-benz,280,very original and good condition. very well ke...,1980.0,,03-mar-1980,luxury sedan,"coe car, direct owner sale, low mileage car, v...",...,,uncategorized,"very rare collectable w116 model 280s, mercede...","original beautiful interior, power steering, p...",,116880.0,1980-03-03,1980,3,6.0


In [16]:
# Step 1: fill OMV from the LTA table.
# Missing rows per the original run notes — train: 49 before, 24 after;
# val: 15 before, 9 after.
train_df, val_df = [
    lta_data_imputer.transform(frame)
    for frame in (train_df, val_df)
]

# Step 2: fill whatever is still missing with the training-split median.
imputer = SimpleImputer(strategy='median').fit(train_df[['omv']])
for frame in (train_df, val_df):
    frame['omv'] = imputer.transform(frame[['omv']])

In [17]:
# import matplotlib.pyplot as plt

### Parse category

In [18]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None


In [19]:
# Fit the category parser on the training split only, then apply it to validation.
# The displayed train_df afterwards carries one 0/1 column per category label
# (e.g. 'coe car', 'parf car').
category_parser = CategoryParser()
train_df = category_parser.fit_transform(train_df) 
val_df = category_parser.transform(val_df) 

In [20]:
# Inspect the frame after category parsing (pandas elides the middle rows in the display).
train_df

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,curb_weight,power,fuel_type,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price,price,reg_date_dt,reg_date_year,reg_date_month,cylinder_cnt,-,almost new car,coe car,consignment car,direct owner sale,electric cars,hybrid cars,imported used vehicle,low mileage car,opc car,parf car,premium ad car,rare & exotic,sgcarmart warranty cars,sta evaluated car,vintage cars
23311,1260733,Isuzu NPR85,isuzu,npr85,cheapest in the market! 14 feet full canopy wi...,2019.0,,01-jul-2020,truck,premium ad car,auto,2500.0,,diesel,2999.0,2.0,14860.0,22085,,13348.0,,37994.0,1900.0,,30-jun-2040,uncategorized,view specs of the isuzu n series diesel,,,89800.0,2020-07-01,2020,7,4.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
23623,1289505,Honda Fit 1.3A G (COE till 06/2024),honda,fit,200,2009.0,,12-aug-2009,hatchback,coe car,auto,1010.0,73.0,,1339.0,5.0,,14920,885.0,123.0,,14211.0,14211.0,,,uncategorized,view specs of the honda fit,,,700.0,2009-08-12,2009,8,4.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1020,1316182,Toyota Sienta 1.5A,toyota,sienta,non phv. excellent condition. 100% accident fr...,2018.0,,28-may-2018,mpv,"parf car, premium ad car",auto,1325.0,79.0,,1496.0,2.0,14530.0,38001,682.0,25880.0,80346.0,17199.0,17199.0,,,uncategorized,"1.5l dohc 16v vvt-i engine, 7 speed cvt-i auto...",factory touchscreen audio system with reverse ...,,64800.0,2018-05-28,2018,5,4.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
12645,1310514,Volvo V40 T4 Momentum,volvo,v40,"well maintained! the safety, reliable and dura...",2018.0,,30-aug-2019,hatchback,parf car,auto,1441.0,140.0,,1969.0,1.0,15770.0,36901,1176.0,35358.0,68000.0,22799.0,23919.0,,,uncategorized,powerful 2.0l 4 cylinder inline 16 valve turbo...,"keyless entry/start, cruise control, auto head...",,92800.0,2019-08-30,2019,8,4.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1533,1247893,Kia Carens 2.0A GDI,kia,carens,rare 1 owner unit! low km done! fully maintain...,2015.0,,18-nov-2015,mpv,parf car,auto,1544.0,122.0,,1999.0,1.0,15540.0,58190,1212.0,20117.0,130000.0,21074.0,21504.0,,,uncategorized,view specs of the kia carens,upgraded headunit with reverse camera! factory...,,32888.0,2015-11-18,2015,11,4.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,1330095,BMW 3 Series 318i,bmw,318i,0,2015.0,,26-feb-2016,luxury sedan,"parf car, consignment car, sgcarmart warranty ...",auto,1425.0,100.0,,1499.0,4.0,16790.0,46970,684.0,21136.0,167000.0,27677.0,25748.0,,,uncategorized,powered by 134bhp and 1.5l twinpower turbochar...,"upgraded rims, brembo brakes, carbon steering,...",,37800.0,2016-02-26,2016,2,4.0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
5390,1300273,Nissan Qashqai 1.2A DIG-T,nissan,qashqai,"no repairs needed, buy and drive as it is! one...",2016.0,,17-jun-2016,suv,"parf car, premium ad car",auto,1285.0,85.0,,1197.0,2.0,13330.0,53694,508.0,19138.0,,18974.0,13974.0,,,uncategorized,1.2l 4 cylinder inline 16 valve dohc turbochar...,"auto headlights, multi function steering wheel...",,33688.0,2016-06-17,2016,6,4.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
860,1316903,Honda Jazz 1.3A,honda,jazz,"1 owner, maintain by honda kah motor, at dicks...",2019.0,,10-jul-2019,hatchback,parf car,auto,1052.0,73.0,,1318.0,1.0,14180.0,30009,578.0,19312.0,61600.0,16213.0,6213.0,,,uncategorized,"1.3l i-vtec engine, cvt auto transmission with...","3 day money back, certified cars, one year war...",,73800.0,2019-07-10,2019,7,4.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
15795,1329593,Ferrari 488 GTB,ferrari,488,ital unit with warranty till dec 2024. fully s...,2016.0,,03-nov-2016,sports car,"parf car, direct owner sale, rare & exotic, lo...",auto,1515.0,493.0,,3902.0,3.0,181520.0,55501,3796.0,319267.0,38000.0,300022.0,512040.0,,,uncategorized,a ferrari 488 gtb needs no extra words. view s...,"cf rear air ducts, cf under door cover,!lifter...",,649900.0,2016-11-03,2016,11,8.0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0


### Impute CO2 Emission data from SGCarMart

In [21]:
# from utils.sgcarmart_scraper import get_emission_data
# from tqdm import tqdm

In [22]:
# small_df = orig_df.loc[:50]

In [23]:
# %%time
# # orig_df['scrapped_emission_data'] = orig_df.apply(lambda x:get_emission_data(x['listing_id'], x['title']), axis=1)
# orig_df['scrapped_emission_data'] = None
# failed_idx = []
# # Iterate over each row with index
# for index, row in tqdm(orig_df.iterrows()):
#     # print(index, row)
#     try:
#         # Check if data is already scrapped to resume operation
#         if pd.isna(row['scrapped_emission_data']) or row['scrapped_emission_data'] is None:
#             # Apply the get_emission_data function and store in the DataFrame
#             orig_df.at[index, 'scrapped_emission_data'] = get_emission_data(row['listing_id'], row['title'])
#     except Exception as e:
#         print(e)
#         failed_idx.append(index)
#     # Save progress every few rows to a file 
#     if index % 100 == 0:  
#         orig_df.to_csv("progress.csv", index=False)

# # Save final progress after the loop completes
# orig_df.to_csv("final_scrapped_emission_data.csv", index=False)

In [24]:
# test_df['scrapped_emission_data'] = None
# failed_idx = []
# # Iterate over each row with index
# for index, row in tqdm(test_df.iterrows()):
#     # print(index, row)
#     try:
#         # Check if data is already scrapped to resume operation
#         if pd.isna(row['scrapped_emission_data']) or row['scrapped_emission_data'] is None:
#             # Apply the get_emission_data function and store in the DataFrame
#             test_df.at[index, 'scrapped_emission_data'] = get_emission_data(row['listing_id'], row['title'])
#     except Exception as e:
#         print(e)
#         failed_idx.append(index)
#     # Save progress every few rows to a file 
#     if index % 100 == 0:  
#         test_df.to_csv("progress_test.csv", index=False)

# # Save final progress after the loop completes
# test_df.to_csv("final_scrapped_emission_test_data.csv", index=False)

### ARF Imputer

In [25]:
from sklearn.impute import KNNImputer, SimpleImputer
pd.set_option('display.max_columns', None)
from utils.melissa_utils import generic_one_hotencoding, vehicle_type_fit_transform, vehicle_type_fit
from utils.kanhon_utils import LTADataImputer, EmissionImputer
import os

##### Encode vehicle type

In [26]:
# Encode `type_of_vehicle` on the training split and keep the returned encoder
# so the validation split is encoded with the same fitted object.
# NOTE(review): in the displayed outputs train_df's index changes from the
# shuffled split labels to 0..n-1 after this cell, so this helper appears to
# reset the index — confirm, since later index-aligned assignments depend on it.
train_df , vehicle_type_encoder = vehicle_type_fit_transform(train_df, "type_of_vehicle")
val_df = vehicle_type_fit(val_df, vehicle_type_encoder)
# test_df = vehicle_type_fit(test_df, vehicle_type_encoder)

In [27]:
# Select features relevant for imputation
features = ['manufactured', 'reg_date_year', 'omv', 'arf', 'type_of_vehicle_bus/mini bus', 'type_of_vehicle_hatchback',
       'type_of_vehicle_luxury sedan', 'type_of_vehicle_mid-sized sedan',
       'type_of_vehicle_mpv', 'type_of_vehicle_others',
       'type_of_vehicle_sports car', 'type_of_vehicle_stationwagon',
       'type_of_vehicle_suv', 'type_of_vehicle_truck', 'type_of_vehicle_van']

# Extract the subset of data for imputation
impute_df = train_df[features]

# Initialize KNNImputer
# NOTE(review): neighbours are found on the raw feature values, so
# large-magnitude columns such as omv likely dominate the distance —
# consider scaling before imputing.
imputer = KNNImputer(n_neighbors=5)

# Perform imputation (returns a plain array, so the row labels are lost here)
imputed_array = imputer.fit_transform(impute_df)

# Rebuild a DataFrame that carries impute_df's own index. Column assignment
# below aligns on index labels, so a default 0..n-1 index would silently
# misalign (or produce NaN) whenever train_df is not already indexed 0..n-1,
# e.g. straight after the shuffled train/validation split.
imputed_df = pd.DataFrame(imputed_array, columns=features, index=impute_df.index)

# Add the imputed 'arf' values back to the original DataFrame
train_df['arf_impute'] = imputed_df['arf']

# Display the DataFrame with imputed values
print("\nDataFrame with Imputed ARF Values:")
print(train_df[['listing_id', 'omv', 'arf', 'arf_impute']])


DataFrame with Imputed ARF Values:
       listing_id       omv       arf  arf_impute
0         1260733   37994.0    1900.0      1900.0
1         1289505   14211.0   14211.0     14211.0
2         1316182   17199.0   17199.0     17199.0
3         1310514   22799.0   23919.0     23919.0
4         1247893   21074.0   21504.0     21504.0
...           ...       ...       ...         ...
19995     1330095   27677.0   25748.0     25748.0
19996     1300273   18974.0   13974.0     13974.0
19997     1316903   16213.0    6213.0      6213.0
19998     1329593  300022.0  512040.0    512040.0
19999     1321619   91730.0   91730.0     91730.0

[20000 rows x 4 columns]


### Eco-category

If eco-category does not help predict prices, it could still be used to propose policies for the government to implement.

In [28]:
# Point the emission imputer at the scraped CO2 emission CSVs for train and test.
emission_imputer = EmissionImputer(
    train_csv_dir=r'./dataset/train_data_scrapped_co2_emission.csv',
    test_csv_dir=r'./dataset/test_data_scrapped_co2_emission.csv',
)

In [29]:
# Attach the scraped emission values to the train and test frames; the preview
# that follows shows a new `emission_data` column on train_df.
# NOTE(review): val_df is not passed through the imputer here, so it presumably
# lacks this column unless it is handled elsewhere — confirm that is intended.
train_df = emission_imputer.impute_values(df=train_df, df_type='train')
test_df = emission_imputer.impute_values(df=test_df, df_type='test')

In [30]:
# Preview the first five rows (head's default) to check the newly added columns.
train_df.head()

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,curb_weight,power,fuel_type,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price,price,reg_date_dt,reg_date_year,reg_date_month,cylinder_cnt,-,almost new car,coe car,consignment car,direct owner sale,electric cars,hybrid cars,imported used vehicle,low mileage car,opc car,parf car,premium ad car,rare & exotic,sgcarmart warranty cars,sta evaluated car,vintage cars,type_of_vehicle_bus/mini bus,type_of_vehicle_hatchback,type_of_vehicle_luxury sedan,type_of_vehicle_mid-sized sedan,type_of_vehicle_mpv,type_of_vehicle_others,type_of_vehicle_sports car,type_of_vehicle_stationwagon,type_of_vehicle_suv,type_of_vehicle_truck,type_of_vehicle_van,arf_impute,emission_data
0,1260733,Isuzu NPR85,isuzu,npr85,cheapest in the market! 14 feet full canopy wi...,2019.0,,01-jul-2020,truck,premium ad car,auto,2500.0,,diesel,2999.0,2.0,14860.0,22085,,13348.0,,37994.0,1900.0,,30-jun-2040,uncategorized,view specs of the isuzu n series diesel,,,89800.0,2020-07-01,2020,7,4.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1900.0,
1,1289505,Honda Fit 1.3A G (COE till 06/2024),honda,fit,200,2009.0,,12-aug-2009,hatchback,coe car,auto,1010.0,73.0,,1339.0,5.0,,14920,885.0,123.0,,14211.0,14211.0,,,uncategorized,view specs of the honda fit,,,700.0,2009-08-12,2009,8,4.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14211.0,
2,1316182,Toyota Sienta 1.5A,toyota,sienta,non phv. excellent condition. 100% accident fr...,2018.0,,28-may-2018,mpv,"parf car, premium ad car",auto,1325.0,79.0,,1496.0,2.0,14530.0,38001,682.0,25880.0,80346.0,17199.0,17199.0,,,uncategorized,"1.5l dohc 16v vvt-i engine, 7 speed cvt-i auto...",factory touchscreen audio system with reverse ...,,64800.0,2018-05-28,2018,5,4.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,17199.0,
3,1310514,Volvo V40 T4 Momentum,volvo,v40,"well maintained! the safety, reliable and dura...",2018.0,,30-aug-2019,hatchback,parf car,auto,1441.0,140.0,,1969.0,1.0,15770.0,36901,1176.0,35358.0,68000.0,22799.0,23919.0,,,uncategorized,powerful 2.0l 4 cylinder inline 16 valve turbo...,"keyless entry/start, cruise control, auto head...",,92800.0,2019-08-30,2019,8,4.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23919.0,128.0
4,1247893,Kia Carens 2.0A GDI,kia,carens,rare 1 owner unit! low km done! fully maintain...,2015.0,,18-nov-2015,mpv,parf car,auto,1544.0,122.0,,1999.0,1.0,15540.0,58190,1212.0,20117.0,130000.0,21074.0,21504.0,,,uncategorized,view specs of the kia carens,upgraded headunit with reverse camera! factory...,,32888.0,2015-11-18,2015,11,4.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,21504.0,184.0
