In [13]:
from utils.kanhon_utils import *
import pandas as pd
%load_ext autoreload
%autoreload 2
from IPython.display import display, HTML
from utils.lta_omv_scraper import *
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import time
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Load the raw labelled listings and the unlabelled test listings.
orig_df = pd.read_csv(r"./dataset/train.csv")
test_df = pd.read_csv(r"./dataset/test.csv")


def add_basic_features(df):
    """Return a copy of ``df`` with a cleaned ``model`` and parsed registration-date columns.

    Adds ``reg_date_dt`` (parsed from ``reg_date`` with the ``%d-%b-%Y`` format),
    ``reg_date_year`` and ``reg_date_month``. The input frame is not modified.
    """
    out = df.copy()
    # clean model: drop stray opening parentheses. The vectorised .str accessor
    # passes missing values through, whereas calling x.replace on a NaN raises.
    out['model'] = out['model'].str.replace('(', '', regex=False)
    out['reg_date_dt'] = pd.to_datetime(out['reg_date'], format="%d-%b-%Y")
    out['reg_date_year'] = out['reg_date_dt'].dt.year
    out['reg_date_month'] = out['reg_date_dt'].dt.month
    return out


# Apply the same cleaning to both frames so train and test carry identical columns.
orig_df = add_basic_features(orig_df)
test_df = add_basic_features(test_df)

In [7]:
# Hold out a fifth of the labelled rows for validation; the fixed seed makes the split repeatable.
VAL_FRACTION = 0.2
SPLIT_SEED = 42
train_df, val_df = train_test_split(
    orig_df,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
    shuffle=True,
)

### Fill missing 'make' info from title

In [202]:
# Debug helper (disabled): list the test rows whose 'make' is missing.
# idx_val = test_df[test_df['make'].isna()].index
# test_df.loc[idx_val]

List of makes obtained from:
1. https://www.carlogos.org/car-brands-a-z/
2. https://www.kaggle.com/datasets/jahaidulislam/car-specification-dataset-1945-2020

If `make` is still missing, fall back to the first word of the title.

In [203]:
# Reference list of known car makes, lower-cased.
make_df = pd.read_csv(r"./dataset/make.csv")
make_ls = (
    make_df['Make List']
    .dropna()            # a blank row in the CSV would otherwise crash on .lower()
    .str.lower()
    .drop_duplicates()   # entries differing only by case collapse to one; first-seen order is kept
    .tolist()
)

In [200]:
# One imputer built from the make list, applied unchanged to all three splits.
make_model_imputer = MakeModelImputer(make_ls)
train_df, val_df, test_df = [
    make_model_imputer.transform(split) for split in (train_df, val_df, test_df)
]

### Cylinder count extractor

In [112]:
# Run the cylinder extractor over the train and validation splits.
# NOTE(review): presumably it derives cylinder_cnt from the listing text — confirm in utils.
cylinder_count_extractor = CylinderExtractor()
train_df, val_df = [
    cylinder_count_extractor.transform(split) for split in (train_df, val_df)
]

In [113]:
# Preview the first rows of the training split after the cylinder extraction step.
train_df.head()

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,curb_weight,power,fuel_type,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price,price,reg_date_dt,reg_date_year,reg_date_month,cylinder_cnt
23311,1260733,Isuzu NPR85,isuzu,npr85,cheapest in the market! 14 feet full canopy wi...,2019.0,,01-jul-2020,truck,premium ad car,auto,2500.0,,diesel,2999.0,2.0,14860.0,22085,,13348.0,,37994.0,1900.0,,30-jun-2040,uncategorized,view specs of the isuzu n series diesel,,,89800.0,2020-07-01,2020,7,
23623,1289505,Honda Fit 1.3A G (COE till 06/2024),honda,fit,200,2009.0,,12-aug-2009,hatchback,coe car,auto,1010.0,73.0,,1339.0,5.0,,14920,885.0,123.0,,14211.0,14211.0,,,uncategorized,view specs of the honda fit,,,700.0,2009-08-12,2009,8,
1020,1316182,Toyota Sienta 1.5A,toyota,sienta,non phv. excellent condition. 100% accident fr...,2018.0,,28-may-2018,mpv,"parf car, premium ad car",auto,1325.0,79.0,,1496.0,2.0,14530.0,38001,682.0,25880.0,80346.0,17199.0,17199.0,,,uncategorized,"1.5l dohc 16v vvt-i engine, 7 speed cvt-i auto...",factory touchscreen audio system with reverse ...,,64800.0,2018-05-28,2018,5,
12645,1310514,Volvo V40 T4 Momentum,volvo,v40,"well maintained! the safety, reliable and dura...",2018.0,,30-aug-2019,hatchback,parf car,auto,1441.0,140.0,,1969.0,1.0,15770.0,36901,1176.0,35358.0,68000.0,22799.0,23919.0,,,uncategorized,powerful 2.0l 4 cylinder inline 16 valve turbo...,"keyless entry/start, cruise control, auto head...",,92800.0,2019-08-30,2019,8,4.0
1533,1247893,Kia Carens 2.0A GDI,kia,carens,rare 1 owner unit! low km done! fully maintain...,2015.0,,18-nov-2015,mpv,parf car,auto,1544.0,122.0,,1999.0,1.0,15540.0,58190,1212.0,20117.0,130000.0,21074.0,21504.0,,,uncategorized,view specs of the kia carens,upgraded headunit with reverse camera! factory...,,32888.0,2015-11-18,2015,11,


### Impute cylinder data using make, model

In [114]:
# Stage 1: project imputer (make/model based, per the section heading),
# fitted on the training split and then applied to validation.
cylinder_imputer = CylinderImputer()
train_df = cylinder_imputer.fit_transform(train_df)  # 1479 rows missing
val_df = cylinder_imputer.transform(val_df)  # 334 rows missing

# Stage 2: many rows are still missing, so fall back to the median learned from train.
imputer = SimpleImputer(strategy='median')
cylinder_cols = ['cylinder_cnt']
imputer.fit(train_df[cylinder_cols])
train_df['cylinder_cnt'] = imputer.transform(train_df[cylinder_cols])
val_df['cylinder_cnt'] = imputer.transform(val_df[cylinder_cols])


In [115]:
# Debug helper (disabled): list the training rows whose 'cylinder_cnt' is still missing.
# idx_val = train_df[train_df['cylinder_cnt'].isna()].index
# train_df.loc[idx_val]

In [116]:
# val_df.loc[idx_val]

### Impute using LTA OMV data

In [117]:
# Path of the local CSV cache for the LTA OMV data (read if present, otherwise written by the next cell).
csv_filename = r'./dataset/lta_omv_data.csv'

In [118]:
# Load the LTA OMV table from the local cache, or build it and cache it.
if os.path.exists(csv_filename):
    df_lta_car_data = pd.read_csv(csv_filename)
    # Caches written by earlier versions of this cell include the frame index as a
    # spurious 'Unnamed: 0' column; drop it so both branches yield the same columns.
    df_lta_car_data = df_lta_car_data.drop(columns=['Unnamed: 0'], errors='ignore')
else:
    # NOTE(review): the two arguments look like a year range — confirm in utils.lta_omv_scraper.
    result_ls = get_lta_omv_data(2002, 2025)
    df_lta_car_data = preprocess_lta_omv_data(result_ls)
    # index=False keeps the index out of the file so a later read_csv round-trips cleanly.
    # Assumes the preprocessed frame's index carries no information — TODO confirm.
    df_lta_car_data.to_csv(csv_filename, index=False)


In [119]:
# Build the imputer from the LTA table. This must run before the later cell that calls
# lta_data_imputer.transform(); with the line commented out, a fresh kernel raises NameError there.
lta_data_imputer = LTADataImputer(df_lta_car_data)

In [120]:
# Validation rows that have no OMV value, displayed for inspection.
omv_missing = val_df['omv'].isna()
idx_val = val_df[omv_missing].index
val_df.loc[idx_val]

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,curb_weight,power,fuel_type,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price,price,reg_date_dt,reg_date_year,reg_date_month,cylinder_cnt
23948,1226793,Mercedes-Benz 280S (COE till 08/2033),mercedes-benz,280,super rare. selling with special 'q' plate. ni...,1978.0,,31-aug-1978,luxury sedan,"coe car, rare & exotic, vintage cars",auto,,,,2746.0,3.0,9630.0,23672,,21817.0,,,,,,uncategorized,very rare collectable w116 model 280s. early m...,"upgraded 20"" sports rim. facelifted edition. c...",,88800.0,1978-08-31,1978,8,6.0
19762,948629,Mercedes-Benz 280S (COE till 04/2025),mercedes-benz,280,10,1969.0,,02-may-1969,luxury sedan,"coe car, rare & exotic, low mileage car, vinta...",manual,,,,2778.0,6.0,102030.0,7173,,625.0,32113.0,,,,,uncategorized,,,,88888.0,1969-05-02,1969,5,6.0
20975,1317873,Suzuki LJ80 (COE till 10/2030),suzuki,lj80,immaculate condition. under-utilised classic v...,1981.0,,21-oct-1981,suv,"coe car, direct owner sale, rare & exotic, low...",auto,,,,797.0,6.0,21430.0,35200,,22171.0,30000.0,,,,,uncategorized,"spare tire feature, jerrycan feature, recaro s...",spare parts are easily available in malaysia a...,,135000.0,1981-10-21,1981,10,4.0
5474,1304526,Austin Mini 1000 (COE till 04/2029),austin,mini,"well sorted resto-mod mini, with many parts re...",1975.0,,03-apr-1975,others,"coe car, premium ad car, vintage cars",manual,,,,998.0,6.0,14530.0,26175,,12772.0,,,,,,uncategorized,,"restored bodywork , refreshed engine, brand ne...",,70888.0,1975-04-03,1975,4,4.0
3277,1302459,Porsche 911SC (COE till 11/2028),porsche,911,0,1982.0,,10-mar-1983,sports car,"coe car, rare & exotic, premium ad car, vintag...",auto,,,,2994.0,6.0,65640.0,3213,,1423.0,,,,,,uncategorized,et brake light. removable targa soft top.,"original indoor cover, porsche classic communi...",,290800.0,1983-03-10,1983,3,6.0
4579,1331211,Mercedes-Benz 200 (COE till 06/2030),mercedes-benz,200,just arrived! direct sale! collection unit for...,1978.0,,26-dec-1978,luxury sedan,"coe car, rare & exotic, vintage cars",auto,,,,1988.0,6.0,9510.0,3357,,1942.0,,,,,,uncategorized,,,,55000.0,1978-12-26,1978,12,4.0
13010,1258034,Morris Minor Convertible (COE till 04/2030),morris,minor,morris minor convertible with original su seri...,1961.0,,26-apr-1961,others,"coe car, rare & exotic, premium ad car, vintag...",auto,,,,948.0,4.0,9360.0,3288,,1932.0,,,,,,uncategorized,,aircon. radio.,,55000.0,1961-04-26,1961,4,4.0
16325,1273676,Austin Mini Club (COE till 11/2027),austin,mini,another classic has turned up in our studio. d...,1980.0,,27-dec-1980,others,"coe car, premium ad car, low mileage car, vint...",manual,,,,998.0,4.0,14090.0,41994,,14543.0,102800.0,,,,,uncategorized,1.0l 3 cylinder engine. 4 speed manual. it's a...,added air conditioning. tape deck player. it's...,,48800.0,1980-12-27,1980,12,3.0
4982,1292166,Volkswagen Beetle 1200 (COE till 06/2030),volkswagen,beetle,"enthusiasts will know the time, effort maintai...",1972.0,,06-apr-1972,others,"coe car, rare & exotic, consignment car, low m...",manual,,,,1192.0,6.0,14360.0,32875,,19869.0,80000.0,,,,,uncategorized,"powered by a 1,200 cc petrol engine. transmiss...",original type 3 vw steering and accessories.,,86800.0,1972-04-06,1972,4,4.0
10042,1186350,Mercedes-Benz 280S (COE till 02/2029),mercedes-benz,280,very original and good condition. very well ke...,1980.0,,03-mar-1980,luxury sedan,"coe car, direct owner sale, low mileage car, v...",auto,,,,2746.0,4.0,24820.0,31933,,15039.0,125280.0,,,,,uncategorized,"very rare collectable w116 model 280s, mercede...","original beautiful interior, power steering, p...",,116880.0,1980-03-03,1980,3,6.0


In [121]:
# Stage 1: fill OMV through the LTA-based imputer.
train_df = lta_data_imputer.transform(train_df)  # before transform: 49 rows missing, after transform: 24 rows missing
val_df = lta_data_imputer.transform(val_df)  # before transform: 15 rows missing, after transform: 9 rows missing

# Stage 2: whatever is still missing gets the median learned from the training split.
imputer = SimpleImputer(strategy='median')
omv_cols = ['omv']
imputer.fit(train_df[omv_cols])
train_df['omv'] = imputer.transform(train_df[omv_cols])
val_df['omv'] = imputer.transform(val_df[omv_cols])

### Parse category

In [122]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None


In [123]:
# Run the project's CategoryParser over both splits.
# NOTE(review): presumably it expands the comma-separated 'category' column — confirm in utils.
category_parser = CategoryParser()
train_df = category_parser.fit_transform(train_df)  # fitted on the training split only
val_df = category_parser.transform(val_df)  # reuses what was fitted on train

In [4]:
# Preview the training split; head() avoids rendering the whole frame.
train_df.head()

NameError: name 'train_df' is not defined

### Impute CO2 Emission data from SGCarMart

In [31]:
from utils.sgcarmart_scraper import get_emission_data
from tqdm import tqdm

In [18]:
# Small sample for trying things out. .loc slicing is label-based and inclusive,
# so this keeps every row up to and including index label 50.
small_df = orig_df.loc[:50]

In [37]:
%%time
# orig_df['scrapped_emission_data'] = orig_df.apply(lambda x:get_emission_data(x['listing_id'], x['title']), axis=1)
orig_df['scrapped_emission_data'] = None
failed_idx = []
# Iterate over each row with index
for index, row in tqdm(orig_df.iterrows()):
    # print(index, row)
    try:
        # Check if data is already scrapped to resume operation
        if pd.isna(row['scrapped_emission_data']) or row['scrapped_emission_data'] is None:
            # Apply the get_emission_data function and store in the DataFrame
            orig_df.at[index, 'scrapped_emission_data'] = get_emission_data(row['listing_id'], row['title'])
    except Exception as e:
        print(e)
        failed_idx.append(index)
    # Save progress every few rows to a file 
    if index % 100 == 0:  
        orig_df.to_csv("progress.csv", index=False)

# Save final progress after the loop completes
orig_df.to_csv("final_scrapped_emission_data.csv", index=False)

9508it [4:29:55,  2.17s/it]

502 Server Error: Bad Gateway for url: https://www.sgcarmart.com/new_cars/newcars_specs.php?CarCode=12232&Subcode=7259


18091it [8:20:23,  1.24it/s]

503 Server Error: Service Unavailable for url: https://www.sgcarmart.com/used_cars/info.php?ID=1304498


25000it [11:19:39,  1.63s/it]


CPU times: total: 59min 44s
Wall time: 11h 19min 40s


In [34]:
# Display the frame, including the scrapped_emission_data column.
orig_df

Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,lifespan,eco_category,features,accessories,indicative_price,price,reg_date_dt,reg_date_year,reg_date_month,scrapped_emission_data
0,1292132,Land Rover Range Rover Velar 3.0A Si6 R-Dynami...,land rover,range,"1 owner, no repairs needed! it looks great, in...",2018.0,,08-mar-2018,suv,parf car,...,,uncategorized,3l supercharged v6 p380 engine at 375bhp/450nm...,"2 x massage/memory/cooling & warmer seat, rear...",,193788.0,2018-03-08,2018,3,UNKNOWN
1,1294696,Mercedes-Benz C-Class C200 Sport Premium Sunroof,mercedes-benz,c200,rare beautiful white c200 sport premium sunroo...,2017.0,,28-dec-2017,luxury sedan,"parf car, premium ad car",...,,uncategorized,"2.0l 4 cylinders inline turbocharged engine, p...","multi function steering, electric tailgate, re...",,96800.0,2017-12-28,2017,12,136 g/km (As tested by LTA)
2,1311717,Honda Odyssey 2.4A (COE till 09/2027),honda,odyssey,comes with warranty. full service done.,2007.0,,19-sep-2007,mpv,"coe car, premium ad car, low mileage car",...,,uncategorized,"2.4l k24a 4 cylinders inline dohc i-vtec, 5 sp...","cruise control, touchscreen audio, reverse cam...",,39800.0,2007-09-19,2007,9,unknown
3,1310068,Toyota Corolla Altis 1.6A (COE till 12/2028),toyota,altis,0,2008.0,,15-dec-2008,mid-sized sedan,"coe car, premium ad car",...,,uncategorized,super fuel efficient 1.6l 16 valves dohc vvt-i...,"leather seats, pioneer dvd audio system with r...",,44800.0,2008-12-15,2008,12,164 g/km (As tested by LTA)
4,1325280,Lexus GS300 (COE till 06/2026),lexus,gs,wear and tear done up. well maintained and reg...,2006.0,,22-dec-2006,luxury sedan,"coe car, premium ad car",...,,uncategorized,"powerful 3.0l v6 engine, 227bhp, 6 speed , key...",premium upholstery electric seats. memory seat...,,25800.0,2006-12-22,2006,12,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,1329201,Mercedes-Benz C-Class C180 (COE till 06/2031),mercedes-benz,c180,elegant looking and very well maintained by ow...,2011.0,,14-jul-2011,luxury sedan,"coe car, premium ad car, low mileage car",...,,uncategorized,inline 4 16 valves rwd engine displacement 179...,new battery new throttle body new recording ca...,,85000.0,2011-07-14,2011,7,
24996,1285898,Bentley Bentayga 4.0A V8,bentley,bentayga,"ceo car chauffeur driven, always sheltered par...",2018.0,23-aug-2018,29-apr-2020,suv,"parf car, direct owner sale, rare & exotic",...,,uncategorized,"4.0 liter twin turbocharged v8 engine, produci...","panoramic roof, 8""touchscreen display, navigat...",,412888.0,2020-04-29,2020,4,
24997,1306309,Mercedes-Benz A-Class A180,mercedes-benz,a180,great for short term drive or to renew coe. op...,2014.0,,01-apr-2015,hatchback,"parf car, premium ad car, low mileage car, sgc...",...,,uncategorized,1.6l inline 4 turbocharged engine with 7 speed...,"dual electric seats with 3 memory settings, di...",,23888.0,2015-04-01,2015,4,
24998,1306902,Audi A3 Sportback 1.0A TFSI S-tronic,audi,a3,sporty audi a3 in town with upgraded coil over...,2017.0,,30-jun-2017,hatchback,parf car,...,,uncategorized,"fuel efficient 3 cylinder turbocharged engine,...","bbs sport rim, audi mmi/bluetooth/audio sound ...",,53300.0,2017-06-30,2017,6,
