In [119]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


In [120]:
# Load the processed car table; the first CSV column is used as the row index.
cars = pd.read_csv(
    './data/processed_cars.csv',
    index_col=0,
)

In [121]:
def _count_by(frame, group_col, counted_col='year'):
    """Count the non-null ``counted_col`` entries for each ``group_col`` value.

    Only the one column that is kept gets counted, rather than counting every
    column of the frame and discarding all but one.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to summarise.
    group_col : str
        Column whose distinct values define the groups.
    counted_col : str, default 'year'
        Column whose non-null entries are counted; it must differ from
        ``group_col``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``group_col`` and ``counted_col`` (holding the counts).
    """
    return frame.groupby(group_col)[counted_col].count().reset_index()


quantity_by_fuel = _count_by(cars, 'fuels')
quantity_by_origin = _count_by(cars, 'origin')
quantity_by_transmission = _count_by(cars, 'transmission')
quantity_by_type = _count_by(cars, 'type')
# 'year' is the grouping key here, so a different column has to be counted.
quantity_by_year = _count_by(cars, 'year', counted_col='transmission')

Unnamed: 0,fuels,year
0,diesel,4047
1,electric,12
2,gasoline,24710
3,hybrid,188


### Drop columns -> Data integration -> Data cleaning -> Data transformation

### Data integration

### Data cleaning (remove missing values: None, NA; fill missing values using the source URL)

In [109]:
# How many rows carry each distinct model value.
cars['model'].value_counts()

Innova          1345
Morning         1326
Vios            1304
Ranger          1029
i10              919
                ... 
Excelle            1
LaCrosse           1
Mulsanne           1
Continental        1
Fengxing CM7       1
Name: model, Length: 417, dtype: int64

In [110]:
# How many rows carry each distinct body-type value.
cars['type'].value_counts()

sedan          10548
suv             8177
hatchback       4000
crossover       2841
pickup          1735
van             1445
coupe            119
convertible       86
wagon              6
Name: type, dtype: int64

https://www.kaggle.com/code/vbmokin/used-cars-price-prediction-by-15-models
https://towardsdatascience.com/used-car-price-prediction-using-machine-learning-e3be02d977b2

In [111]:
# TODO(cleaning): drop outlier rows where km_driven is 0 or 1.
# NOTE(review): 0.0 is the most frequent value in the recorded output, so
# dropping it removes a large share of the rows - confirm this is intended.
cars['km_driven'].value_counts()

0.0         16700
50000.0       436
60000.0       425
80000.0       378
30000.0       348
            ...  
67262.0         1
118268.0        1
96218.0         1
82668.0         1
41618.0         1
Name: km_driven, Length: 1522, dtype: int64

In [112]:
# TODO(cleaning): drop rows whose colour is none or "khác" (Vietnamese: "other").
cars['external_color'].value_counts()

Trắng        8577
Đen          5530
Bạc          4050
Đỏ           3424
Xanh         2389
Xám          1222
Vàng          903
Nâu           728
Ghi           712
Cát           665
Cam           359
Đồng          181
Hồng           76
Kem            49
Nhiều màu      42
Màu khác       27
Tím            23
Name: external_color, dtype: int64

In [113]:
# TODO(cleaning): drop rows where seats is none or 0, or look the value up
# through the row's source URL.
cars['seats'].value_counts()

5     15761
4     12874
2       215
3        77
6        20
45        2
54        2
7         1
50        1
1         1
56        1
40        1
8         1
Name: seats, dtype: int64

In [114]:
# TODO(cleaning): drop rows where fuels is NONE, 0 or None, or look the value
# up through the row's source URL.
cars['fuels'].value_counts()

gasoline    24710
diesel       4047
hybrid        188
electric       12
Name: fuels, dtype: int64

In [115]:
# TODO(cleaning): drop rows where transmission is None or 0, or look the value
# up through the row's source URL.
cars['transmission'].value_counts()

automatic    20783
manual        8174
Name: transmission, dtype: int64

In [116]:
# TODO(cleaning): drop price outliers.
cars['price'].value_counts()

630000000     181
550000000     165
450000000     160
495000000     146
395000000     145
             ... 
3339000000      1
892000000       1
673000000       1
751000000       1
2840000000      1
Name: price, Length: 2087, dtype: int64

In [117]:
# TODO(cleaning): drop rows where year is None.
cars['year'].value_counts()

2021    6760
2019    2333
2018    2112
2016    2100
2017    1762
2022    1761
2015    1563
2020    1512
2009    1186
2010    1109
2014    1099
2011     983
2008     864
2013     658
2012     635
2007     622
2005     360
2004     349
2003     290
2006     228
2002     157
2001     128
2000     103
1995      37
1997      37
1993      31
1999      31
1992      29
1998      27
1996      27
1994      24
1991      22
1990      18
Name: year, dtype: int64

### Data transformation (from categorical to numerical)

### Drop columns

In [118]:
# DataFrame.drop returns a new frame and leaves `cars` unchanged, so the result
# is bound to its own name instead of being discarded once it has been shown.
# `cars` keeps every column, so this cell can be re-run safely.
drop_columns = ['name', 'source_url']
cars_trimmed = cars.drop(columns=drop_columns)
cars_trimmed

Unnamed: 0,brand,type,origin,km_driven,external_color,seats,engine_capacity,fuels,transmission,wheel_drive,price,year,model,internal_color
0.0,acura,crossover,imported,0.0,Xanh,5,3.7,gasoline,automatic,AWD,1280000000,2010,ZDX,Đen
1.0,acura,suv,imported,0.0,Đen,5,3.7,gasoline,automatic,AWD,600000000,2008,MDX,Ghi
2.0,acura,crossover,imported,110000.0,Trắng,5,3.7,gasoline,automatic,AWD,960000000,2010,ZDX,Nâu
3.0,acura,suv,imported,0.0,Đen,5,3.7,gasoline,automatic,AWD,650000000,2007,MDX,Đen
4.0,acura,suv,imported,10000.0,Đen,5,3.7,gasoline,automatic,AWD,590000000,2009,MDX,Đen
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28952.0,volvo,suv,imported,0.0,Xám,5,2.0,gasoline,automatic,AWD,1580000000,2016,XC60,-
28953.0,volvo,suv,imported,6000.0,Trắng,5,2.0,gasoline,automatic,AWD,3250000000,2017,XC90,Ghi
28954.0,zotye,suv,imported,100000.0,Trắng,5,2.0,gasoline,automatic,FWD,500000000,2019,Z8,Nâu
28955.0,zotye,suv,imported,38000.0,Đen,5,2.0,gasoline,automatic,FWD,495000000,2019,Z8,Nâu
