In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
# Load the raw dataset; the path is resolved relative to the current working directory.
DATA_PATH = './data.csv'
df = pd.read_csv(DATA_PATH)

In [31]:
# Remove rows whose interior colour is a placeholder rather than a concrete colour
# ('màu khác' = "other colour", 'nhiều màu' = "multi-colour").
placeholder_colors = ['-', 'màu khác', 'nhiều màu']
is_placeholder = df['internal_color'].isin(placeholder_colors)
drop = df.loc[is_placeholder].index
# Mutates `df` in place, exactly as before; rows are addressed by index label.
df.drop(index=drop, inplace=True)

In [32]:
# Remove rows whose exterior colour is a placeholder rather than a concrete colour
# ('màu khác' = "other colour", 'nhiều màu' = "multi-colour").
placeholder_colors = ['-', 'màu khác', 'nhiều màu']
is_placeholder = df['external_color'].isin(placeholder_colors)
drop = df.loc[is_placeholder].index
# Mutates `df` in place, exactly as before; rows are addressed by index label.
df.drop(index=drop, inplace=True)

In [33]:
# One-hot encode the 'type' column; drop_first=True yields k-1 indicator columns
# for k categories.
# NOTE(review): the name `type` shadows Python's built-in `type()` for the rest of
# the session. It is kept because the later concat cell refers to it by this name.
type = pd.get_dummies(data=df['type'], drop_first=True)

In [34]:
# One-hot encode the 'transmission' column; drop_first=True yields k-1 indicator
# columns for k categories.
transmission = pd.get_dummies(data=df['transmission'], drop_first=True)

In [35]:
# One-hot encode the 'fuels' column; drop_first=True yields k-1 indicator columns
# for k categories.
fuels = pd.get_dummies(data=df['fuels'], drop_first=True)

In [36]:
# One-hot encode the 'origin' column; drop_first=True yields k-1 indicator columns
# for k categories.
origin = pd.get_dummies(data=df['origin'], drop_first=True)

In [37]:
# One-hot encode the exterior colour (k-1 indicator columns via drop_first=True).
# The prefix keeps these columns distinguishable from the interior-colour
# indicators: both colour columns presumably share labels (they are filtered with
# the same placeholder list), so unprefixed dummies would produce duplicate column
# names once the encoded frames are concatenated onto `df`.
e_color = pd.get_dummies(df['external_color'], prefix='external_color', drop_first=True)

In [38]:
# One-hot encode the interior colour (k-1 indicator columns via drop_first=True).
# The prefix keeps these columns distinguishable from the exterior-colour
# indicators: both colour columns presumably share labels (they are filtered with
# the same placeholder list), so unprefixed dummies would produce duplicate column
# names once the encoded frames are concatenated onto `df`.
i_color = pd.get_dummies(df['internal_color'], prefix='internal_color', drop_first=True)

In [39]:
# One-hot encode the 'wheel_drive' column; drop_first=True yields k-1 indicator
# columns for k categories.
wheel_drive = pd.get_dummies(data=df['wheel_drive'], drop_first=True)

In [40]:
# Replace the 'brand' and 'model' labels with integer codes.
# Each column gets its own encoder so both label<->code mappings remain available
# afterwards (via `classes_` / `inverse_transform`); refitting one shared encoder
# on 'model' would overwrite the mapping learned for 'brand'.
# NOTE(review): the codes follow sorted label order, so estimators that treat them
# as magnitudes (e.g. linear regression) see an arbitrary ordering — TODO confirm
# this is intended rather than one-hot encoding.
brand_encoder = LabelEncoder()
model_encoder = LabelEncoder()
df['brand'] = brand_encoder.fit_transform(df['brand'])
df['model'] = model_encoder.fit_transform(df['model'])
# Backward-compatible alias: `le` previously ended this cell fitted on 'model'.
le = model_encoder

In [41]:
# Append every one-hot block to the right of the main frame (column-wise concat,
# aligned on the row index). The raw categorical columns are still present here.
encoded_blocks = [type, transmission, fuels, origin, e_color, i_color, wheel_drive]
df = pd.concat([df, *encoded_blocks], axis='columns')

In [42]:
# Columns removed from the frame: 'Unnamed: 0', the raw categorical columns that
# have indicator counterparts, and the 'name' / 'source_url' text columns.
drop = [
    'Unnamed: 0',
    'type',
    'transmission',
    'origin',
    'fuels',
    'external_color',
    'internal_color',
    'wheel_drive',
    'name',
    'source_url',
]
df = df.drop(drop, axis='columns')

In [43]:
# Separate the regression target ('price') from the predictor columns.
target_column = "price"
y = df[target_column]
X = df.drop(columns=[target_column])

In [45]:
# Linear regression evaluated with 5-fold cross-validation.
# The scorer is *negated* RMSE, so values are <= 0 and closer to 0 is better.
lr = LinearRegression()

lr_scores = cross_validate(
    estimator=lr,
    X=X,
    y=y,
    cv=5,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
)
# One row per fold; columns are fit/score times and test/train scores.
df_lr_scores = pd.DataFrame.from_dict(lr_scores)
# Per-column average across the folds (displayed as the cell output).
df_lr_scores.mean(axis=0)

fit_time          0.056398
score_time        0.004449
test_score    -1560.702432
train_score   -1505.867577
dtype: float64

In [51]:
# Grid-search the decision tree's depth and minimum leaf size with 5-fold CV,
# scored by negated RMSE (closer to 0 is better).
# random_state is fixed so the search is reproducible: an unseeded tree can pick
# different splits from run to run, which makes the reported best score and best
# parameters drift between executions.
model = DecisionTreeRegressor(random_state=42)
cv = KFold(n_splits=5)  # contiguous, unshuffled folds
space = {
    'max_depth': [15, 16, 17, 18, 19],
    'min_samples_leaf': [5, 10, 15],
}
search = GridSearchCV(model, space, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv)
# fit() returns the fitted search object itself, so `result` and `search` are the same object.
result = search.fit(X, y)
print(f'Best Score: {result.best_score_}')
print(f'Best Hyperparameters: {result.best_params_}')

Best Score: -1131.8591698289752
Best Hyperparameters: {'max_depth': 18, 'min_samples_leaf': 10}


In [52]:
# Cross-validate the decision tree with the hyperparameters reported by the grid
# search (max_depth=18, min_samples_leaf=10), this time also recording train scores.
# random_state is fixed so the fold scores are reproducible across runs; an
# unseeded tree gives a different score on every execution.
dtr = DecisionTreeRegressor(max_depth=18, min_samples_leaf=10, random_state=42)
dtr_scores = cross_validate(
    dtr,
    X,
    y,
    cv=5,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
)
df_dtr_scores = pd.DataFrame(dtr_scores)
# Per-column average across the folds (displayed as the cell output).
df_dtr_scores.mean()

fit_time          0.077689
score_time        0.003705
test_score    -1158.073959
train_score    -470.108199
dtype: float64

In [48]:
# Grid-search the number of trees in the random forest with 5-fold CV, scored by
# negated RMSE (closer to 0 is better).
# random_state is fixed so the search is reproducible: an unseeded forest draws
# different bootstrap samples on every run, so differences between neighbouring
# n_estimators values could be noise rather than a real effect.
model = RandomForestRegressor(random_state=42)
cv = KFold(n_splits=5)  # contiguous, unshuffled folds
space = {
    'n_estimators': [110, 120, 130],
}
search = GridSearchCV(model, space, scoring='neg_root_mean_squared_error', n_jobs=-1, cv=cv)
# fit() returns the fitted search object itself, so `result` and `search` are the same object.
result = search.fit(X, y)
print(f'Best Score: {result.best_score_}')
print(f'Best Hyperparameters: {result.best_params_}')

Best Score: -1050.3957125435911
Best Hyperparameters: {'n_estimators': 120}


In [49]:
# Cross-validate the random forest with the tree count reported by the grid search
# (n_estimators=120), this time also recording train scores.
# random_state is fixed so the fold scores are reproducible across runs; an
# unseeded forest gives a different score on every execution.
rfr = RandomForestRegressor(n_estimators=120, random_state=42)
rfr_scores = cross_validate(
    rfr,
    X,
    y,
    cv=5,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
)
df_rfr_scores = pd.DataFrame(rfr_scores)
# Per-column average across the folds (displayed as the cell output).
df_rfr_scores.mean()

fit_time          7.622758
score_time        0.104573
test_score    -1064.256013
train_score    -207.887157
dtype: float64