In [38]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [39]:
df = pd.read_csv("data_cleaned.csv")
df.head()

Unnamed: 0,Name,Year,Body_Type,Origin,Province,District,Transmission,Fuel_Type,Brand,price_num,age,mileage_num
0,Mercedes-Benz E250 2023,2023,Sedan,Nhập khẩu,Hà Nội,Nam Từ Liêm,Số tự động,Máy xăng,Mercedes-Benz,870000000,2,45000
1,Toyota Camry 2.4G 2012,2012,Sedan,Trong nước,Hà Nội,Other,Số tự động,Máy xăng,Toyota,360000000,13,120000
2,Ford Territory Titanium X 2023,2023,SUV,Nhập khẩu,Tp.HCM,Quận 10,Số tự động,Máy xăng,Ford,770000000,2,27000
3,Mazda CX-5 2022,2022,SUV,Trong nước,Hà Nội,Nam Từ Liêm,Số tự động,Máy xăng,Mazda,740000000,3,30000
4,Toyota Previa 2010,2010,Van/Minivan,Nhập khẩu,Tp.HCM,Quận 10,Số tự động,Máy xăng,Toyota,650000000,15,50000


In [40]:
df = df.drop(["Name","Year"],axis=1)

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647 entries, 0 to 646
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Body_Type     647 non-null    object
 1   Origin        647 non-null    object
 2   Province      647 non-null    object
 3   District      647 non-null    object
 4   Transmission  647 non-null    object
 5   Fuel_Type     647 non-null    object
 6   Brand         647 non-null    object
 7   price_num     647 non-null    int64 
 8   age           647 non-null    int64 
 9   mileage_num   647 non-null    int64 
dtypes: int64(3), object(7)
memory usage: 50.7+ KB


In [42]:
X = df.drop(["price_num"],axis=1)

y = df["price_num"]

In [43]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')


categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if col not in categorical_cols and X[col].dtype in ['float64', 'int64']]


ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = ohe.fit_transform(X[categorical_cols])
cat_feature_names = ohe.get_feature_names_out(categorical_cols)
X_cat_df = pd.DataFrame(X_cat, columns=cat_feature_names, index=X.index)


scaler = MinMaxScaler()
X_num = scaler.fit_transform(X[numerical_cols])
X_num_df = pd.DataFrame(X_num, columns=numerical_cols, index=X.index)


X_encoded = pd.concat([X_num_df, X_cat_df], axis=1)
X_encoded.shape

(647, 126)

In [44]:
from sklearn.model_selection import train_test_split

np.random.seed(42)

X_train, X_test, y_train, y_test =  train_test_split(X_encoded,
                                                     y,
                                                     test_size=0.2)

In [45]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score


models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor()
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MSE": mse, "R2": r2}


for name, res in results.items():
    print(f"{name}: R2 Score = {res['R2']:.4f}, MSE = {res['MSE']:.2f}")

Linear Regression: R2 Score = 0.6843, MSE = 66847332489984600.00
Ridge Regression: R2 Score = 0.7258, MSE = 58068523628798648.00
Lasso Regression: R2 Score = 0.6914, MSE = 65350765346962896.00
Random Forest: R2 Score = 0.7168, MSE = 59959910640311568.00


In [46]:
LR = LinearRegression()
Rid = Ridge()
Las = Lasso()
Rfr = RandomForestRegressor()

In [47]:
LR.fit(X_train,y_train)

cvs = cross_val_score(LR, X_encoded, y, cv=5)
print(cvs)
print(cvs.mean())

[0.59499133 0.67987464 0.59235901 0.72561919 0.61622764]
0.6418143611250299


In [48]:
Rid.fit(X_train,y_train)

cvs = cross_val_score(Rid, X_encoded, y, cv=5)
print(cvs)
print(cvs.mean())

[0.61641658 0.72826586 0.65316781 0.71418451 0.6429577 ]
0.6709984928789324


In [49]:
Las.fit(X_train,y_train)

cvs = cross_val_score(Las, X_encoded, y, cv=5)

print(cvs)
print(cvs.mean())

[0.63124982 0.67995692 0.5940217  0.72192199 0.60862679]
0.6471554452406288


In [50]:
Rfr.fit(X_train,y_train)

cvs = cross_val_score(Rfr, X_encoded, y, cv=5)

print(cvs)
print(cvs.mean())

[0.62796584 0.72223601 0.68513287 0.71736219 0.67830428]
0.6862002395191802


In [51]:
from sklearn.model_selection import GridSearchCV

params_ridge = {
    'alpha': [0.01, 0.1, 1, 10, 100, 1000]
}

grid_ridge = GridSearchCV(Rid, params_ridge, cv=5, scoring='neg_mean_squared_error')
grid_ridge.fit(X_train, y_train)


In [52]:
grid_ridge.fit(X_train, y_train)
grid_ridge.best_params_

{'alpha': 1}

In [54]:
ridge_fix = Ridge(alpha=1)

ridge_fix.fit(X_train,y_train)

ridge_fix.score(X_test,y_test)

0.7257567804830155

In [55]:
params_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_rf = GridSearchCV(Rfr, params_rf, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid_rf.fit(X_train, y_train)


In [56]:
grid_rf.fit(X_train, y_train)
grid_rf.best_params_

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 100}

In [58]:
rfr_fix = RandomForestRegressor(
    max_depth = 20,
    min_samples_leaf = 1,
    min_samples_split = 2,
    n_estimators = 100
)

rfr_fix.fit(X_train,y_train)

rfr_fix.score(X_test,y_test)

0.7144869164626821