In [1]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so every warning
# category is silenced from here on, including any warnings the estimators
# fitted later might raise. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Read data_cleaned.csv into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data_cleaned.csv")
df.head(n=5)

Unnamed: 0,Name,Year,Body_Type,Origin,Province,District,Transmission,Fuel_Type,Brand,price_num,age,mileage_num
0,Mercedes-Benz GLC 300 2020,2020,SUV,Nhập khẩu,Tp.HCM,Quận 7,Số tự động,Máy xăng,Mercedes-Benz,1079000000,5,49890
1,Hyundai Stargazer 1.5 AT Cao cấp 2022,2022,MPV,Trong nước,Vĩnh Phúc,Phúc Yên,Số tự động,Máy xăng,Hyundai,510000000,3,70000
2,Mazda CX-5 2.0 Luxury 2024,2024,SUV,Trong nước,Hà Nội,Cầu Giấy,Số tự động,Máy xăng,Mazda,770000000,1,20000
3,Mercedes-Benz GLC 250 4Matic 2017,2017,SUV,Trong nước,Tp.HCM,Quận 7,Số tự động,Máy xăng,Mercedes-Benz,899000000,8,62000
4,Honda HR-V L 2019,2019,Crossover,Nhập khẩu,Vĩnh Phúc,Phúc Yên,Số tự động,Máy xăng,Honda,550000000,6,87000


In [3]:
# Remove the "Name" and "Year" columns before modelling.
# errors="ignore" keeps this cell safe to re-run: because `df` is rebound to
# the result, a second execution would otherwise raise KeyError for columns
# that are already gone.
df = df.drop(columns=["Name", "Year"], errors="ignore")

In [4]:
# Print column dtypes, non-null counts and memory usage of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1342 entries, 0 to 1341
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Body_Type     1342 non-null   object
 1   Origin        1342 non-null   object
 2   Province      1342 non-null   object
 3   District      1342 non-null   object
 4   Transmission  1342 non-null   object
 5   Fuel_Type     1342 non-null   object
 6   Brand         1342 non-null   object
 7   price_num     1342 non-null   int64 
 8   age           1342 non-null   int64 
 9   mileage_num   1342 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 105.0+ KB


In [5]:
# Target vector: the "price_num" column.
y = df["price_num"]

# Feature matrix: every column except the target.
X = df.drop(columns="price_num")

In [6]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numerical.
categorical_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if col not in categorical_cols and X[col].dtype in ['float64', 'int64']]

# NOTE(review): the encoder and the scaler are both fitted on the full dataset,
# i.e. before the train/test split, so held-out rows influence the learned
# categories and min/max ranges. Consider fitting them on the training split
# only (e.g. inside a Pipeline) to avoid leakage.

# One-hot encode the categorical columns. The encoder is created exactly once
# (it used to be constructed twice, with the first instance thrown away).
# sparse_output=False returns a dense array so it can be wrapped in a DataFrame.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = ohe.fit_transform(X[categorical_cols])
cat_feature_names = ohe.get_feature_names_out(categorical_cols)
X_cat_df = pd.DataFrame(X_cat, columns=cat_feature_names, index=X.index)

# Rescale the numerical columns with MinMaxScaler.
scaler = MinMaxScaler()
X_num = scaler.fit_transform(X[numerical_cols])
X_num_df = pd.DataFrame(X_num, columns=numerical_cols, index=X.index)

# Final design matrix: scaled numerical columns followed by the one-hot columns,
# aligned on the original row index.
X_encoded = pd.concat([X_num_df, X_cat_df], axis=1)
X_encoded.shape

(1342, 181)

In [7]:
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG. This is kept because estimators constructed later
# without an explicit random_state fall back to the global RNG.
np.random.seed(42)

# Hold out 20% of the rows for testing. The seed is passed explicitly so the
# split no longer depends on how much of the global random stream has been
# consumed before this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score


# Baseline comparison: each regressor is fitted with its default settings on
# the training split and scored (MSE and R^2) on the held-out split.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor(),
}

results = {}

for label in models:
    estimator = models[label]
    estimator.fit(X_train, y_train)
    held_out_pred = estimator.predict(X_test)
    results[label] = {
        "MSE": mean_squared_error(y_test, held_out_pred),
        "R2": r2_score(y_test, held_out_pred),
    }

# One summary line per model, in insertion order.
for label, scores in results.items():
    print(f"{label}: R2 Score = {scores['R2']:.4f}, MSE = {scores['MSE']:.2f}")

Linear Regression: R2 Score = 0.7403, MSE = 37877421764346768.00
Ridge Regression: R2 Score = 0.7458, MSE = 37073520980958808.00
Lasso Regression: R2 Score = 0.7403, MSE = 37873416851261112.00
Random Forest: R2 Score = 0.8027, MSE = 28777446461583536.00


In [9]:
# Fresh estimators with default hyperparameters, one per model family.
LR, Rid, Las, Rfr = (
    LinearRegression(),
    Ridge(),
    Lasso(),
    RandomForestRegressor(),
)

In [10]:
# Fit linear regression on the training split, then print the 5-fold
# cross-validation scores over the full encoded dataset and their mean.
LR.fit(X_train, y_train)

cvs = cross_val_score(estimator=LR, X=X_encoded, y=y, cv=5)
print(cvs, cvs.mean(), sep="\n")

[0.6882551  0.72671046 0.74043919 0.71414574 0.6463262 ]
0.7031753374794998


In [11]:
# Fit Ridge on the training split, then print the 5-fold cross-validation
# scores over the full encoded dataset and their mean.
Rid.fit(X_train, y_train)

cvs = cross_val_score(estimator=Rid, X=X_encoded, y=y, cv=5)
print(cvs, cvs.mean(), sep="\n")

[0.70460951 0.72223965 0.74941082 0.72605553 0.68962927]
0.7183889549529442


In [12]:
# Fit Lasso on the training split, then print the 5-fold cross-validation
# scores over the full encoded dataset and their mean.
Las.fit(X_train, y_train)

cvs = cross_val_score(estimator=Las, X=X_encoded, y=y, cv=5)
print(cvs, cvs.mean(), sep="\n")

[0.6889305  0.72613561 0.73937919 0.71035714 0.64474061]
0.7019086112187868


In [13]:
# Fit the random forest on the training split, then print the 5-fold
# cross-validation scores over the full encoded dataset and their mean.
Rfr.fit(X_train, y_train)

cvs = cross_val_score(estimator=Rfr, X=X_encoded, y=y, cv=5)
print(cvs, cvs.mean(), sep="\n")

[0.77001157 0.76518456 0.84215258 0.75286215 0.76496069]
0.7790343095197488


In [19]:
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths for Ridge.
params_ridge = {'alpha': [0.01, 0.1, 1, 10, 100, 1000]}

# Exhaustive 5-fold search over the alphas, scored by negative MSE.
grid_ridge = GridSearchCV(
    estimator=Rid,
    param_grid=params_ridge,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_ridge.fit(X_train, y_train)


In [20]:
# The search is already fitted where `grid_ridge` is created; fitting it again
# here only repeated the entire grid search. Just read the selected setting.
grid_ridge.best_params_

{'alpha': 1}

In [21]:
# Build the final Ridge model from the hyperparameters selected by the grid
# search instead of a hard-coded copy, so this cell cannot go stale when the
# search grid or the data changes.
ridge_fix = Ridge(**grid_ridge.best_params_)

ridge_fix.fit(X_train, y_train)

# Score on the held-out test split (the regressor's default score, R^2).
ridge_fix.score(X_test, y_test)

0.7457918217277967

In [15]:
from sklearn.model_selection import GridSearchCV


# Hyperparameter grid for the random forest (3 * 4 * 3 * 3 combinations).
params_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 3-fold search scored by negative MSE; n_jobs=-1 runs the fits in
# parallel on all available cores.
grid_rf = GridSearchCV(
    estimator=Rfr,
    param_grid=params_rf,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)


In [16]:
# Show the hyperparameter combination the random forest grid search selected.
grid_rf.best_params_

{'max_depth': 20,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

In [17]:
# Build the final random forest from the hyperparameters selected by the grid
# search instead of hand-copied values. The forest is stochastic, so a re-run
# of the search may pick a different combination; reading best_params_ keeps
# this cell in sync with it.
rfr_fix = RandomForestRegressor(**grid_rf.best_params_)

rfr_fix.fit(X_train, y_train)

# Score on the held-out test split (the regressor's default score, R^2).
rfr_fix.score(X_test, y_test)

0.8028575541318137

In [22]:
import os
from joblib import dump


# Make sure the output directory exists (no error if it already does).
os.makedirs(name="model", exist_ok=True)

# Persist the tuned random forest together with the fitted encoder and scaler.
dump(value=rfr_fix, filename="model/random_forest_model_1.joblib")
dump(value=ohe, filename="model/onehot_encoder.pkl")
dump(value=scaler, filename="model/scaler.pkl")


['model/scaler.pkl']