In [1]:
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
import sys, os
# Append the directory two levels above the current working directory to the
# import path; presumably this is the project root that provides `configs`
# and `src` (confirm against the repository layout).
sys.path.append(os.path.abspath("../.."))
from configs import GOOGLE_APPLICATION_CREDENTIALS,GCS_BUCKET_NAME,GCS_PROJECT_ID
from google.cloud import bigquery
from src.utils.io_utils import upload_to_bigquery

In [2]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [3]:
# Build the BigQuery client from the service-account JSON referenced by
# GOOGLE_APPLICATION_CREDENTIALS (imported from configs).
client = bigquery.Client.from_service_account_json(
    GOOGLE_APPLICATION_CREDENTIALS
)

In [4]:
# Fetch every column of the `data_train_model` table into a DataFrame.
query = """SELECT *
FROM `khangtestdbt.xecupredict.data_train_model` """
query_job = client.query(query)
data = query_job.to_dataframe()

# Display one row as a quick check of the column layout.
data.head(1)

Unnamed: 0,km,age,origin_nhập khẩu,origin_trong nước,body_convertible,body_coupe,body_hatchback,body_kiểu dáng khác,body_minibus,body_minivan,...,brand_suzuki,brand_toyota,brand_vinfast,brand_volkswagen,brand_volvo,age_risk_mid,age_risk_new,age_risk_old,age_risk_very_old,price
0,-1.421628,2.352775,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,125500000.0


In [5]:
# Work on an independent (deep) copy so the frame fetched from BigQuery is
# never modified by later cells.
df = data.copy(deep=True)

In [6]:
# Dimensions of the working frame as (rows, columns).
df.shape

(4820, 53)

In [7]:
# Features: every column of the working frame except the target.
X = df.drop("price", axis="columns")

# Target on a log scale: y = log(1 + price). Predictions made on this scale
# can be mapped back to prices with np.expm1.
y = df["price"].pipe(np.log1p)

In [8]:
# Seed NumPy's global RNG: scikit-learn estimators created without an explicit
# random_state draw their randomness from it.
np.random.seed(42)

# Hold out 20% of the rows for testing. random_state is passed explicitly so
# the split does not depend on the state of NumPy's global RNG.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Baseline comparison: fit each candidate with default hyper-parameters and
# collect R2 and RMSE on the train and test splits. Both metrics are computed
# on the scale of y as passed in (no back-transformation is applied here).
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    # The tree ensembles are stochastic; pin the seed so this table is
    # reproducible on re-run and uses the same random_state=42 as the
    # grid-search cells.
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}

# One record per model; assembled into a DataFrame once, after the loop.
rows = []

for name, model in models.items():
    model.fit(X_train, y_train)

    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    rows.append(
        {
            "Model": name,
            "R2_Train": r2_score(y_train, train_pred),
            "R2_Test": r2_score(y_test, test_pred),
            "RMSE_Train": root_mean_squared_error(y_train, train_pred),
            "RMSE_Test": root_mean_squared_error(y_test, test_pred),
        }
    )

df_results = pd.DataFrame(
    rows,
    columns=["Model", "R2_Train", "R2_Test", "RMSE_Train", "RMSE_Test"],
)

df_results


Unnamed: 0,Model,R2_Train,R2_Test,RMSE_Train,RMSE_Test
0,Linear Regression,0.763093,0.798128,0.402262,0.37547
1,Ridge Regression,0.762973,0.797724,0.402364,0.375846
2,Random Forest,0.977823,0.875931,0.123076,0.294353
3,Gradient Boosting,0.817542,0.819828,0.353022,0.354716


In [10]:
# Tune the random forest with an exhaustive grid search, cross-validated on
# the training split only, then report the best estimator on train and test.
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 3, 5]
}

rf_grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=rf_param_grid,
    # GridSearchCV maximises its score, so MSE is supplied in negated form.
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,  # run the fits in parallel on all available cores
    verbose=1
)

rf_grid.fit(X_train, y_train)
# With GridSearchCV's default refit=True, best_estimator_ has been refit on
# the whole of X_train using the best parameter combination.
rf_best = rf_grid.best_estimator_

y_tr_pred = rf_best.predict(X_train)
y_te_pred = rf_best.predict(X_test)

print("==== RANDOM FOREST ====")
print("Best params:", rf_grid.best_params_)
print("R2 Train:", r2_score(y_train, y_tr_pred))
print("R2 Test :", r2_score(y_test, y_te_pred))
# Same RMSE helper as the baseline comparison, instead of a hand-rolled
# np.sqrt(mean_squared_error(...)).
print("RMSE Train:", root_mean_squared_error(y_train, y_tr_pred))
print("RMSE Test :", root_mean_squared_error(y_test, y_te_pred))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
==== RANDOM FOREST ====
Best params: {'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 300}
R2 Train: 0.9499990454965369
R2 Test : 0.8756645111712412
RMSE Train: 0.18480305137832273
RMSE Test : 0.29466954991968713


In [11]:
# Tune gradient boosting with an exhaustive grid search, cross-validated on
# the training split only, then report the best estimator on train and test.
gb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5]
}

gb_grid = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    param_grid=gb_param_grid,
    # GridSearchCV maximises its score, so MSE is supplied in negated form.
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,  # run the fits in parallel on all available cores
    verbose=1
)

gb_grid.fit(X_train, y_train)
# With GridSearchCV's default refit=True, best_estimator_ has been refit on
# the whole of X_train using the best parameter combination.
gb_best = gb_grid.best_estimator_

# NOTE: these names are also assigned by the random forest cell; after this
# cell runs they hold the gradient boosting predictions.
y_tr_pred = gb_best.predict(X_train)
y_te_pred = gb_best.predict(X_test)

print("==== GRADIENT BOOSTING ====")
print("Best params:", gb_grid.best_params_)
print("R2 Train:", r2_score(y_train, y_tr_pred))
print("R2 Test :", r2_score(y_test, y_te_pred))
# Same RMSE helper as the baseline comparison, instead of a hand-rolled
# np.sqrt(mean_squared_error(...)).
print("RMSE Train:", root_mean_squared_error(y_train, y_tr_pred))
print("RMSE Test :", root_mean_squared_error(y_test, y_te_pred))


Fitting 5 folds for each of 27 candidates, totalling 135 fits
==== GRADIENT BOOSTING ====
Best params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
R2 Train: 0.9121599633522441
R2 Test : 0.8733336185095901
RMSE Train: 0.24494368345837245
RMSE Test : 0.2974187806282964


* Save model: persist the tuned Random Forest (`rf_best`) to disk with joblib.

In [12]:
# Persist the tuned random forest to disk.
MODEL_PATH = "../../model/random_forest_best.joblib"

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# joblib.dump returns the list of file names it wrote; as the last expression
# of the cell that list is displayed as the cell output.
joblib.dump(rf_best, MODEL_PATH)


['../../model/random_forest_best.joblib']

In [13]:
# end