In [1]:
# Notebook bootstrap: standard-library modules first.
import os
import sys
import warnings

# Silence every warning for the remainder of the kernel session.
warnings.filterwarnings("ignore")

# Add the directory two levels above the current working directory to the
# import path. NOTE(review): presumably the repository root that holds
# `configs` and `src` — confirm against the project layout.
sys.path.append(os.path.abspath("../.."))

# These imports must stay below the sys.path change above.
from configs import GOOGLE_APPLICATION_CREDENTIALS, GCS_BUCKET_NAME, GCS_PROJECT_ID
from google.cloud import bigquery
from src.utils.io_utils import upload_to_bigquery

In [2]:
# Create the BigQuery client from the service-account JSON file referenced by
# the GOOGLE_APPLICATION_CREDENTIALS value imported from `configs`.
client = bigquery.Client.from_service_account_json(
    GOOGLE_APPLICATION_CREDENTIALS
)

In [3]:
# Pull the full training table into a pandas DataFrame.
# NOTE(review): the query has no ORDER BY, so BigQuery does not guarantee row
# order; any seeded split performed on `data` is only reproducible if the
# returned order happens to be stable — confirm or add a deterministic sort.
query = (
    "SELECT *\n"
    "FROM `khangtestdbt.xecupredict.data_train_model` "
)
query_job = client.query(query)
data = query_job.to_dataframe()

# Preview a single row to sanity-check the columns.
data.head(1)

Unnamed: 0,km,age,old_car,high_km,origin_Nhập khẩu,origin_Trong nước,body_Convertible,body_Coupe,body_Hatchback,body_MPV,...,brand_Suzuki,brand_Toyota,brand_VinFast,brand_Volkswagen,brand_Volvo,age_risk_mid,age_risk_new,age_risk_old,age_risk_very_old,price
0,-1.739179,0.678972,-0.342416,-0.375973,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,627000000.0


In [4]:
# Work on an independent (deep) copy so the frame loaded from BigQuery stays
# untouched by anything done to `df`.
df = data.copy(deep=True)

In [5]:
# Target vector: the `price` column.
y = df["price"]

# Feature matrix: every remaining column.
X = df.drop("price", axis=1)

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error
import pandas as pd

# Single seed shared by NumPy's global RNG and the train/test split.
SEED = 42

# Kept so that any later code drawing from NumPy's global RNG remains seeded.
np.random.seed(SEED)

# Hold out 20% of the rows for evaluation. The seed is passed explicitly so the
# split no longer depends on the global RNG being untouched between the
# `np.random.seed` call and this line. With the same seed value this is
# expected to reproduce the partition of the previous seed-then-split form.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

In [7]:
# Linear baselines, each constructed with scikit-learn's default settings.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
}

# One [name, R2 train, R2 test, RMSE train, RMSE test] entry per model.
rows = []

for label, estimator in models.items():
    estimator.fit(X_train, y_train)

    # Predict on the rows the model was fitted on and on the held-out rows.
    fit_pred = estimator.predict(X_train)
    holdout_pred = estimator.predict(X_test)

    rows.append(
        [
            label,
            r2_score(y_train, fit_pred),
            r2_score(y_test, holdout_pred),
            root_mean_squared_error(y_train, fit_pred),
            root_mean_squared_error(y_test, holdout_pred),
        ]
    )

result_columns = ["Model", "R2_Train", "R2_Test", "RMSE_Train", "RMSE_Test"]
df_results = pd.DataFrame(rows, columns=result_columns)

df_results


Unnamed: 0,Model,R2_Train,R2_Test,RMSE_Train,RMSE_Test
0,Linear Regression,0.471449,0.463941,199688300.0,208863200.0
1,Ridge Regression,0.47103,0.464131,199767500.0,208826200.0
2,Lasso Regression,0.471449,0.463941,199688300.0,208863200.0


In [8]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of scikit-learn's ``mean_squared_error``.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Tree-ensemble candidates. NOTE: this rebinds `models`, replacing the dict of
# linear baselines defined in the previous cell; the export cell further down
# looks up "GradientBoosting" in this dict.
models = {
    "RandomForest": RandomForestRegressor(
        n_estimators=500,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=3,
        max_features='sqrt',
        bootstrap=True,
        random_state=42,
        n_jobs=-1,
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        min_samples_split=5,
        min_samples_leaf=3,
        subsample=0.8,
        max_features='sqrt',
        random_state=42,
    ),
}

# One [name, R2 train, R2 test, RMSE train, RMSE test] entry per model.
results = []

for name, model in models.items():
    # Every estimator in `models` takes plain (X, y). The former
    # `name == "CatBoost"` branch was removed: no such key exists in `models`,
    # so it could never run.
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)
    rmse_train = rmse(y_train, y_train_pred)
    rmse_test = rmse(y_test, y_test_pred)

    results.append([name, r2_train, r2_test, rmse_train, rmse_test])

# NOTE: this rebinds `df_results`, replacing the linear-baseline table.
df_results = pd.DataFrame(results, columns=["Model", "R2_Train", "R2_Test", "RMSE_Train", "RMSE_Test"])

# Show the best model on the held-out set first.
df_results.sort_values(by="R2_Test", ascending=False)


Unnamed: 0,Model,R2_Train,R2_Test,RMSE_Train,RMSE_Test
1,GradientBoosting,0.802124,0.644201,122181600.0,170160100.0
0,RandomForest,0.672882,0.595889,157094800.0,181345300.0


In [9]:
from joblib import dump

# Destination of the serialized estimator, relative to the notebook's working
# directory.
MODEL_PATH = "../../model/gradient_boosting_model.joblib"

# The fitted gradient-boosting regressor from the tree-ensemble comparison.
gbr_model = models["GradientBoosting"]

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists (e.g. on a fresh checkout) before writing. `os` is
# imported in the first cell of the notebook.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

dump(gbr_model, MODEL_PATH)

['../../model/gradient_boosting_model.joblib']

In [10]:
# end