In [1]:
# Silence every warning category for the rest of the session.
import warnings
warnings.filterwarnings("ignore")
import sys, os
# Put the directory two levels above the working directory on the import path;
# presumably that is the project root providing `configs` and `src` -- TODO confirm.
sys.path.append(os.path.abspath("../.."))
from configs import GOOGLE_APPLICATION_CREDENTIALS,GCS_BUCKET_NAME,GCS_PROJECT_ID
from google.cloud import bigquery
from src.utils.io_utils import upload_to_bigquery

In [2]:
import pandas as pd
import numpy as np

In [3]:
# BigQuery client authenticated with the service-account key file named in configs.
client = bigquery.Client.from_service_account_json(GOOGLE_APPLICATION_CREDENTIALS)

# Fully-qualified destination table, assembled as <project>.<dataset>.<table>.
# NOTE(review): the dataset segment is read from GCS_BUCKET_NAME -- confirm that is intended.
table_id = "{}.{}.data_train_model".format(GCS_PROJECT_ID, GCS_BUCKET_NAME)

In [4]:
# Load the `data_done` table into a DataFrame. The table is addressed through
# the same config values that build `table_id`, so the source and destination
# stay in the same project/dataset when the configuration changes.
source_table = f"{GCS_PROJECT_ID}.{GCS_BUCKET_NAME}.data_done"
query = f"""SELECT *
FROM `{source_table}`"""
data = client.query(query).to_dataframe()

# Show one row as a quick check that the load worked.
data.head(1)

Unnamed: 0,km,origin,body,fuel,brand,age,price
0,105.0,Trong Nước,Minivan,Xăng,Toyota,14.0,205000000.0


In [5]:
# Work on a copy so the frame returned by the query (`data`) is not modified
# by the feature columns added below.
df = data.copy()

In [6]:
# (rows, columns) of the working frame.
df.shape

(11305, 7)

In [7]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,km,origin,body,fuel,brand,age,price
0,105.0,Trong Nước,Minivan,Xăng,Toyota,14.0,205000000.0
1,107.0,Trong Nước,Minivan,Xăng,Toyota,15.0,235000000.0
2,110.0,Nhập Khẩu,Hatchback,Xăng,Hyundai,11.0,232000000.0
3,110.0,Nhập Khẩu,Sedan,Xăng,Toyota,18.0,295000000.0
4,112.0,Nhập Khẩu,Mui Trần,Xăng,Mini,20.0,365000000.0


In [8]:
# Bucket vehicle age into four labelled bands. pd.cut uses right-closed
# intervals by default, so the bands are (-1, 5], (5, 10], (10, 15], (15, 100];
# ages outside that range (or missing) become NaN.
age_bin_edges = [-1, 5, 10, 15, 100]
age_bin_labels = ["New", "Young", "Mid", "Old"]
df["age_group"] = pd.cut(df["age"], bins=age_bin_edges, labels=age_bin_labels)

In [9]:
# Usage intensity: km divided by (age + 1); the +1 keeps the divisor non-zero when age is 0.
df["km_per_year"] = df["km"].div(df["age"].add(1))

# log(1 + age).
df["log_age"] = np.log1p(df["age"])

# Keep the five most frequent body styles; every other value is relabelled "Other".
body_counts = df["body"].value_counts()
top_body = body_counts.nlargest(5).index
is_top_body = df["body"].isin(top_body)
df["body_group"] = df["body"].mask(~is_top_body, "Other")


In [10]:
# Keep the ten most frequent brands; every other value is relabelled "Other".
brand_counts = df["brand"].value_counts()
top_brand = brand_counts.nlargest(10).index
is_top_brand = df["brand"].isin(top_brand)
df["brand_group"] = df["brand"].mask(~is_top_brand, "Other")


In [11]:
# True where origin is "Nhập Khẩu". The flag is stored with object dtype, so the
# dtype-based column selection further down treats it as a categorical feature.
imported_flag = df["origin"].eq("Nhập Khẩu")
df["is_imported"] = imported_flag.astype(object)

# Interaction label "<is_imported>_<age_group>", e.g. "True_Old".
flag_text = df["is_imported"].astype(str)
age_group_text = df["age_group"].astype(str)
df["imported_age"] = flag_text.str.cat(age_group_text, sep="_")

In [12]:
# Preview the frame with the engineered columns appended.
df.head()

Unnamed: 0,km,origin,body,fuel,brand,age,price,age_group,km_per_year,log_age,body_group,brand_group,is_imported,imported_age
0,105.0,Trong Nước,Minivan,Xăng,Toyota,14.0,205000000.0,Mid,7.0,2.70805,Minivan,Toyota,False,False_Mid
1,107.0,Trong Nước,Minivan,Xăng,Toyota,15.0,235000000.0,Mid,6.6875,2.772589,Minivan,Toyota,False,False_Mid
2,110.0,Nhập Khẩu,Hatchback,Xăng,Hyundai,11.0,232000000.0,Mid,9.166667,2.484907,Hatchback,Hyundai,True,True_Mid
3,110.0,Nhập Khẩu,Sedan,Xăng,Toyota,18.0,295000000.0,Old,5.789474,2.944439,Sedan,Toyota,True,True_Old
4,112.0,Nhập Khẩu,Mui Trần,Xăng,Mini,20.0,365000000.0,Old,5.333333,3.044522,Other,Other,True,True_Old


In [13]:
# Split the frame: X_ keeps `price` as a one-column DataFrame,
# X keeps every remaining column as the feature set.
X_ = df.loc[:, ["price"]]
X = df.drop(columns="price")

In [14]:
# Number of feature rows that exactly repeat an earlier row.
X.duplicated().sum()

np.int64(0)

In [15]:
# Column dtypes and non-null counts; the dtypes decide which columns get
# one-hot encoded and which get scaled in the encoding step.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11305 entries, 0 to 11304
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   km            11305 non-null  float64 
 1   origin        11305 non-null  object  
 2   body          11305 non-null  object  
 3   fuel          11305 non-null  object  
 4   brand         11305 non-null  object  
 5   age           11305 non-null  float64 
 6   age_group     11305 non-null  category
 7   km_per_year   11305 non-null  float64 
 8   log_age       11305 non-null  float64 
 9   body_group    11305 non-null  object  
 10  brand_group   11305 non-null  object  
 11  is_imported   11305 non-null  object  
 12  imported_age  11305 non-null  object  
dtypes: category(1), float64(4), object(8)
memory usage: 1.0+ MB


In [16]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Select columns by dtype family instead of comparing against dtype-name
# strings, so numeric columns that are not exactly float64 (int64, float32, ...)
# are scaled rather than silently left out of the feature matrix.
feature_frame = X.drop(columns=X_.columns, errors="ignore")
categorical_cols = feature_frame.select_dtypes(include=["object", "category"]).columns.tolist()
numerical_cols = feature_frame.select_dtypes(include="number").columns.tolist()

# Every feature must fall into one of the two groups; anything else (bool,
# datetime, ...) would otherwise be dropped from the output without notice.
unhandled_cols = [col for col in feature_frame.columns
                  if col not in categorical_cols and col not in numerical_cols]
if unhandled_cols:
    raise ValueError(f"Columns with unsupported dtypes: {unhandled_cols}")

# NOTE(review): the encoder and the scaler are fitted on the full dataset. If a
# train/test split happens downstream, their statistics include the held-out
# rows -- confirm this is acceptable.

# One-hot encode the categorical features; with handle_unknown='ignore',
# categories not seen during fit are encoded as all zeros at transform time.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = ohe.fit_transform(X[categorical_cols])
cat_feature_names = ohe.get_feature_names_out(categorical_cols)
X_cat_df = pd.DataFrame(X_cat, columns=cat_feature_names, index=X.index)

# Standardise the numeric features.
scaler = StandardScaler()
X_num = scaler.fit_transform(X[numerical_cols])
X_num_df = pd.DataFrame(X_num, columns=numerical_cols, index=X.index)

# Final layout: scaled numerics, one-hot columns, then the untouched target frame.
X_encoded = pd.concat([X_num_df, X_cat_df, X_], axis=1)
X_encoded.shape

(11305, 82)

In [17]:
# Preview the encoded frame (scaled numerics, one-hot columns, price).
X_encoded.head()

Unnamed: 0,km,age,km_per_year,log_age,origin_Nhập Khẩu,origin_Trong Nước,body_Convertible,body_Coupe,body_Hatchback,body_Kiểu Dáng Khác,...,is_imported_True,imported_age_False_Mid,imported_age_False_New,imported_age_False_Old,imported_age_False_Young,imported_age_True_Mid,imported_age_True_New,imported_age_True_Old,imported_age_True_Young,price
0,-1.376184,1.293177,-1.791607,1.19871,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,205000000.0
1,-1.376144,1.476946,-1.791665,1.28732,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,235000000.0
2,-1.376085,0.74187,-1.791207,0.892337,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,232000000.0
3,-1.376085,2.028253,-1.79183,1.523267,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,295000000.0
4,-1.376045,2.395791,-1.791914,1.66068,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,365000000.0


In [18]:
from joblib import dump

# Persist the fitted transformers so the same encoding and scaling can be
# reapplied later. Create the target directory first: dump() does not create
# missing parent directories and would fail on a fresh checkout.
MODEL_DIR = "../../model"
os.makedirs(MODEL_DIR, exist_ok=True)

dump(ohe, f"{MODEL_DIR}/onehot_encoder.pkl")
dump(scaler, f"{MODEL_DIR}/scaler.pkl")

['../../model/scaler.pkl']

In [19]:
# Send the encoded frame to the destination table. if_exists="replace"
# presumably overwrites an existing table -- confirm against upload_to_bigquery.
upload_to_bigquery(
    X_encoded,
    table_id=table_id,
    if_exists="replace",
)

✅ Uploaded 11305 rows to khangtestdbt.xecupredict.data_train_model


# end