In [1]:
import warnings
warnings.filterwarnings("ignore")
import sys, os
sys.path.append(os.path.abspath("../.."))
from configs import GOOGLE_APPLICATION_CREDENTIALS,GCS_BUCKET_NAME,GCS_PROJECT_ID
from google.cloud import bigquery
from src.utils.io_utils import upload_to_bigquery
from clean_utils import *

In [2]:
client = bigquery.Client.from_service_account_json(GOOGLE_APPLICATION_CREDENTIALS)
table_id = f"{GCS_PROJECT_ID}.{GCS_BUCKET_NAME}.data_train_model"
table_id_done = f"{GCS_PROJECT_ID}.{GCS_BUCKET_NAME}.data_done"

In [3]:
query = """SELECT *
FROM `khangtestdbt.xecupredict.data_cleaned` """
data_cleaned = client.query(query).to_dataframe()
data_cleaned.head(1)

Unnamed: 0,km,origin,body,fuel,name,price,brand,age
0,16913,nhập khẩu,,,Toyota Raize 2024,510000000,toyota,1


In [55]:
df = data_cleaned.copy()

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16159 entries, 0 to 16158
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   km      16159 non-null  Int64 
 1   origin  16143 non-null  object
 2   body    14515 non-null  object
 3   fuel    14793 non-null  object
 4   name    16159 non-null  object
 5   price   16159 non-null  Int64 
 6   brand   16159 non-null  object
 7   age     16159 non-null  Int64 
dtypes: Int64(3), object(5)
memory usage: 1.0+ MB


In [57]:
df.duplicated().sum()

np.int64(655)

In [58]:
df.drop_duplicates(inplace=True)

In [59]:
df.isna().sum()

km           0
origin      16
body      1636
fuel      1337
name         0
price        0
brand        0
age          0
dtype: int64

1.brand

In [60]:
df["brand"].dtype

dtype('O')

In [61]:
df["brand"].unique()

array(['toyota', 'vinfast', 'volkswagen', 'volvo', 'mercedes', 'mg',
       'suzuki', 'thaco', 'mitsubishi', 'nissan', 'peugeot', 'porsche',
       'rolls', 'samsung', 'skoda', 'smart', 'srm', 'subaru', 'mini',
       'kia', 'hyundai', 'ford', 'land', 'lexus', 'maserati', 'mazda',
       'gac', 'honda', 'acura', 'audi', 'bentley', 'bmw', 'chevrolet',
       'citroen', 'daewoo', 'daihatsu', 'dongben', 'jaguar', 'kenbo',
       'landrover', 'lamborghini', 'gaz', 'hongqi', 'hummer', 'dodge',
       'ferrari', 'baic', 'byd', 'cadillac', 'aston', 'infiniti', 'isuzu',
       'jeep', 'lynk', 'mclaren', 'sym', 'omoda', 'ram', 'renault',
       'ssangyong', 'mercedes benz', 'lynk&co', 'alfa romeo',
       'hãng khác', 'wuling', 'changan', 'zotye', 'reult', 'luxgen',
       'rolls royce', 'geely', 'genesis', 'dongfeng', 'chrysler',
       'mekong', 'fiat', 'asia', 'lada'], dtype=object)

In [62]:
brand_alias = {
    "mercedes": "mercedes-benz",
    "mercedes benz": "mercedes-benz",

    "rolls": "rolls-royce",
    "rolls royce": "rolls-royce",

    "land": "land rover",
    "landrover": "land rover",

    "lynk": "lynk & co",
    "lynk&co": "lynk & co",
}

In [63]:
df["brand"] = df["brand"].replace(brand_alias)

top_brands = df["brand"].value_counts().nlargest(25).index
df["brand"] = df["brand"].apply(lambda x: x if x in top_brands else "other")


In [64]:
df["brand"].value_counts()

brand
toyota           2831
ford             1713
mercedes-benz    1646
kia              1472
hyundai          1380
mitsubishi        934
mazda             778
vinfast           769
honda             659
lexus             555
bmw               460
chevrolet         314
suzuki            228
other             226
porsche           222
peugeot           213
land rover        185
nissan            178
mg                176
audi              171
volvo             116
volkswagen         98
daewoo             67
isuzu              44
mini               43
jaguar             26
Name: count, dtype: int64

2.origin

In [65]:
df["origin"].unique()

array(['nhập khẩu', 'trong nước', None], dtype=object)

In [66]:
df["origin"].value_counts()

origin
nhập khẩu     8340
trong nước    7148
Name: count, dtype: int64

In [67]:
origin_mode_per_brand = df.groupby('brand')['origin'].agg(lambda x: x.mode()[0] if not x.mode().empty else None)
origin_mode_per_brand


brand
audi              nhập khẩu
bmw               nhập khẩu
chevrolet         nhập khẩu
daewoo            nhập khẩu
ford              nhập khẩu
honda             nhập khẩu
hyundai          trong nước
isuzu             nhập khẩu
jaguar            nhập khẩu
kia              trong nước
land rover        nhập khẩu
lexus             nhập khẩu
mazda            trong nước
mercedes-benz    trong nước
mg                nhập khẩu
mini              nhập khẩu
mitsubishi        nhập khẩu
nissan            nhập khẩu
other             nhập khẩu
peugeot          trong nước
porsche           nhập khẩu
suzuki            nhập khẩu
toyota            nhập khẩu
vinfast          trong nước
volkswagen        nhập khẩu
volvo             nhập khẩu
Name: origin, dtype: object

In [68]:
df['origin'] = df.apply(
    lambda row: origin_mode_per_brand[row['brand']] if pd.isna(row['origin']) else row['origin'],
    axis=1
)


In [69]:
df['origin'].isna().sum()

np.int64(0)

3.body

In [70]:
df["body"].unique()

array([None, 'suv', 'mpv', 'van/minivan', 'crossover', 'sedan',
       'hatchback', 'special purpose', 'bán tải', 'coupe', 'convertible',
       'sport car', 'xe tải', 'minibus', 'convertible/cabriolet',
       'bán tải / pickup', 'truck', 'suv / cross over',
       'pick-up (bán tải)', 'minivan (mpv)', 'kiểu dáng khác', 'van',
       'coupe (2 cửa)', 'mui trần'], dtype=object)

In [71]:
df['body'] = df['body'].replace({
    'xe tải': 'truck',
    'truck': 'truck',
    'bán tải': 'pickup',
    'bán tải / pickup': 'pickup',
    'pick-up (bán tải)': 'pickup',
    'van/minivan': 'minivan',
    'minivan (mpv)': 'minivan',
    'mpv': 'minivan',
    'suv / cross over': 'suv',
    'crossover': 'suv',
    'convertible/cabriolet': 'convertible',
    'coupe (2 cửa)': 'coupe'
})


In [72]:
body_mode_per_brand = df.groupby('brand')['body'].agg(lambda x: x.mode()[0] if not x.mode().empty else None)
body_mode_per_brand

brand
audi                   suv
bmw                  sedan
chevrolet              suv
daewoo               sedan
ford                   suv
honda                sedan
hyundai                suv
isuzu                  suv
jaguar               sedan
kia                    suv
land rover             suv
lexus                  suv
mazda                sedan
mercedes-benz        sedan
mg                     suv
mini             hatchback
mitsubishi             suv
nissan                 suv
other                  suv
peugeot                suv
porsche                suv
suzuki           hatchback
toyota                 suv
vinfast                suv
volkswagen             suv
volvo                  suv
Name: body, dtype: object

In [73]:
df['body'] = df.apply(
    lambda row: body_mode_per_brand[row['brand']] if pd.isna(row['body']) else row['body'],
    axis=1
)

In [74]:
df['body'].isna().sum()

np.int64(0)

4.fuel

In [75]:
df["fuel"].unique()

array([None, 'Điện', 'Hybrid', 'Xăng', 'Dầu'], dtype=object)

In [76]:
df['fuel'] = df['fuel'].str.lower()

In [77]:
fuel_mode_per_brand_body = df.groupby(["brand","body"])['fuel'].agg(lambda x: x.mode()[0] if not x.mode().empty else None)
def fill_fuel(row):
    if pd.isna(row['fuel']):
        return fuel_mode_per_brand_body.get((row['brand'], row['body']), None)
    else:
        return row['fuel']

df['fuel'] = df.apply(fill_fuel, axis=1)

In [78]:
df['fuel'].isna().sum()

np.int64(8)

In [79]:
fuel_mode_per_brand = df.groupby('brand')['fuel'].agg(lambda x: x.mode()[0] if not x.mode().empty else None)
df['fuel'] = df.apply(
    lambda row: fuel_mode_per_brand[row['brand']] if pd.isna(row['fuel']) else row['fuel'],
    axis=1
)

df['fuel'].isna().sum()

np.int64(0)

In [80]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15504 entries, 0 to 16158
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   km      15504 non-null  Int64 
 1   origin  15504 non-null  object
 2   body    15504 non-null  object
 3   fuel    15504 non-null  object
 4   name    15504 non-null  object
 5   price   15504 non-null  Int64 
 6   brand   15504 non-null  object
 7   age     15504 non-null  Int64 
dtypes: Int64(3), object(5)
memory usage: 1.1+ MB


5.outlier

In [81]:
df["price"] = df["price"].astype(float)
df["km"] = df["km"].astype(float)
df["age"] = df["age"].astype(float)

In [82]:
def cap_outliers_group(df, column, group_cols):
    def cap_group(x):
        Q1 = x[column].quantile(0.25)
        Q3 = x[column].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        return x[column].clip(lower, upper)
    
    df[column] = df.groupby(group_cols, group_keys=False).apply(cap_group)
    return df

In [83]:
df = cap_outliers_group(df, "km", ['age'])
df = cap_outliers_group(df, "price", ['brand','age'])

In [84]:
df.head()

Unnamed: 0,km,origin,body,fuel,name,price,brand,age
0,16913.0,nhập khẩu,suv,xăng,Toyota Raize 2024,510000000.0,toyota,1.0
1,130000.0,nhập khẩu,suv,xăng,Toyota RAV4 LE 2007,280000000.0,toyota,18.0
2,80000.0,nhập khẩu,suv,xăng,Toyota Rush S 1 5 AT 2020,475000000.0,toyota,5.0
3,44000.0,nhập khẩu,suv,xăng,Toyota Rush S 1 5 AT 2021,490000000.0,toyota,4.0
4,30000.0,nhập khẩu,suv,xăng,Toyota Rush 1 5 AT 2022,480000000.0,toyota,3.0


In [85]:
df.drop(columns=["name"], inplace=True)

In [86]:
X = df.drop(columns=["price"])
X.head()

Unnamed: 0,km,origin,body,fuel,brand,age
0,16913.0,nhập khẩu,suv,xăng,toyota,1.0
1,130000.0,nhập khẩu,suv,xăng,toyota,18.0
2,80000.0,nhập khẩu,suv,xăng,toyota,5.0
3,44000.0,nhập khẩu,suv,xăng,toyota,4.0
4,30000.0,nhập khẩu,suv,xăng,toyota,3.0


In [87]:
X[X.duplicated(keep=False)]


Unnamed: 0,km,origin,body,fuel,brand,age
0,16913.0,nhập khẩu,suv,xăng,toyota,1.0
1,130000.0,nhập khẩu,suv,xăng,toyota,18.0
2,80000.0,nhập khẩu,suv,xăng,toyota,5.0
3,44000.0,nhập khẩu,suv,xăng,toyota,4.0
4,30000.0,nhập khẩu,suv,xăng,toyota,3.0
...,...,...,...,...,...,...
16087,150000.0,nhập khẩu,sedan,xăng,toyota,25.0
16098,119999.0,nhập khẩu,sedan,xăng,toyota,25.0
16099,150000.0,nhập khẩu,sedan,xăng,toyota,25.0
16142,123456.0,nhập khẩu,sedan,xăng,toyota,32.0


In [88]:
df_unique = df.groupby(list(X.columns), as_index=False)["price"].mean()
df_unique

Unnamed: 0,km,origin,body,fuel,brand,age,price
0,105.0,trong nước,minivan,xăng,toyota,14.0,205000000.0
1,107.0,trong nước,minivan,xăng,toyota,15.0,235000000.0
2,110.0,nhập khẩu,hatchback,xăng,hyundai,11.0,232000000.0
3,110.0,nhập khẩu,sedan,xăng,toyota,18.0,295000000.0
4,112.0,nhập khẩu,mui trần,xăng,mini,20.0,365000000.0
...,...,...,...,...,...,...,...
11300,350000.0,nhập khẩu,suv,xăng,mitsubishi,35.0,450000000.0
11301,358950.0,nhập khẩu,sedan,xăng,mitsubishi,25.0,65000000.0
11302,412027.5,nhập khẩu,suv,xăng,toyota,24.0,97500000.0
11303,451000.0,nhập khẩu,sedan,xăng,toyota,27.0,92500000.0


In [89]:
cols_show = ["brand", "body", "fuel", "origin"]

df_unique[cols_show] = df_unique[cols_show].apply(lambda c: c.str.title())


In [90]:
df_unique.sample(10)

Unnamed: 0,km,origin,body,fuel,brand,age,price
8866,100000.0,Nhập Khẩu,Suv,Xăng,Chevrolet,11.0,148000000.0
10983,183000.0,Nhập Khẩu,Suv,Dầu,Hyundai,10.0,360000000.0
337,4000.0,Nhập Khẩu,Suv,Xăng,Mitsubishi,1.0,495000000.0
5966,66000.0,Nhập Khẩu,Suv,Xăng,Porsche,4.0,3860000000.0
2540,28000.0,Nhập Khẩu,Suv,Dầu,Ford,3.0,1146500000.0
8721,99000.0,Trong Nước,Suv,Xăng,Toyota,13.0,360000000.0
1210,12500.0,Nhập Khẩu,Suv,Xăng,Chevrolet,21.0,75000000.0
178,1500.0,Trong Nước,Suv,Dầu,Toyota,16.0,295000000.0
4162,46000.0,Trong Nước,Suv,Xăng,Mazda,3.0,777000000.0
3860,42338.0,Trong Nước,Sedan,Xăng,Toyota,4.0,579000000.0


In [91]:
df_unique.duplicated().sum()

np.int64(0)

In [92]:
import json


unique_values = {
    'origin': df_unique['origin'].value_counts().index.tolist(),
    'fuel':   df_unique['fuel'].value_counts().index.tolist(),
    'body':   df_unique['body'].value_counts().index.tolist(),
    'brand':  df_unique['brand'].value_counts().index.tolist(),
}

with open('../../model/unique_values.json', 'w', encoding='utf-8') as f:
    json.dump(unique_values, f, ensure_ascii=False, indent=2)

In [93]:
upload_to_bigquery(df_unique, table_id=table_id_done, if_exists="replace" )

✅ Uploaded 11305 rows to khangtestdbt.xecupredict.data_done


* feature engineering

In [94]:
import numpy as np

df_unique["km_per_year"] = df_unique["km"] / (df_unique["age"] + 1)
df_unique["log_km"] = np.log1p(df_unique["km"])
df_unique["log_age"] = np.log1p(df_unique["age"])

In [95]:
df_unique["is_imported"] = (df_unique["origin"] == "Nhập Khẩu").astype(object)
df_unique["age_x_km"] = df_unique["age"] * df_unique["km"]

In [96]:
df_unique["age_group"] = pd.cut(
    df_unique["age"],
    bins=[-1, 5, 10, 15, 100],
    labels=["New", "Young", "Mid", "Old"]
)

In [97]:
df_unique["km_group"] = pd.cut(
    df_unique["km"],
    bins=[0, 50000, 100000, 150000, 300000, 1e8],
    labels=["Very_low", "Low", "Medium", "High", "Very_high"]
)

In [98]:
df_unique["is_common_body"] = df_unique["body"].isin(
    ["Suv", "Minivan", "Sedan"]
).astype(object)

In [99]:
df_unique.head()

Unnamed: 0,km,origin,body,fuel,brand,age,price,km_per_year,log_km,log_age,is_imported,age_x_km,age_group,km_group,is_common_body
0,105.0,Trong Nước,Minivan,Xăng,Toyota,14.0,205000000.0,7.0,4.663439,2.70805,False,1470.0,Mid,Very_low,True
1,107.0,Trong Nước,Minivan,Xăng,Toyota,15.0,235000000.0,6.6875,4.682131,2.772589,False,1605.0,Mid,Very_low,True
2,110.0,Nhập Khẩu,Hatchback,Xăng,Hyundai,11.0,232000000.0,9.166667,4.70953,2.484907,True,1210.0,Mid,Very_low,False
3,110.0,Nhập Khẩu,Sedan,Xăng,Toyota,18.0,295000000.0,5.789474,4.70953,2.944439,True,1980.0,Old,Very_low,True
4,112.0,Nhập Khẩu,Mui Trần,Xăng,Mini,20.0,365000000.0,5.333333,4.727388,3.044522,True,2240.0,Old,Very_low,False


* scale

In [113]:
X = df_unique.copy()

In [114]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11305 entries, 0 to 11304
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   km              11305 non-null  float64 
 1   origin          11305 non-null  object  
 2   body            11305 non-null  object  
 3   fuel            11305 non-null  object  
 4   brand           11305 non-null  object  
 5   age             11305 non-null  float64 
 6   price           11305 non-null  float64 
 7   km_per_year     11305 non-null  float64 
 8   log_km          11305 non-null  float64 
 9   log_age         11305 non-null  float64 
 10  is_imported     11305 non-null  object  
 11  age_x_km        11305 non-null  float64 
 12  age_group       11305 non-null  category
 13  km_group        11305 non-null  category
 14  is_common_body  11305 non-null  object  
dtypes: category(2), float64(7), object(6)
memory usage: 1.1+ MB


In [115]:
X_ = X[["price"]]

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_cols = [col for col in X.columns 
                    if X[col].dtype in ['object','category'] and col not in X_]
numerical_cols = [col for col in X.columns 
                  if X[col].dtype in ['float64'] and col not in X_]

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = ohe.fit_transform(X[categorical_cols])
cat_feature_names = ohe.get_feature_names_out(categorical_cols)
X_cat_df = pd.DataFrame(X_cat, columns=cat_feature_names, index=X.index)


scaler = StandardScaler()
X_num = scaler.fit_transform(X[numerical_cols])
X_num_df = pd.DataFrame(X_num, columns=numerical_cols, index=X.index)

X_encoded = pd.concat([X_num_df, X_cat_df, X_], axis=1)
X_encoded.shape

(11305, 65)

In [117]:
X_encoded.head()

Unnamed: 0,km,age,km_per_year,log_km,log_age,age_x_km,origin_Nhập Khẩu,origin_Trong Nước,body_Convertible,body_Coupe,...,age_group_New,age_group_Old,age_group_Young,km_group_High,km_group_Low,km_group_Medium,km_group_Very_high,km_group_Very_low,is_common_body_False,is_common_body_True
0,-1.376184,1.293177,-1.791607,-5.667706,1.19871,-0.690388,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,-1.376144,1.476946,-1.791665,-5.650369,1.28732,-0.690244,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,-1.376085,0.74187,-1.791207,-5.624955,0.892337,-0.690664,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,-1.376085,2.028253,-1.79183,-5.624955,1.523267,-0.689845,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,-1.376045,2.395791,-1.791914,-5.608392,1.66068,-0.689568,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [111]:
from joblib import dump

dump(ohe, "../../model/onehot_encoder.pkl")
dump(scaler,"../../model/scaler.pkl")

['../../model/scaler.pkl']

In [112]:
upload_to_bigquery(X_encoded, table_id=table_id, if_exists="replace" )

✅ Uploaded 11305 rows to khangtestdbt.xecupredict.data_train_model
