In [8]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_absolute_error
from xgboost import XGBRegressor

import pickle
import joblib

In [9]:
df = pd.read_csv('VN_housing_dataset.csv')
df = df.drop(columns=['Unnamed: 0'])
print(df.shape)
print(df.info())
df.head()

(82497, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82497 entries, 0 to 82496
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Ng√†y             82496 non-null  object
 1   ƒê·ªãa ch·ªâ          82449 non-null  object
 2   Qu·∫≠n             82495 non-null  object
 3   Huy·ªán            82449 non-null  object
 4   Lo·∫°i h√¨nh nh√† ·ªü  82465 non-null  object
 5   Gi·∫•y t·ªù ph√°p l√Ω  53610 non-null  object
 6   S·ªë t·∫ßng          36399 non-null  object
 7   S·ªë ph√≤ng ng·ªß     82458 non-null  object
 8   Di·ªán t√≠ch        82495 non-null  object
 9   D√†i              19827 non-null  object
 10  R·ªông             35445 non-null  object
 11  Gi√°/m2           82484 non-null  object
dtypes: object(12)
memory usage: 7.6+ MB
None


Unnamed: 0,Ng√†y,ƒê·ªãa ch·ªâ,Qu·∫≠n,Huy·ªán,Lo·∫°i h√¨nh nh√† ·ªü,Gi·∫•y t·ªù ph√°p l√Ω,S·ªë t·∫ßng,S·ªë ph√≤ng ng·ªß,Di·ªán t√≠ch,D√†i,R·ªông,Gi√°/m2
0,2020-08-05,"ƒê∆∞·ªùng Ho√†ng Qu·ªëc Vi·ªát, Ph∆∞·ªùng Nghƒ©a ƒê√¥, Qu·∫≠n C...",Qu·∫≠n C·∫ßu Gi·∫•y,Ph∆∞·ªùng Nghƒ©a ƒê√¥,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,4.0,5 ph√≤ng,46 m¬≤,,,"86,96 tri·ªáu/m¬≤"
1,2020-08-05,"ƒê∆∞·ªùng Kim Giang, Ph∆∞·ªùng Kim Giang, Qu·∫≠n Thanh ...",Qu·∫≠n Thanh Xu√¢n,Ph∆∞·ªùng Kim Giang,"Nh√† m·∫∑t ph·ªë, m·∫∑t ti·ªÅn",,,3 ph√≤ng,37 m¬≤,,,"116,22 tri·ªáu/m¬≤"
2,2020-08-05,"ph·ªë minh khai, Ph∆∞·ªùng Minh Khai, Qu·∫≠n Hai B√† T...",Qu·∫≠n Hai B√† Tr∆∞ng,Ph∆∞·ªùng Minh Khai,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,4.0,4 ph√≤ng,40 m¬≤,10 m,4 m,65 tri·ªáu/m¬≤
3,2020-08-05,"ƒê∆∞·ªùng V√µng Th·ªã, Ph∆∞·ªùng Th·ª•y Khu√™, Qu·∫≠n T√¢y H·ªì,...",Qu·∫≠n T√¢y H·ªì,Ph∆∞·ªùng Th·ª•y Khu√™,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,,6 ph√≤ng,51 m¬≤,12.75 m,4 m,100 tri·ªáu/m¬≤
4,2020-08-05,"ƒê∆∞·ªùng Kim Giang, Ph∆∞·ªùng Kim Giang, Qu·∫≠n Thanh ...",Qu·∫≠n Thanh Xu√¢n,Ph∆∞·ªùng Kim Giang,"Nh√† ng√µ, h·∫ªm",,,4 ph√≤ng,36 m¬≤,9 m,4 m,"86,11 tri·ªáu/m¬≤"


In [10]:
df.columns = ['ngay', 'diachi', 'quan', 'huyen', 'loaihinhnhao', 'giaytophaply', 'sotang', 'sophongngu', 'dientich', 'dai', 'rong', 'dongia']
df = df[df.dongia.notna()]
print(df.shape[0])
df

82484


Unnamed: 0,ngay,diachi,quan,huyen,loaihinhnhao,giaytophaply,sotang,sophongngu,dientich,dai,rong,dongia
0,2020-08-05,"ƒê∆∞·ªùng Ho√†ng Qu·ªëc Vi·ªát, Ph∆∞·ªùng Nghƒ©a ƒê√¥, Qu·∫≠n C...",Qu·∫≠n C·∫ßu Gi·∫•y,Ph∆∞·ªùng Nghƒ©a ƒê√¥,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,4,5 ph√≤ng,46 m¬≤,,,"86,96 tri·ªáu/m¬≤"
1,2020-08-05,"ƒê∆∞·ªùng Kim Giang, Ph∆∞·ªùng Kim Giang, Qu·∫≠n Thanh ...",Qu·∫≠n Thanh Xu√¢n,Ph∆∞·ªùng Kim Giang,"Nh√† m·∫∑t ph·ªë, m·∫∑t ti·ªÅn",,,3 ph√≤ng,37 m¬≤,,,"116,22 tri·ªáu/m¬≤"
2,2020-08-05,"ph·ªë minh khai, Ph∆∞·ªùng Minh Khai, Qu·∫≠n Hai B√† T...",Qu·∫≠n Hai B√† Tr∆∞ng,Ph∆∞·ªùng Minh Khai,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,4,4 ph√≤ng,40 m¬≤,10 m,4 m,65 tri·ªáu/m¬≤
3,2020-08-05,"ƒê∆∞·ªùng V√µng Th·ªã, Ph∆∞·ªùng Th·ª•y Khu√™, Qu·∫≠n T√¢y H·ªì,...",Qu·∫≠n T√¢y H·ªì,Ph∆∞·ªùng Th·ª•y Khu√™,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,,6 ph√≤ng,51 m¬≤,12.75 m,4 m,100 tri·ªáu/m¬≤
4,2020-08-05,"ƒê∆∞·ªùng Kim Giang, Ph∆∞·ªùng Kim Giang, Qu·∫≠n Thanh ...",Qu·∫≠n Thanh Xu√¢n,Ph∆∞·ªùng Kim Giang,"Nh√† ng√µ, h·∫ªm",,,4 ph√≤ng,36 m¬≤,9 m,4 m,"86,11 tri·ªáu/m¬≤"
...,...,...,...,...,...,...,...,...,...,...,...,...
82491,2019-08-23,"ƒê∆∞·ªùng H·ªì T√πng M·∫≠u, Ph∆∞·ªùng Ph√∫c Di·ªÖn, Qu·∫≠n B·∫Øc ...",Qu·∫≠n B·∫Øc T·ª´ Li√™m,Ph∆∞·ªùng Ph√∫c Di·ªÖn,Nh√† ph·ªë li·ªÅn k·ªÅ,,,3 ph√≤ng,38 m¬≤,,,"81,58 tri·ªáu/m¬≤"
82492,2019-08-07,"ƒê∆∞·ªùng Tr·∫ßn Qu·ªëc Ho√†n, Ph∆∞·ªùng Quan Hoa, Qu·∫≠n C·∫ß...",Qu·∫≠n C·∫ßu Gi·∫•y,Ph∆∞·ªùng Quan Hoa,"Nh√† m·∫∑t ph·ªë, m·∫∑t ti·ªÅn",,,3 ph√≤ng,50 m¬≤,,,292 tri·ªáu/m¬≤
82493,2019-08-07,"ƒê∆∞·ªùng Nguy·ªÖn Kh√°nh To√†n, Ph∆∞·ªùng Quan Hoa, Qu·∫≠n...",Qu·∫≠n C·∫ßu Gi·∫•y,Ph∆∞·ªùng Quan Hoa,"Nh√† m·∫∑t ph·ªë, m·∫∑t ti·ªÅn",ƒê√£ c√≥ s·ªï,,4 ph√≤ng,41 m¬≤,,,"341,46 tri·ªáu/m¬≤"
82494,2019-08-05,"ƒê∆∞·ªùng Quan Hoa, Ph∆∞·ªùng Quan Hoa, Qu·∫≠n C·∫ßu Gi·∫•y...",Qu·∫≠n C·∫ßu Gi·∫•y,Ph∆∞·ªùng Quan Hoa,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,,4 ph√≤ng,60 m¬≤,,,"101,67 tri·ªáu/m¬≤"


In [11]:
df = df[df['sotang'] != 'Nhi·ªÅu h∆°n 10']
df['sotang'] = df['sotang'].fillna(0)
df = df[df['sophongngu'] != 'nhi·ªÅu h∆°n 10 ph√≤ng']
print(df.shape[0])

81615


In [12]:
df['duong'] = df['diachi'].str.split(', ', expand=True)[0]
df['sotang'] = df['sotang'].astype(int)
df['sophongngu'] = df['sophongngu'].str.replace(' ph√≤ng','').str.strip().astype(float)
df['dientich'] = df['dientich'].str.split('m', expand=True)[0].astype(float)
df['dai'] = df['dai'].str.split('m', expand=True)[0].astype(float)
df['rong'] = df['rong'].str.split('m', expand=True)[0].astype(float)
print(df.shape[0] == 81615)
df.head()

True


Unnamed: 0,ngay,diachi,quan,huyen,loaihinhnhao,giaytophaply,sotang,sophongngu,dientich,dai,rong,dongia,duong
0,2020-08-05,"ƒê∆∞·ªùng Ho√†ng Qu·ªëc Vi·ªát, Ph∆∞·ªùng Nghƒ©a ƒê√¥, Qu·∫≠n C...",Qu·∫≠n C·∫ßu Gi·∫•y,Ph∆∞·ªùng Nghƒ©a ƒê√¥,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,4,5.0,46.0,,,"86,96 tri·ªáu/m¬≤",ƒê∆∞·ªùng Ho√†ng Qu·ªëc Vi·ªát
1,2020-08-05,"ƒê∆∞·ªùng Kim Giang, Ph∆∞·ªùng Kim Giang, Qu·∫≠n Thanh ...",Qu·∫≠n Thanh Xu√¢n,Ph∆∞·ªùng Kim Giang,"Nh√† m·∫∑t ph·ªë, m·∫∑t ti·ªÅn",,0,3.0,37.0,,,"116,22 tri·ªáu/m¬≤",ƒê∆∞·ªùng Kim Giang
2,2020-08-05,"ph·ªë minh khai, Ph∆∞·ªùng Minh Khai, Qu·∫≠n Hai B√† T...",Qu·∫≠n Hai B√† Tr∆∞ng,Ph∆∞·ªùng Minh Khai,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,4,4.0,40.0,10.0,4.0,65 tri·ªáu/m¬≤,ph·ªë minh khai
3,2020-08-05,"ƒê∆∞·ªùng V√µng Th·ªã, Ph∆∞·ªùng Th·ª•y Khu√™, Qu·∫≠n T√¢y H·ªì,...",Qu·∫≠n T√¢y H·ªì,Ph∆∞·ªùng Th·ª•y Khu√™,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,0,6.0,51.0,12.75,4.0,100 tri·ªáu/m¬≤,ƒê∆∞·ªùng V√µng Th·ªã
4,2020-08-05,"ƒê∆∞·ªùng Kim Giang, Ph∆∞·ªùng Kim Giang, Qu·∫≠n Thanh ...",Qu·∫≠n Thanh Xu√¢n,Ph∆∞·ªùng Kim Giang,"Nh√† ng√µ, h·∫ªm",,0,4.0,36.0,9.0,4.0,"86,11 tri·ªáu/m¬≤",ƒê∆∞·ªùng Kim Giang


In [13]:
# df['dongia'] = df['dongia'].replace([','], ['.'])
# df

In [14]:
# Clean and convert all prices to million/m2 instead of VND/m2 or billion/m2
df.loc[df['dongia'].str.contains(' t·ª∑/m¬≤'), 'dongia'] = df.loc[df['dongia'].str.contains(' t·ª∑/m¬≤'), 'dongia'].str.replace(' t·ª∑/m¬≤','').str.replace('.','').str.replace(',','.').astype(float) * 1000
df.loc[df['dongia'].str.contains(' tri·ªáu/m¬≤', na=False), 'dongia'] = df.loc[df['dongia'].str.contains(' tri·ªáu/m¬≤', na=False), 'dongia'].str.replace(' tri·ªáu/m¬≤','').str.replace(',','.').astype(float)
df.loc[df['dongia'].str.contains(' ƒë/m¬≤', na=False), 'dongia'] = df.loc[df['dongia'].str.contains(' ƒë/m¬≤', na=False), 'dongia'].str.replace(' ƒë/m¬≤','').str.replace('.','').astype(float) * 0.000001
# 4. Cu·ªëi c√πng: chuy·ªÉn to√†n b·ªô c·ªôt sang float
df['dongia'] = pd.to_numeric(df['dongia'], errors='coerce')
df.head()

Unnamed: 0,ngay,diachi,quan,huyen,loaihinhnhao,giaytophaply,sotang,sophongngu,dientich,dai,rong,dongia,duong
0,2020-08-05,"ƒê∆∞·ªùng Ho√†ng Qu·ªëc Vi·ªát, Ph∆∞·ªùng Nghƒ©a ƒê√¥, Qu·∫≠n C...",Qu·∫≠n C·∫ßu Gi·∫•y,Ph∆∞·ªùng Nghƒ©a ƒê√¥,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,4,5.0,46.0,,,86.96,ƒê∆∞·ªùng Ho√†ng Qu·ªëc Vi·ªát
1,2020-08-05,"ƒê∆∞·ªùng Kim Giang, Ph∆∞·ªùng Kim Giang, Qu·∫≠n Thanh ...",Qu·∫≠n Thanh Xu√¢n,Ph∆∞·ªùng Kim Giang,"Nh√† m·∫∑t ph·ªë, m·∫∑t ti·ªÅn",,0,3.0,37.0,,,116.22,ƒê∆∞·ªùng Kim Giang
2,2020-08-05,"ph·ªë minh khai, Ph∆∞·ªùng Minh Khai, Qu·∫≠n Hai B√† T...",Qu·∫≠n Hai B√† Tr∆∞ng,Ph∆∞·ªùng Minh Khai,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,4,4.0,40.0,10.0,4.0,65.0,ph·ªë minh khai
3,2020-08-05,"ƒê∆∞·ªùng V√µng Th·ªã, Ph∆∞·ªùng Th·ª•y Khu√™, Qu·∫≠n T√¢y H·ªì,...",Qu·∫≠n T√¢y H·ªì,Ph∆∞·ªùng Th·ª•y Khu√™,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,0,6.0,51.0,12.75,4.0,100.0,ƒê∆∞·ªùng V√µng Th·ªã
4,2020-08-05,"ƒê∆∞·ªùng Kim Giang, Ph∆∞·ªùng Kim Giang, Qu·∫≠n Thanh ...",Qu·∫≠n Thanh Xu√¢n,Ph∆∞·ªùng Kim Giang,"Nh√† ng√µ, h·∫ªm",,0,4.0,36.0,9.0,4.0,86.11,ƒê∆∞·ªùng Kim Giang


In [15]:
num_cols = []
cat_cols = []
for col in df.columns:
    if col != 'dongia':
        if df[col].dtype == 'O':
            cat_cols.append(col)
        else:
            num_cols.append(col)
        
print(num_cols)
print()
print(cat_cols)

['sotang', 'sophongngu', 'dientich', 'dai', 'rong']

['ngay', 'diachi', 'quan', 'huyen', 'loaihinhnhao', 'giaytophaply', 'duong']


In [16]:
label = 'dongia'
print(num_cols)
print()
cat_cols = cat_cols[2:]
print(cat_cols)
features = cat_cols + num_cols
features

['sotang', 'sophongngu', 'dientich', 'dai', 'rong']

['quan', 'huyen', 'loaihinhnhao', 'giaytophaply', 'duong']


['quan',
 'huyen',
 'loaihinhnhao',
 'giaytophaply',
 'duong',
 'sotang',
 'sophongngu',
 'dientich',
 'dai',
 'rong']

In [17]:
print(df.ngay.min())
print(df.ngay.max())

2019-08-05
2020-08-05


In [18]:
# inter quantile range |---------Q1---------Q2-----------Q3------------|      ---
#                                 -------------------------   => IQR, any point < Q1 - 1.5*IQR, any point > Q3 + 1.5IQR  

## feature1, feature2, feature3, outlier cho tung feature, , neu row_index ma xay ra bat thuong it nhat n cot ==> nhieu kha nang
from collections import Counter

def detect_outliers(df,n,features):
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col],25)
        
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers fro feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index

        # append the found outlier indices for col to the list of outlier indices
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than n outliers
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list(k for k,v in outlier_indices.items() if v > n)
    
    return multiple_outliers

outlier_to_drop = detect_outliers(df, 2, num_cols)
outlier_to_drop

[]

In [19]:
train, test = train_test_split(df, test_size=0.2, random_state=42)


In [20]:
train.loc[outlier_to_drop]

Unnamed: 0,ngay,diachi,quan,huyen,loaihinhnhao,giaytophaply,sotang,sophongngu,dientich,dai,rong,dongia,duong


In [21]:
train = train.drop(outlier_to_drop, axis = 0).reset_index(drop=True)

In [22]:
X_train, y_train, X_test, y_test = train[features], train[label], test[features], test[label]
print(train.shape[0] == X_train.shape[0])
print(test.shape[0] == X_test.shape[0])
X_train

True
True


Unnamed: 0,quan,huyen,loaihinhnhao,giaytophaply,duong,sotang,sophongngu,dientich,dai,rong
0,Qu·∫≠n B·∫Øc T·ª´ Li√™m,Ph∆∞·ªùng C·ªï Nhu·∫ø 1,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,ƒê∆∞·ªùng Tr·∫ßn Cung,0,5.0,52.0,11.0,5.0
1,Qu·∫≠n Thanh Xu√¢n,Ph∆∞·ªùng Kh∆∞∆°ng Trung,"Nh√† ng√µ, h·∫ªm",,ƒê∆∞·ªùng Kh∆∞∆°ng Trung,0,10.0,46.0,,
2,Qu·∫≠n C·∫ßu Gi·∫•y,Ph∆∞·ªùng Quan Hoa,"Nh√† m·∫∑t ph·ªë, m·∫∑t ti·ªÅn",,ƒê∆∞·ªùng Ho√†ng Qu·ªëc Vi·ªát,0,6.0,50.0,,4.0
3,Qu·∫≠n Long Bi√™n,Ph∆∞·ªùng Vi·ªát H∆∞ng,Nh√† bi·ªát th·ª±,ƒê√£ c√≥ s·ªï,vi·ªát h∆∞ng,3,7.0,320.0,,20.0
4,Qu·∫≠n C·∫ßu Gi·∫•y,Ph∆∞·ªùng Nghƒ©a T√¢n,"Nh√† m·∫∑t ph·ªë, m·∫∑t ti·ªÅn",,ƒê∆∞·ªùng Ph√πng Ch√≠ Ki√™n,0,7.0,80.0,11.0,7.0
...,...,...,...,...,...,...,...,...,...,...
65287,Qu·∫≠n H√† ƒê√¥ng,Ph∆∞·ªùng Ki·∫øn H∆∞ng,Nh√† ph·ªë li·ªÅn k·ªÅ,ƒê√£ c√≥ s·ªï,Ph√∫c la,5,5.0,83.0,12.0,7.0
65288,Qu·∫≠n Thanh Xu√¢n,Ph∆∞·ªùng Kh∆∞∆°ng Mai,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,ƒê∆∞·ªùng L√™ Tr·ªçng T·∫•n,5,3.0,32.0,,
65289,Qu·∫≠n T√¢y H·ªì,Ph∆∞·ªùng Xu√¢n La,"Nh√† ng√µ, h·∫ªm",,ƒê∆∞·ªùng Xu√¢n La,0,4.0,52.0,,
65290,Qu·∫≠n Ho√†ng Mai,Ph∆∞·ªùng ƒê·ªãnh C√¥ng,"Nh√† ng√µ, h·∫ªm",ƒê√£ c√≥ s·ªï,ƒê∆∞·ªùng ƒê·ªãnh C√¥ng||956,5,4.0,44.0,,


In [23]:
def Log_Dientich_Transform(dientich):
    return np.log1p(dientich)
Log_Dientich_Transformer = FunctionTransformer(Log_Dientich_Transform, validate=False)

class TotalSoPhongTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Assume X is a DataFrame with 'price' and 'sqft' columns
        X = X.copy()
        X['total_sophong'] = X['sophongngu'] * X['sotang']
        return X[['total_sophong']]

In [None]:
# # Create column transformer for OHE
# preprocessor = ColumnTransformer(
#     transformers=[
#         ("log_dientich", Log_Dientich_Transformer, ['dientich']),
#         ("total_sophong", TotalSoPhongTransformer(), ['sophongngu', 'sotang']),
#         ("cat", OneHotEncoder(handle_unknown='ignore'), cat_cols),
#         ("scale", StandardScaler(), num_cols)
#     ]
# )

# # Create a pipeline with preprocessing and random forest classifier
# pipeline_xgb = Pipeline(steps=[
#     ("preprocessor", preprocessor),
#     ("regressor", XGBRegressor(objective='reg:squarederror', random_state=42))
# ])

# # Define hyperparameter grid
# param_grid = {
#     "regressor__n_estimators": [100, 200],
#     "regressor__max_depth": [3, 5, 7],
#     "regressor__learning_rate": [0.01, 0.1, 0.2],
#     "regressor__subsample": [0.8, 1.0],
#     "regressor__colsample_bytree": [0.8, 1.0]
# }

# # Grid search setup
# grid_search_pipeline = GridSearchCV(
#     estimator=pipeline_xgb,
#     param_grid=param_grid,
#     cv=5,
#     scoring='neg_mean_absolute_percentage_error',  # or 'r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error'
#     verbose=2,
#     n_jobs=-1
# )

# # Fit to training data
# grid_search_pipeline.fit(X_train, y_train)

In [None]:
# # Results
# print("‚úÖ Best parameters:", grid_search_pipeline.best_params_)
# print("üìâ Best RMSE:", grid_search_pipeline.best_score_)

In [26]:
# # Gi·∫£ s·ª≠ grid_search l√† ƒë·ªëi t∆∞·ª£ng GridSearchCV ƒë√£ fit
# joblib.dump(grid_search_pipeline, "grid_search_pipeline_model.pkl")

In [27]:
# del grid_search_pipeline

In [None]:
# V·ªõi joblib
grid_search_pipeline = joblib.load("grid_search_pipeline_model.pkl")

In [41]:
y_train_pred = grid_search_pipeline.predict(X_train)
print(r2_score(y_train_pred, y_train))
print(mean_absolute_percentage_error(y_train_pred, y_train))

0.9999999981443491
0.2469513863989205


In [42]:
y_test_pred = grid_search_pipeline.predict(X_test)
print(r2_score(y_test_pred, y_test))
print(mean_absolute_percentage_error(y_test_pred, y_test))

-24.288062518674778
55.570276170928096


In [35]:
# # Gi·∫£ s·ª≠ grid_search l√† ƒë·ªëi t∆∞·ª£ng GridSearchCV ƒë√£ fit
# joblib.dump(grid_search_pipeline, "grid_search_pipeline_model.pkl")