In [None]:
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz 
import os
import pickle
from matplotlib.font_manager import FontProperties
# Global plotting configuration: use the generic sans-serif family, then switch
# to the "whitegrid" style with 'Microsoft JhengHei' as the sans-serif font.
# NOTE(review): presumably chosen so the Chinese column names render in
# figures; the font must be installed on the machine -- confirm.
sns.set(font=['sans-serif'])
sns.set_style("whitegrid",{"font.sans-serif":['Microsoft JhengHei']})
# scikit-learn
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer, RobustScaler
from sklearn import datasets, linear_model, preprocessing, neighbors
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix,mean_absolute_error,mean_squared_error, r2_score
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,KFold,RandomizedSearchCV
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
import xgboost as xgb

In [None]:
# Load the one-hot encoded prediction set; the first CSV column becomes the index.
df_oh = pd.read_csv(
    '../新北市_predictset_onehot.csv',
    index_col=[0],
    engine='python',
)
df_oh

In [None]:
# Correlation heatmap of the first ten columns of the one-hot encoded frame.
# seaborn is imported and its font/style configured once in the first
# (imports) cell, so neither is repeated here.
fig, ax = plt.subplots(figsize=(10, 8))
# Draw on the explicit Axes; the trailing ';' hides the Axes repr in the output.
sns.heatmap(df_oh.iloc[:, 0:10].corr(), annot=True, ax=ax);

## XGBRegressor

In [None]:
# Target is the '每坪價格' (price per ping) column; the feature matrix is every
# remaining column except '車位' (parking space), which is dropped as well.
y = df_oh['每坪價格']
X = df_oh.drop(['每坪價格', '車位'], axis=1)

In [None]:
# Baseline model (before hyper-parameter tuning).
# A fixed random_state makes the 70/30 split -- and therefore every metric
# computed from it -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Build an XGBRegressor with default hyper-parameters.
# NOTE(review): tree_method="gpu_hist" requires a CUDA-enabled XGBoost build and
# is deprecated in XGBoost 2.x in favour of device="cuda" + tree_method="hist";
# confirm the installed version before switching.
xgbr = xgb.XGBRegressor(tree_method="gpu_hist")
# Fit on the training split only.
xgbr.fit(X_train, y_train)

# Predictions on both splits, used to compare train vs. test error.
y_train_pred = xgbr.predict(X_train)
y_test_pred = xgbr.predict(X_test)

In [None]:
# Train/test error of the model fitted above: RMSE, MAE and R^2.
# RMSE is taken as sqrt(MSE) instead of passing `squared=False`, which
# scikit-learn deprecated in 1.4 and removed in 1.6; sqrt(MSE) works on every version.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f'訓練集RMSE: {rmse_train:.3f},測試集: {rmse_test:.3f}')
print(f'訓練集MAE: {(mean_absolute_error(y_train, y_train_pred)):.3f},測試集: {(mean_absolute_error(y_test, y_test_pred)):.3f}')
print(f'訓練集R2: {(r2_score(y_train, y_train_pred)):.3f},測試集: {(r2_score(y_test, y_test_pred)):.3f}')

In [None]:
# Importance scores reported by the fitted regressor; shown as the cell output.
importance = xgbr.feature_importances_
importance

In [None]:
# Pair each feature name with its importance score and rank highest first.
X_del = (
    pd.DataFrame({'importance': xgbr.feature_importances_, 'features': X.columns})
    .sort_values(by='importance', ascending=False)
)
# Names of the 20 top-ranked features (a Series of column names, not a boolean mask).
mask_selected = X_del['features'].head(20)
# Training rows restricted to those 20 columns.
X_selected = X_train[mask_selected.tolist()]
X_selected

In [None]:
# Add the local Graphviz binaries to PATH so xgboost's tree plotting can find them.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same directory to PATH again.
graphviz_bin = '../../pylab-Roger/BDSE_機器學習/實作/hands-on_part5/example/release/bin'
if graphviz_bin not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + graphviz_bin
# Get a graph
graph = xgb.to_graphviz(xgbr, num_trees=1)
# Or get a matplotlib axis
ax = xgb.plot_tree(xgbr, num_trees=1)
# Get feature importances
xgbr.feature_importances_

In [None]:
# Hyper-parameter tuning: randomized search that samples n_iter candidates
# from the grid below (not an exhaustive grid search).
params = {
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7],
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': [100, 200, 500],
    'objective': ['reg:squarederror'],
}

# NOTE(review): despite the "clf" names this is a regressor. n_jobs=-1 runs the
# candidate fits in parallel while tree_method='gpu_hist' trains on the GPU;
# presumably all workers share one device -- confirm this does not exhaust GPU memory.
xgbclf = xgb.XGBRegressor(tree_method='gpu_hist', enable_categorical=True)
clf = RandomizedSearchCV(
    estimator=xgbclf,
    param_distributions=params,
    n_iter=10,
    n_jobs=-1,
    cv=3,
    # Fixed seed so the same 10 candidates are sampled on every run and
    # best_params_ / best_score_ are reproducible.
    random_state=42,
    verbose=1,
)

clf.fit(X_train, y_train)

# Best parameter combination found and its mean cross-validated score.
print(clf.best_params_,clf.best_score_)

In [None]:
# Model after hyper-parameter tuning.
# NOTE(review): max_depth=11 and learning_rate=0.03 are not among the values in
# the search grid defined in the tuning cell, so these settings look hand-picked
# rather than copied from best_params_ -- confirm.
tuned_params = dict(
    learning_rate=0.03,
    n_estimators=500,
    max_depth=11,
    subsample=0.7,
    min_child_weight=5,
    colsample_bytree=0.5,
)
xgbr = xgb.XGBRegressor(
    tree_method="gpu_hist",
    enable_categorical=True,
    n_jobs=-1,
    **tuned_params,
)
# Fit the tuned model on the training split.
xgbr.fit(X_train, y_train)

# Predictions on both splits for the train/test comparison.
y_train_pred, y_test_pred = xgbr.predict(X_train), xgbr.predict(X_test)

In [None]:
# Train/test error of the most recently fitted model: RMSE, MAE and R^2.
# RMSE is taken as sqrt(MSE) instead of passing `squared=False`, which
# scikit-learn deprecated in 1.4 and removed in 1.6; sqrt(MSE) works on every version.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f'訓練集RMSE: {rmse_train:.3f},測試集: {rmse_test:.3f}')
print(f'訓練集MAE: {(mean_absolute_error(y_train, y_train_pred)):.3f},測試集: {(mean_absolute_error(y_test, y_test_pred)):.3f}')
print(f'訓練集R2: {(r2_score(y_train, y_train_pred)):.3f},測試集: {(r2_score(y_test, y_test_pred)):.3f}')

In [None]:
# 5-fold cross-validation on the full data set with shuffled, seeded folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Evaluate `xgbr`, the regressor defined in this notebook (the name `model` is
# never defined here and would raise NameError on a fresh kernel).
# cross_val_score clones the estimator for each fold, so its fitted state is not
# reused; with no `scoring` given it uses the estimator's own score (R^2 for regressors).
sc = cross_val_score(xgbr, X, y, cv=kf)

In [None]:
# Average of the per-fold cross-validation scores.
np.mean(sc)

In [None]:
# Persist the fitted regressor to disk using the highest available pickle protocol.
with open("model_XGBR_newtaipei.pickle", mode="wb") as model_file:
    pickle.Pickler(model_file, protocol=pickle.HIGHEST_PROTOCOL).dump(xgbr)