## Import Libraries

In [1]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgbm
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

ModuleNotFoundError: No module named 'xgboost'

## Reading the data

In [None]:
# Load the housing-price table from the CSV next to the notebook and preview it.
housing_data = pd.read_csv(filepath_or_buffer="housing_price_dataset.csv")
housing_data.head()

In [None]:
# Rich display of the first five rows (5 is the default of DataFrame.head).
display(housing_data.head(5))

## EDA

In [None]:
# Missing-value count per column (isna is the canonical spelling of isnull).
display(housing_data.isna().sum())

In [None]:
# Heatmap of the null mask, drawn in the first slot of a 1x3 subplot grid.
fig = plt.figure(figsize=(10, 2))
ax = fig.add_subplot(1, 3, 1)
ax.set_title("housing data")
sns.heatmap(housing_data.isnull(), ax=ax)

plt.show()

In [None]:
# missingno nullity matrix of the whole frame (small figure, dark teal bars).
msno.matrix(housing_data, figsize=(3, 2), color=(0, 0.3, 0.3))

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only added a stray "None" to the cell output.
housing_data.info()

In [None]:
# Row count per neighborhood, drawn in the first slot of a 1x3 subplot grid.
ax = plt.subplot(1, 3, 1)
sns.countplot(x=housing_data["Neighborhood"], ax=ax)
plt.xticks(rotation=90);

In [None]:
# Replace the neighborhood labels with integer codes, overwriting the column
# in place (values that are not keys of the mapping are left untouched).
neighborhood_codes = {'Rural': 1, 'Urban': 2, 'Suburb': 3}
housing_data["Neighborhood"] = housing_data["Neighborhood"].replace(neighborhood_codes)

In [None]:
# Show the frame after the Neighborhood column has been integer-encoded.
display(housing_data)

In [None]:
# NOTE(review): leftover exploration cell. `_append` is a private pandas method
# (leading underscore); it is used later in this notebook to stack train and
# test features. Consider removing this cell from the final notebook.
help(housing_data._append)

In [None]:
# Every column except the target 'Price' is treated as a feature.
train_feature = list(housing_data.columns.drop('Price'))
train_feature

In [None]:
# Summary statistics per feature (one row per feature), styled so that the
# mean is drawn as a bar and std / median are shaded as gradients.
feature_summary = housing_data[train_feature].describe().T
(
    feature_summary.style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='BuPu')
    .background_gradient(subset=['50%'], cmap='Reds')
)

In [None]:
# One 50-bin histogram per feature, each in its own figure
# (left slot of a 1x2 grid).
for feat in train_feature:
    plt.figure(figsize=(15, 3))
    plt.subplot(1, 2, 1)
    housing_data[feat].plot.hist(bins=50, color='blue')
    plt.title(feat)
    plt.show()

In [None]:
# Calculate skewness and kurtosis for each column
skewness = housing_data.skew()
kurtosis = housing_data.kurt()

# Printing the results
for column in housing_data.columns[:-1]:
    print(f"Column: {column}")
    print(f"Skewness: {skewness[column]}")
    print(f"Kurtosis: {kurtosis[column]}\n")

In [None]:
#Split the dataset into train and test
train, test = train_test_split(housing_data, test_size=0.2, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
display(train.describe())
display(test.describe())

## Feature Selection

In [None]:
#Feature_Importance_Analysis with XGBRegressor , Prices are highly correlated to SquareFeet.
X_data_feature= train.drop(columns=['Price'],axis=1)
y_data_feature= train['Price']

model = [XGBRegressor()]

model = [model[i].fit(X_data_feature,y_data_feature) for i in range(len(model))]

num_chr = [12, 12, 10]

for i in range(len(model)):
    print(str(model[i])[:num_chr[i]] + ': \n',
          model[i].feature_importances_)
    feat_importances = pd.Series(model[i].feature_importances_,
                                 index=X_data_feature.columns)
    feat_importances.nlargest(10).plot(kind='barh', color='royalblue')
    plt.xlim(0, 1.0)
    plt.show()

In [None]:
#Feature_Importance_Analysis with LGBMRegressor , Prices also are highly correlated to SquareFeet.
# Same importance analysis with a default LGBMRegressor; the x-axis limit is
# much larger than for XGBoost because the plotted values are on a different scale.
model = [LGBMRegressor()]
model = [estimator.fit(X_data_feature, y_data_feature) for estimator in model]

# Number of characters of each estimator's repr to print as its heading.
num_chr = [12, 12, 10]

for fitted, width in zip(model, num_chr):
    importances = fitted.feature_importances_
    print(str(fitted)[:width] + ': \n', importances)
    feat_importances = pd.Series(importances, index=X_data_feature.columns)
    feat_importances.nlargest(10).plot(kind='barh', color='royalblue')
    plt.xlim(0, 1500)
    plt.show()

In [None]:
# Feature_Importance_Analysis with CatBoostRegressor , Prices also are highly correlated to SquareFeet.
# Same importance analysis with CatBoostRegressor (training log silenced).
model = [CatBoostRegressor(logging_level='Silent')]
model = [estimator.fit(X_data_feature, y_data_feature) for estimator in model]

# Number of characters of each estimator's repr to print as its heading.
num_chr = [12, 12, 10]

for fitted, width in zip(model, num_chr):
    importances = fitted.feature_importances_
    print(str(fitted)[:width] + ': \n', importances)
    feat_importances = pd.Series(importances, index=X_data_feature.columns)
    feat_importances.nlargest(10).plot(kind='barh', color='royalblue')
    plt.xlim(0, 80)
    plt.show()

In [None]:
# using Heatmap to find relations between features.
corr = train.corr(method='pearson')
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr, cmap='RdBu', annot=True, fmt=".2f")
plt.xticks(range(len(corr.columns)), corr.columns);
plt.yticks(range(len(corr.columns)), corr.columns)
plt.show()

In [None]:
# So We need to remove Neighborhood and YearBuilt feature .
X = train.drop(columns=['Price','Neighborhood','YearBuilt'],axis=1)
y = train['Price']

In [None]:
X_train = X
y_train = y

# Keep the fitted scaler under its own name. Rebinding `StandardScaler` to an
# instance shadowed the class, so re-running this cell raised
# "'StandardScaler' object is not callable".
scaler = StandardScaler()
# fit_transform returns a plain ndarray, so the resulting frame has positional
# integer column labels rather than the original feature names.
X_train = pd.DataFrame(scaler.fit_transform(X_train))
X_train

## Modeling

In [None]:
# Carve a 20% evaluation split out of the scaled training data (fixed seed).
# NOTE: this overwrites X_train / y_train, so re-running the cell without
# re-running the scaling cell above splits the already-reduced training set again.
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, y_train, test_size=0.2, random_state=2019
)
for label, part in [
    ("Shape of X_train: ", X_train),
    ("Shape of X_eval: ", X_eval),
    ("Shape of y_train: ", y_train),
    ("Shape of y_eval", y_eval),
]:
    print(label, part.shape)

In [None]:
# the distribution of the Price
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(5, 4))
sns.histplot(y_train)
ax.xaxis.grid(False)

sns.despine(trim=True, left=True)
plt.show()

print("Skewness: %f" % y_train.skew())
print("Kurtosis: %f" % y_train.kurt())

In [None]:
# Box plot of the training target with the mean marked.
fig, ax = plt.subplots(figsize=(2, 5))
ax.boxplot(y_train, showmeans=True)
ax.set_title('Price')
plt.show()

In [None]:
def objective(trial):
    """Optuna objective for XGBoost: hold-out median absolute error.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one hyperparameter configuration.

    Returns
    -------
    float
        Median absolute error of the fitted model on ``(X_eval, y_eval)``;
        lower is better.
    """
    # suggest_float(log=True) / suggest_float(step=...) are the supported
    # replacements for the deprecated suggest_loguniform /
    # suggest_discrete_uniform and sample from the same search spaces.
    params = {
        'n_estimators':      trial.suggest_int('n_estimators', 300, 4000),
        'learning_rate':     trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth':         trial.suggest_int('max_depth', 2, 6),
        'min_child_weight':  trial.suggest_float('min_child_weight', 0.001, 10, log=True),
        'subsample':         trial.suggest_float('subsample', 0.1, 1.0, step=0.025),
        'colsample_bytree':  trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.025),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 1.0, step=0.025),
        'gamma':             trial.suggest_float('gamma', 1e-10, 10, log=True),
        'reg_lambda':        trial.suggest_float('reg_lambda', 1e-10, 100, log=True),
        'reg_alpha':         trial.suggest_float('reg_alpha', 1e-10, 100, log=True),
    }

    model = xgb.XGBRegressor(**params, objective='reg:absoluteerror')
    model.fit(X_train, y_train)

    # Score on the held-out evaluation split. Scoring on the rows the model
    # was fitted on rewards memorisation and steers the search towards the
    # most overfit configuration.
    y_pred = model.predict(X_eval)
    return median_absolute_error(y_eval, y_pred)

In [None]:
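# Hyperparameter search for the XGBoost objective defined above. Left disabled;
# presumably the source of the hard-coded params_XGB_best below -- TODO confirm.
# study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=123))
# study.optimize(objective, n_trials=50) 
# study.best_params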

In [None]:
def objective(trial):
    """Optuna objective for LightGBM: hold-out median absolute error.

    Note that this re-uses the name ``objective`` and therefore replaces any
    earlier definition of it in the notebook namespace.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample one hyperparameter configuration.

    Returns
    -------
    float
        Median absolute error of the fitted model on ``(X_eval, y_eval)``;
        lower is better.
    """
    # suggest_float(log=True) / suggest_float(step=...) are the supported
    # replacements for the deprecated suggest_loguniform /
    # suggest_discrete_uniform and sample from the same search spaces.
    params = {
        'n_estimators':     trial.suggest_int('n_estimators', 300, 4000),
        'learning_rate':    trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'max_depth':        trial.suggest_int('max_depth', 2, 6),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10, log=True),
        'subsample':        trial.suggest_float('subsample', 0.1, 1.0, step=0.025),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0, step=0.025),
        'reg_lambda':       trial.suggest_float('reg_lambda', 1e-10, 100, log=True),
        'reg_alpha':        trial.suggest_float('reg_alpha', 1e-10, 100, log=True),
    }

    model = lgbm.LGBMRegressor(**params, objective='regression_l1')
    model.fit(X_train, y_train)

    # Score on the held-out evaluation split. Scoring on the rows the model
    # was fitted on rewards memorisation and steers the search towards the
    # most overfit configuration.
    y_pred = model.predict(X_eval)
    return median_absolute_error(y_eval, y_pred)

In [None]:
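# Hyperparameter search for the LightGBM objective defined above. Left disabled;
# presumably the source of the hard-coded params_LGBM_best below -- TODO confirm.
# study = optuna.create_study(sampler=optuna.samplers.RandomSampler(seed=123))
# study.optimize(objective, n_trials=50) 
# study.best_params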

In [None]:
# Fixed XGBoost hyperparameters used to train the final model.
params_XGB_best = dict(
    n_estimators=2565,
    learning_rate=0.09732287808197748,
    max_depth=6,
    min_child_weight=0.007737991528812258,
    subsample=0.1,
    colsample_bytree=1.0,
    colsample_bylevel=0.9,
    gamma=1.3007598689577686,
    reg_lambda=9.683728618282095e-06,
    reg_alpha=0.08617584406570492,
)

In [None]:
# Fixed LightGBM hyperparameters used to train the final model.
params_LGBM_best = dict(
    n_estimators=2309,
    learning_rate=0.04942932762026841,
    max_depth=6,
    min_child_weight=0.21030657642741743,
    subsample=0.30000000000000004,
    colsample_bytree=0.775,
    reg_lambda=0.21539132638034386,
    reg_alpha=2.5398950960373498e-08,
)

In [None]:
# Train the final XGBoost and LightGBM models with the fixed hyperparameters
# and an L1 (absolute-error) objective.
XGBRegressor_model = xgb.XGBRegressor(objective='reg:absoluteerror', **params_XGB_best)
XGBRegressor_model = XGBRegressor_model.fit(X_train, y_train)

LGBM_model = lgbm.LGBMRegressor(objective='regression_l1', **params_LGBM_best)
LGBM_model = LGBM_model.fit(X_train, y_train)

In [None]:
#Evaluate XGB and LGBM By CV、R2_score、MedAE. 
cv_XGBoost = cross_val_score(estimator = XGBRegressor_model, X = X_train, y = y_train, cv = 5)
y_pred_XGBoost_eval = XGBRegressor_model.predict(X_eval)
r2_score_XGBoost_eval = r2_score(y_eval, y_pred_XGBoost_eval)
MedAE_XGBoost = (np.sqrt(median_absolute_error(y_eval, y_pred_XGBoost_eval)))
print("CV: ", cv_XGBoost.mean())
print('R2_score (eval): ', r2_score_XGBoost_eval)
print("MedAE: ", MedAE_XGBoost)

cv_LGBM = cross_val_score(estimator = LGBM_model, X = X_train, y = y_train, cv = 5)
y_pred_LGBM_eval = LGBM_model.predict(X_eval)
r2_score_LGBM_eval = r2_score(y_eval, y_pred_LGBM_eval)
MedAE_LGBM = (np.sqrt(median_absolute_error(y_eval, y_pred_LGBM_eval)))
print("CV: ", cv_LGBM.mean())
print('R2_score (eval): ', r2_score_LGBM_eval)
print("MedAE: ", MedAE_LGBM)

In [None]:
#Convert to a dataframe to observe the evaluation results of the model.
# Collect the evaluation metrics of both models into one frame, one row per model.
result_columns = ['Model', 'MedAE', 'R2_Score(eval)', 'Cross-Validation']
models = [
    ('XGB', MedAE_XGBoost, r2_score_XGBoost_eval, cv_XGBoost.mean()),
    ('LGBM', MedAE_LGBM, r2_score_LGBM_eval, cv_LGBM.mean()),
]

predict = pd.DataFrame(models, columns=result_columns)
predict

In [None]:
# Visualization of MODEL's MedAE.
# Horizontal bar chart of the 'MedAE' column per model, largest first.
# Note: the sort reorders `predict` in place.
predict.sort_values(by='MedAE', ascending=False, inplace=True)

fig, axe = plt.subplots(nrows=1, ncols=1, figsize=(8, 1))
sns.barplot(data=predict, x='MedAE', y='Model', ax=axe)
axe.set_xlabel('MedAE(Val)', fontsize=15)
axe.set_ylabel('Model', fontsize=15)
axe.set_xlim(0, 200.0)
plt.show()

In [None]:
# Visualization of MODEL's R2_Score.
# Horizontal bar chart of the evaluation R2 per model, largest first.
# Note: the sort reorders `predict` in place.
predict.sort_values(by='R2_Score(eval)', ascending=False, inplace=True)

fig, axe = plt.subplots(nrows=1, ncols=1, figsize=(8, 1))
sns.barplot(data=predict, x='R2_Score(eval)', y='Model', ax=axe)
axe.set_xlabel('R2_Score(eval)', fontsize=15)
axe.set_ylabel('Model', fontsize=15)
axe.set_xlim(0, 1.0)
plt.show()

In [None]:
# Visualization of MODEL's Cross-Validation.
# Horizontal bar chart of the mean cross-validation score per model, largest first.
# Note: the sort reorders `predict` in place.
predict.sort_values(by='Cross-Validation', ascending=False, inplace=True)

fig, axe = plt.subplots(nrows=1, ncols=1, figsize=(8, 1))
sns.barplot(data=predict, x='Cross-Validation', y='Model', ax=axe)
axe.set_xlabel('Cross-Validation', fontsize=15)
axe.set_ylabel('Model', fontsize=15)
axe.set_xlim(0, 1.0)
plt.show()

## Predict Test

In [None]:
# Test-set features: the same three columns are dropped as for training.
import_test0 = test.reset_index(drop=True)
excluded_test_columns = ['Price', 'Neighborhood', 'YearBuilt']
import_test = import_test0.drop(columns=excluded_test_columns)
import_test

In [None]:
# Unscaled training features (X, before scaling and before the train/eval
# split), renumbered from 0.
import_train = X.reset_index(drop=True)
import_train

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale the test features with statistics learned from the training features
# only. The models were trained on data standardised by a scaler fitted on
# exactly these training rows, so fitting on `import_train` reproduces the
# same mean/std. Refitting on train+test combined (as before) gave the test
# rows a different scaling from the one the models saw, and leaked test
# statistics into preprocessing.
test_scaler = StandardScaler().fit(import_train)

# Derived from the data instead of a hard-coded 10000, so changing the split
# size cannot silently select the wrong rows.
Row_Number = len(import_test)

X_test_target1_df = test_scaler.transform(import_test)
# Positional integer column labels, matching the frame the models were fitted on.
test_pred_target0 = pd.DataFrame(X_test_target1_df)
assert len(test_pred_target0) == Row_Number
test_pred_target0

In [None]:
# Renumber the scaled test rows from 0 so they line up with `test` when the
# predictions are merged back on the index.
test_pred_target0 = test_pred_target0.reset_index(drop=True)
test_pred_target0

In [None]:
# Predict test-set prices with both models and wrap each prediction vector in
# a single-column frame.
predict_XGB = XGBRegressor_model.predict(test_pred_target0)
predict_LGBM = LGBM_model.predict(test_pred_target0)

predict_XGB_df = pd.DataFrame(predict_XGB, columns=['XGB_pred'])
predict_LGBM_df = pd.DataFrame(predict_LGBM, columns=['LGBM_pred'])

display(predict_XGB_df.head())
display(predict_LGBM_df.head())

In [None]:
# Attach both prediction columns to the test rows by index (inner join).
test_pred = (
    test
    .merge(predict_XGB_df, how='inner', left_index=True, right_index=True)
    .merge(predict_LGBM_df, how='inner', left_index=True, right_index=True)
)
test_pred

In [None]:
# Test-set metrics for both models, read from the merged `test_pred` frame.
# The R2 lines are now labelled "(test)"; they were printed as "(eval)"
# although they are computed on the test split.
# NOTE(review): MedAE_*_test holds np.sqrt(median_absolute_error(...)), i.e.
# the square root of MedAE rather than MedAE itself. It is kept unchanged here
# because the evaluation metrics, the comparison table and the plot limits
# (0-200) use the same convention and would need to change together.
r2_score_XGBoost_test = r2_score(test_pred['Price'], test_pred['XGB_pred'])
MedAE_XGBoost_test = np.sqrt(median_absolute_error(test_pred['Price'], test_pred['XGB_pred']))
print('R2_score (test): ', r2_score_XGBoost_test)
print("MedAE: ", MedAE_XGBoost_test)

r2_score_LGBM_test = r2_score(test_pred['Price'], test_pred['LGBM_pred'])
MedAE_LGBM_test = np.sqrt(median_absolute_error(test_pred['Price'], test_pred['LGBM_pred']))
print('R2_score (test): ', r2_score_LGBM_test)
print("MedAE: ", MedAE_LGBM_test)

In [None]:
# Side-by-side evaluation vs. test metrics for both models, best test R2 first.
compare_columns = ['Model', 'MedAE(eval)', 'MedAE(test)', 'R2_Score(eval)', 'R2_Score(test)']
models_Compare0 = [
    ('XGB', MedAE_XGBoost, MedAE_XGBoost_test, r2_score_XGBoost_eval, r2_score_XGBoost_test),
    ('LGBM', MedAE_LGBM, MedAE_LGBM_test, r2_score_LGBM_eval, r2_score_LGBM_test),
]

predict_Compare0 = pd.DataFrame(models_Compare0, columns=compare_columns)
predict_Compare0.sort_values(by='R2_Score(test)', ascending=False)

In [None]:
# Visualization of MODEL's MedAE.
# Horizontal bar chart of the 'MedAE(test)' column per model, largest first.
# Note: the sort reorders `predict_Compare0` in place.
predict_Compare0.sort_values(by='MedAE(test)', ascending=False, inplace=True)

fig, axe = plt.subplots(nrows=1, ncols=1, figsize=(8, 1))
sns.barplot(data=predict_Compare0, x='MedAE(test)', y='Model', ax=axe)
axe.set_xlabel('MedAE(test)', fontsize=15)
axe.set_ylabel('Model', fontsize=15)
axe.set_xlim(0, 200.0)
plt.show()

In [None]:
# Visualization of MODEL's R2_Score.
# Horizontal bar chart of the test-set R2 per model, largest first.
# Note: the sort reorders `predict_Compare0` in place.
predict_Compare0.sort_values(by='R2_Score(test)', ascending=False, inplace=True)

fig, axe = plt.subplots(nrows=1, ncols=1, figsize=(8, 1))
sns.barplot(data=predict_Compare0, x='R2_Score(test)', y='Model', ax=axe)
axe.set_xlabel('R2_Score(test)', fontsize=15)
axe.set_ylabel('Model', fontsize=15)
axe.set_xlim(0, 1.0)
plt.show()