## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
from xgboost import XGBRegressor
from statsmodels.stats.outliers_influence import variance_inflation_factor  
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn import neighbors
from matplotlib import pyplot as plt
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the modelling libraries.
warnings.filterwarnings('ignore')
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Default matplotlib figure size as (width, height).
plt.rcParams['figure.figsize'] = (9, 6)
# Apply seaborn's "whitegrid" style to subsequent plots.
sns.set(style = "whitegrid")
# Let pandas display up to 500 columns and 500 rows before truncating.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

## Pre-Processing

In [None]:
# Load the train and test frames from disk; the first CSV column becomes the index.
train_undum_df, test_undum_df = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./Data/train_undum_df.csv', './Data/test_undum_df.csv')
]

In [None]:
# Predictors are every column except SalePrice; the target is the natural log of SalePrice.
X = train_undum_df.drop(columns=["SalePrice"])
y = np.log(train_undum_df['SalePrice'])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

## Random Forest

In [None]:
# Random forest with library-default hyperparameters, fitted on the training split.
rf = RandomForestRegressor()
rf.fit(X=X_train, y=y_train)

In [None]:
# Baseline values: predictions of the default random forest on both splits.
train_baseline_rf = rf.predict(X_train)
test_baseline_rf = rf.predict(X_test)

# RMSE is computed on the log-price scale, since y is log(SalePrice).
RMSE_train = np.sqrt(mean_squared_error(y_train, train_baseline_rf))
RMSE_test = np.sqrt(mean_squared_error(y_test, test_baseline_rf))

print("Baseline Values: ")
print("R^2 for train data is: %.3f" % rf.score(X_train, y_train))
print("R^2 for test data is: %.3f" % rf.score(X_test, y_test))
print("-" * 50)
print("RMSE for train data is: %.3f" % RMSE_train)
print("RMSE for test data is: %.3f" % RMSE_test)

In [None]:
# Hyperparameter grid for the random forest grid search.
#
# Earlier grids that were tried, kept for reference:
# grid_para_forest = {'n_estimators': [150,300,450,600,750,900],
#                    'max_depth': [40,50,60,70],
#                    'max_features' : [15,20,25]}

# grid_para_forest = {'n_estimators': [600,700,800,900,1000,1100],
#                    'max_depth': [40,50,60,70],
#                    'max_features' : [5,10,15,20]}

# 'criterion' is intentionally not part of the grid: the former ['mse'] entry is
# rejected by scikit-learn >= 1.2 (the option was renamed 'squared_error'), and
# squared error is already the regressor's default split criterion, so leaving
# it out searches the same models on every scikit-learn version.
grid_para_forest = {'min_samples_split': [2, 3],
                    'max_depth': [20, 25, 30],
                    'n_estimators': [200, 300, 400, 500],
                    'min_samples_leaf': [1, 2],
                    'max_leaf_nodes': [None],
                    'max_samples': [None],
                    'bootstrap': [False],
                    'max_features': ['sqrt', 8, 15, 20],
                    'ccp_alpha': [0.00, 0.02],
                    'random_state': [42]}

In [None]:
# Exhaustive 5-fold search over grid_para_forest, ranked by R^2, using 5 parallel jobs.
grid_search_forest = GridSearchCV(
    estimator=rf,
    param_grid=grid_para_forest,
    scoring='r2',
    cv=5,
    n_jobs=5,
    verbose=1,
    return_train_score=True,
)
grid_search_forest.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print("RF Grid Search Best params: ", grid_search_forest.best_params_)
print("RF Grid Search Best score: ", grid_search_forest.best_score_)
print("RF Grid Search Best estimators: ", grid_search_forest.best_estimator_)

In [None]:
# Evaluate the best forest found by the grid search on both splits.
best_RF = grid_search_forest.best_estimator_

y_train_pred_rf = best_RF.predict(X_train)
y_test_pred_rf = best_RF.predict(X_test)

RMSE_train = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
RMSE_test = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))

# GridSearchCV.score applies the search's scoring ('r2') with the best estimator.
print("R^2 for train data is: %.3f" % grid_search_forest.score(X_train, y_train))
print("R^2 for test data is: %.3f" % grid_search_forest.score(X_test, y_test))
print("-" * 50)
print('RMSE for train data is: %.3f' % RMSE_train)
print('RMSE for test data is: %.3f' % RMSE_test)

In [None]:
# Best parameters with lowered max_depth.
# The split criterion is left at its default (squared error): passing the old
# name 'mse' raises on scikit-learn >= 1.2, while the default is the same
# criterion on every version.
rf2 = RandomForestRegressor(bootstrap=False,
                            ccp_alpha=0.0,
                            max_depth=6,
                            max_features=8,
                            max_leaf_nodes=None,
                            max_samples=None,
                            min_samples_leaf=1,
                            min_samples_split=2,
                            n_estimators=500,
                            random_state=42)

rf2.fit(X_train, y_train)

In [None]:
# Final values for rf2, the hand-adjusted forest (not the untuned baseline).
# The header matches the one printed for the hand-adjusted gradient boosting model.
train_baseline_rf2 = rf2.predict(X_train)
RMSE_train2 = np.sqrt(mean_squared_error(y_train, train_baseline_rf2))

test_baseline_rf2 = rf2.predict(X_test)
RMSE_test2 = np.sqrt(mean_squared_error(y_test, test_baseline_rf2))

print("Final Values: ")
print("R^2 for train data is: %.3f" % (rf2.score(X_train, y_train)))
print("R^2 for test data is: %.3f" % (rf2.score(X_test, y_test)))
print("-" * 50)
print("RMSE for train data is: %.3f" % RMSE_train2)
print("RMSE for test data is: %.3f" % RMSE_test2)

In [None]:
# Pair each feature with rf2's importance score and rank them, highest first.
sorted_importance = sorted(
    zip(X.columns, rf2.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
a, b = zip(*sorted_importance)
df = pd.DataFrame({'feature_name': a, 'importance_score': b})

plt.figure(figsize=(10, 10))
sns.barplot(data=df, x='importance_score', y='feature_name', orient='h')
plt.title('Feature Importance Using Random Forest')
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
# Inverted y-limits restrict the view to the first ten bars of the ranking.
plt.ylim(9.5, -0.5)


## Gradient Boosting

In [None]:
# Gradient boosting with library-default hyperparameters, fitted on the training split.
gbm = GradientBoostingRegressor()
gbm.fit(X=X_train, y=y_train)

In [None]:
# Baseline values for the default gradient boosting model.
# This cell previously called `rf` throughout, so it re-reported the random
# forest numbers; it now evaluates `gbm`, the model fitted in this section.
train_baseline_gbm = gbm.predict(X_train)
RMSE_train = np.sqrt(mean_squared_error(y_train, train_baseline_gbm))

test_baseline_gbm = gbm.predict(X_test)
RMSE_test = np.sqrt(mean_squared_error(y_test, test_baseline_gbm))

print("Baseline Values: ")
print("R^2 for train data is: %.3f" % (gbm.score(X_train, y_train)))
print("R^2 for test data is: %.3f" % (gbm.score(X_test, y_test)))
print("-" * 50)
print("RMSE for train data is: %.3f" % RMSE_train)
print("RMSE for test data is: %.3f" % RMSE_test)

In [None]:
# Search space for gradient boosting: tree count, learning rate, depth 1-7 and
# the number of features considered per split.
grid_para_gb = dict(
    n_estimators=[200, 300, 400, 500, 600, 700],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    max_depth=range(1, 8),
    max_features=[7, 8, 9],
)


In [None]:
# 10-fold grid search over grid_para_gb on all available cores; with no
# `scoring` given, candidates are ranked by the estimator's own score method.
grid_search_gb = GridSearchCV(gbm, grid_para_gb, cv=10, n_jobs=-1, verbose=1)
grid_search_gb.fit(X_train, y_train)

# Labels say "GB": this search tunes gradient boosting, not the random forest.
print("GB Grid Search Best params: ", grid_search_gb.best_params_)
print("GB Grid Search Best score: ", grid_search_gb.best_score_)
print("GB Grid Search Best estimators: ", grid_search_gb.best_estimator_)

In [None]:
# Evaluate the best gradient boosting model found by the grid search.
best_GB = grid_search_gb.best_estimator_

y_train_pred_gb = best_GB.predict(X_train)
y_test_pred_gb = best_GB.predict(X_test)

RMSE_train = np.sqrt(mean_squared_error(y_train, y_train_pred_gb))
RMSE_test = np.sqrt(mean_squared_error(y_test, y_test_pred_gb))

print("R^2 for train data is: %.3f" % grid_search_gb.score(X_train, y_train))
print("R^2 for test data is: %.3f" % grid_search_gb.score(X_test, y_test))
print("-" * 50)
print('RMSE for train data is: %.3f' % RMSE_train)
print('RMSE for test data is: %.3f' % RMSE_test)

In [None]:
# Pair each feature with best_GB's importance score and rank them, highest first.
sorted_importance = sorted(
    zip(X.columns, best_GB.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
a, b = zip(*sorted_importance)
df = pd.DataFrame({'feature_name': a, 'importance_score': b})

plt.figure(figsize=(10, 10))
sns.barplot(data=df, x='importance_score', y='feature_name', orient='h')
plt.title('Feature Importance Using Gradient Boosting')
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')

In [None]:
# Gradient boosting refit with a manually lowered max depth.
gbm2 = GradientBoostingRegressor(
    n_estimators=700,
    learning_rate=0.01,
    max_depth=2,
    max_features=9,
)

gbm2.fit(X=X_train, y=y_train)

In [None]:
# Score the hand-adjusted gradient boosting model on both splits.
train_baseline_gbm2 = gbm2.predict(X_train)
test_baseline_gbm2 = gbm2.predict(X_test)

RMSE_train2 = np.sqrt(mean_squared_error(y_train, train_baseline_gbm2))
RMSE_test2 = np.sqrt(mean_squared_error(y_test, test_baseline_gbm2))

print("Final Values: ")
print("R^2 for train data is: %.3f" % gbm2.score(X_train, y_train))
print("R^2 for test data is: %.3f" % gbm2.score(X_test, y_test))
print("-" * 50)
print("RMSE for train data is: %.3f" % RMSE_train2)
print("RMSE for test data is: %.3f" % RMSE_test2)

## XGBoost

In [None]:
# XGBoost regressor with library-default hyperparameters.
xgbr = XGBRegressor()

In [None]:
# Fit the default model on the training split.
xgbr.fit(X_train, y_train)

In [None]:
# Baseline values: predictions of the default XGBoost model on both splits.
train_baseline_xgb = xgbr.predict(X_train)
test_baseline_xgb = xgbr.predict(X_test)

RMSE_train = np.sqrt(mean_squared_error(y_train, train_baseline_xgb))
RMSE_test = np.sqrt(mean_squared_error(y_test, test_baseline_xgb))

print("Baseline Values: ")
print("R^2 for train data is: %.3f" % xgbr.score(X_train, y_train))
print("R^2 for test data is: %.3f" % xgbr.score(X_test, y_test))
print("-" * 50)
print("RMSE for train data is: %.3f" % RMSE_train)
print("RMSE for test data is: %.3f" % RMSE_test)

In [None]:
# Hyperparameter tuning using grid search: a fresh regressor plus its search
# space (depth 1-3, tree count 10 to 490 in steps of 20).
xgbr = XGBRegressor()
grid_param_xgbr = [
    dict(
        max_depth=range(1, 4),
        n_estimators=range(10, 500, 20),
    )
]

In [None]:
# 10-fold grid search over grid_param_xgbr on all available cores; with no
# `scoring` given, candidates are ranked by the estimator's own score method.
grid_search_xgbr = GridSearchCV(xgbr, grid_param_xgbr, cv=10, n_jobs=-1, verbose=1)
grid_search_xgbr.fit(X_train, y_train)

# Labels say "XGB": this search tunes XGBoost, not the random forest.
print("XGB Grid Search Best params: ", grid_search_xgbr.best_params_)
print("XGB Grid Search Best score: ", grid_search_xgbr.best_score_)
print("XGB Grid Search Best estimators: ", grid_search_xgbr.best_estimator_)

In [None]:
# Results with the best XGBoost estimator found by the grid search.
best_XGBR = grid_search_xgbr.best_estimator_

y_train_pred_xgbr = best_XGBR.predict(X_train)
y_test_pred_xgbr = best_XGBR.predict(X_test)

RMSE_train = np.sqrt(mean_squared_error(y_train, y_train_pred_xgbr))
RMSE_test = np.sqrt(mean_squared_error(y_test, y_test_pred_xgbr))

print("R^2 for train data is: %.3f" % grid_search_xgbr.score(X_train, y_train))
print("R^2 for test data is: %.3f" % grid_search_xgbr.score(X_test, y_test))
print("-" * 50)
print('RMSE for train data is: %.3f' % RMSE_train)
print('RMSE for test data is: %.3f' % RMSE_test)

In [None]:
# Split features into important / unimportant columns.
# Importances are rescaled so the strongest feature is 100; anything below 4
# counts as unimportant. The masks index X.columns (X is never modified) rather
# than X_train.columns, so this cell still works when re-run after the
# unimportant columns have been dropped from X_train.
feature_importance = 100.0 * (best_XGBR.feature_importances_ / best_XGBR.feature_importances_.max())
imp_col = X.columns[feature_importance >= 4]
unimp_col = X.columns[feature_importance < 4]
print(imp_col)

# Feature importance graph, ranked highest first.
sorted_importance = sorted(zip(X.columns, best_XGBR.feature_importances_), key=lambda t: t[1], reverse=True)
a, b = zip(*sorted_importance)
plt.figure(figsize=(10, 10))
df = pd.DataFrame({'feature_name': a, 'importance_score': b})
sns.barplot(data=df, x='importance_score', y='feature_name', orient='h')
# Title previously said "Random Forest"; these importances come from XGBoost.
plt.title('Feature Importance Using XGBoost')
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
# Inverted y-limits restrict the view to the first ten bars of the ranking.
plt.ylim(9.5, -0.5)

### Repeat XGBoost with reduced features

In [None]:
# Drop unimportant features.
# Rebind the names instead of dropping in place: X_train / X_test come out of
# train_test_split, and mutating such derived frames in place is what triggers
# pandas' SettingWithCopyWarning.
X_train = X_train.drop(columns=unimp_col)
X_test = X_test.drop(columns=unimp_col)

In [None]:
# Hyperparameter tuning using grid search on the reduced feature set:
# depth 1-5, tree count 1000 to 9500 in steps of 500, three learning rates.
xgbr_final = XGBRegressor()
grid_param2 = [
    dict(
        max_depth=range(1, 6),
        n_estimators=range(1000, 10000, 500),
        learning_rate=[.001, .01, .1],
    )
]

In [None]:
# 10-fold grid search over grid_param2 on all available cores; with no
# `scoring` given, candidates are ranked by the estimator's own score method.
grid_search_xgbr2 = GridSearchCV(xgbr_final, grid_param2, cv=10, n_jobs=-1, verbose=1)
grid_search_xgbr2.fit(X_train, y_train)

# Labels say "XGB": this search tunes XGBoost, not the random forest.
print("XGB Grid Search Best params: ", grid_search_xgbr2.best_params_)
print("XGB Grid Search Best score: ", grid_search_xgbr2.best_score_)
print("XGB Grid Search Best estimators: ", grid_search_xgbr2.best_estimator_)

In [None]:
# Record results using the new best estimator (reduced feature set).
best_XGBR2 = grid_search_xgbr2.best_estimator_

y_train_pred_xgbr2 = best_XGBR2.predict(X_train)
RMSE_train2 = np.sqrt(mean_squared_error(y_train, y_train_pred_xgbr2))

y_test_pred_xgbr2 = best_XGBR2.predict(X_test)
RMSE_test2 = np.sqrt(mean_squared_error(y_test, y_test_pred_xgbr2))


print("R^2 for train data is: %.3f" % (grid_search_xgbr2.score(X_train, y_train)))
print("R^2 for test data is: %.3f" % (grid_search_xgbr2.score(X_test, y_test)))
print("-" * 50)
# Print the RMSEs computed in this cell; the old code printed RMSE_train /
# RMSE_test, which still held the values from the full-feature model.
print('RMSE for train data is: %.3f' % RMSE_train2)
print('RMSE for test data is: %.3f' % RMSE_test2)