In [1]:
import pandas as pd
import numpy as np 
import plotly.graph_objects as go 

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import ElasticNet, Lasso
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor, VotingRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [13]:
#Load the House Prices train/test CSVs.
#NOTE(review): these are absolute, machine-specific paths -- the notebook will not run elsewhere as-is.
train_csv_path = r'C:\Users\markc\OneDrive\Documents\Python\100Hours\Regression\House Prices\train.csv'
test_csv_path = r'C:\Users\markc\OneDrive\Documents\Python\100Hours\Regression\House Prices\test.csv'

train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

In [14]:
#ID not needed for predictions, but required for submission
train_ID = train_df['Id']
test_ID = test_df['Id']

#rebind instead of inplace=True so the frames are not mutated behind other references
train_df = train_df.drop(columns='Id')
test_df = test_df.drop(columns='Id')

#target vector (copied so later edits to train_df cannot alter it)
y = train_df['SalePrice'].copy()

#stack train on top of test so imputation/encoding see one consistent frame;
#drop must be the boolean True (the string 'True' only worked because it is truthy)
comb_df = pd.concat([train_df, test_df]).reset_index(drop=True)
comb_df = comb_df.drop(columns='SalePrice')

In [15]:
#feature type for visualisation

#categorical features
cat_features = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'YearBuilt',
    'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical',
    'Functional', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'PavedDrive',
    'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
]

#numerical features
num_features = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
]

#ordinal features (already have an order)
ord_features = [
    'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
    'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageCars',
]

#features which map to ordinal (string quality grades -> integer ranks)
map_ord_features = [
    'ExterQual', 'ExterCond', 'BsmtCond', 'BsmtQual', 'HeatingQC',
    'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC',
]
exp_ord_features = ['BsmtExposure']
bmst_ord_features = ['BsmtFinType1', 'BsmtFinType2']
fence_ord_features = ['Fence']

#string -> rank lookups; 'NA' (the placeholder filled in for missing entries) ranks lowest
ord_mapping = {
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1,
    'NA': 0,
}
exp_mapping = {
    'Gd': 4,
    'Av': 3,
    'Mn': 2,
    'No': 1,
    'NA': 0,
}
bmst_mapping = {
    'GLQ': 6,
    'ALQ': 5,
    'BLQ': 4,
    'Rec': 3,
    'LwQ': 2,
    'Unf': 1,
    'NA': 0,
}
fence_mapping = {
    'GdPrv': 4,
    'MnPrv': 3,
    'GdWo': 2,
    'MnWw': 1,
    'NA': 0,
}

#display the (rows, columns) dimensions of the training frame as the cell output
train_df.shape


(1460, 80)

In [17]:
#evaluate missing values

def graph_missing_values(df):
    """Build a bar chart of the percentage of missing values per column.

    Only columns with at least one missing value are plotted, sorted from
    most to least missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    plotly.graph_objects.Figure or None
        The bar chart, or None (after printing "No Missing Values") when no
        column contains a missing value. Callers must handle the None case.
    """
    #scale to 0-100 so the values match the '% Missing' axis label
    missing_pct = df.isnull().mean() * 100
    missing_pct = missing_pct[missing_pct > 0].sort_values(ascending=False)

    #decide before building a figure that would otherwise be thrown away
    if missing_pct.empty:
        print("No Missing Values")
        return None

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=missing_pct.index,
        y=missing_pct.values
    ))

    fig.update_layout(title='Missing Value Percentage by Feature', xaxis_title='Feature', yaxis_title='% Missing')

    return fig

fig = graph_missing_values(comb_df)

#graph_missing_values returns None when no column has missing values,
#so only call .show() when a figure was actually produced
if fig is not None:
    fig.show()


In [18]:
#decide how to impute missing values for each column

#for missing values which might differ by neighborhood -> impute with statistics computed on the training set only
#method = 'mean', 'mode', 'median'

#a missing value most likely means the feature does not exist for that house -> fill with the string 'NA'
#('Functional' used to be listed here as well as in impute_nhood_mode; it is kept only in
# impute_nhood_mode so that it is not turned into a separate 'NA' category first)
impute_na = [
    'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
    'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
    'MiscFeature',
]

#a missing value most likely means the feature does not exist for that house -> fill with 0
impute_zero = [
    'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
    'BsmtHalfBath', 'FullBath', 'HalfBath', 'Fireplaces', 'GarageCars',
    'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal',
]

#fill with the overall mode
impute_mode = ['MSSubClass', 'LotShape', 'Utilities', 'LotConfig', 'PavedDrive', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']

#fill with the mode of the house's neighborhood: the most common type might vary depending on where you live
impute_nhood_mode = [
    'MSZoning', 'Street', 'LandContour', 'LandSlope', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
    'YearBuilt', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC',
    'CentralAir', 'Electrical', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
    'TotRmsAbvGrd', 'Functional',
]

#fill with the median of the house's neighborhood
impute_nhood_median = ['LotFrontage', 'LotArea']


In [19]:
#Fill missing values in comb_df. Every statistic (mode / median) is computed on
#train_df only, then applied to the combined train + test frame.

#unique neighborhoods across train + test (no longer needed by the imputation below)
neighborhoods = list(comb_df['Neighborhood'].unique())


def _most_frequent(values):
    """Return the most frequent non-null value of a Series, or NaN if it has none."""
    counts = values.value_counts()  #NaN is excluded by default
    return counts.index[0] if len(counts) else np.nan


#impute the string NA for each col
for col in impute_na:
    comb_df[col] = comb_df[col].fillna('NA')

#impute zero for each col
for col in impute_zero:
    comb_df[col] = comb_df[col].fillna(0)

#impute the mode for each col (on training dataset only)
for col in impute_mode:
    #.iloc[0] takes the first mode; .item() would raise when several values tie
    mode_val = train_df[col].mode().iloc[0]
    comb_df[col] = comb_df[col].fillna(mode_val)

#neighborhood of every row, used to look up the per-neighborhood statistic
nhood_of_row = comb_df['Neighborhood']

#impute the neighborhood mode from training set for each col
for col in impute_nhood_mode:
    nhood_mode = train_df.groupby('Neighborhood')[col].agg(_most_frequent)
    comb_df[col] = comb_df[col].fillna(nhood_of_row.map(nhood_mode))
    #a neighborhood with no usable training value maps to NaN -> fall back to the train-wide mode
    if comb_df[col].isna().any():
        comb_df[col] = comb_df[col].fillna(_most_frequent(train_df[col]))

#impute the neighborhood median from training set for each col
for col in impute_nhood_median:
    nhood_median = train_df.groupby('Neighborhood')[col].median()
    comb_df[col] = comb_df[col].fillna(nhood_of_row.map(nhood_median))
    #same fallback as above, using the train-wide median
    if comb_df[col].isna().any():
        comb_df[col] = comb_df[col].fillna(train_df[col].median())


In [20]:
#map features which are ordinal by string to ordinal by numerical
#each (columns, lookup) pair replaces the string grades in those columns by their integer rank;
#values missing from the lookup become NaN (pandas Series.map behaviour)

ordinal_encodings = [
    (map_ord_features, ord_mapping),
    (exp_ord_features, exp_mapping),
    (bmst_ord_features, bmst_mapping),
    (fence_ord_features, fence_mapping),
]

for ordinal_cols, rank_lookup in ordinal_encodings:
    for col in ordinal_cols:
        comb_df[col] = comb_df[col].map(rank_lookup)

#for col in cat_features:
#    Lbl = LabelEncoder()
#    Lbl.fit(train_df[col].values)
#    train_df[col] = Lbl.transform(train_df[col].values)
#    Lbl.fit(test_df[col].values)
#    test_df[col] = Lbl.transform(test_df[col].values)


In [21]:
#One Hot Encode categorical features

#cast to str first so numeric-coded categoricals (in cat_features) are dummy-encoded too
comb_df[cat_features] = comb_df[cat_features].astype(str)

X_comb = pd.get_dummies(comb_df)

#comb_df was built as train rows followed by test rows, so split back by position
ntrain = len(train_df)

X, X_test = X_comb.iloc[:ntrain], X_comb.iloc[ntrain:]


In [22]:
#function to return cross validation score
def rmse_score(model, X, y, cv=5):
    """Cross-validate `model` on (X, y) and return one RMSE per fold.

    The folds are scored with 'neg_mean_squared_error'; the sign is flipped
    and the square root taken to obtain the RMSE of each fold.
    """
    neg_mse_per_fold = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv)
    return np.sqrt(-neg_mse_per_fold)


In [25]:
#create ML pipelines for each model
#Lasso regression (features standardised first)
#Lasso Scores : mean 41350.3159, std 7871.0152

lasso_steps = [
    ('std_scaler', StandardScaler()),
    ('lasso_reg', Lasso(alpha=0.1, random_state=42)),
]
lasso_reg = Pipeline(steps=lasso_steps)

lasso_scores = rmse_score(lasso_reg, X, y)
print("Lasso Scores : mean {:.4f}, std {:.4f}".format(np.mean(lasso_scores), np.std(lasso_scores)))

Lasso Scores : mean 41350.3159, std 7871.0152


In [27]:
#Elastic Net regression (features standardised first)
#Elastic Net Scores : mean 36320.1158, std 7636.1439

ENet_steps = [
    ('std_scaler', StandardScaler()),
    ('ENet_reg', ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)),
]
ENet_reg = Pipeline(steps=ENet_steps)

ENet_scores = rmse_score(ENet_reg, X, y)
print("Elastic Net Scores : mean {:.4f}, std {:.4f}".format(np.mean(ENet_scores), np.std(ENet_scores)))

Elastic Net Scores : mean 36320.1158, std 7636.1439


In [46]:
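#Linear SVR regression (disabled below; note that 'hinge' is not a valid LinearSVR loss -- use 'epsilon_insensitive' or 'squared_epsilon_insensitive')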

#lin_SVR_reg = Pipeline([
#    ('std_scaler', StandardScaler()),
#    ('SVR_reg', LinearSVR(C=1, loss='hinge'))
#])

#lin_SVR_scores = rmse_score(lin_SVR_reg, X, y)
#print("Linear SVR Scores : mean {:.4f}, std {:.4f}".format(lin_SVR_scores.mean(), lin_SVR_scores.std()))

In [40]:
#Non-linear SVR regression

#nonlin_SVR_reg = Pipeline([
#    ('poly_features', PolynomialFeatures(degree=3)),
#    ('std_scaler', StandardScaler()),
#    ('SVR_reg', LinearSVR(C=1, loss='hinge'))
#])

#nonlin_SVR_scores = rmse_score(nonlin_SVR_reg, X, y)
#print("Non-Linear SVR Scores : mean {:.4f}, std {:.4f}".format(nonlin_SVR_scores.mean(), nonlin_SVR_scores.std()))

array([32459.45058606, 42703.87187205, 45455.0990956 , 32914.13000612,
       53219.02788428])

In [32]:
#Random Forest regression
#Random Forest Scores : mean 35499.2621, std 5033.3159

rf_params = {
    'n_estimators': 500,
    'max_depth': 5,
    'min_samples_leaf': 25,
    'random_state': 42,
}
rf_reg = RandomForestRegressor(**rf_params)

rf_scores = rmse_score(rf_reg, X, y)
print("Random Forest Scores : mean {:.4f}, std {:.4f}".format(np.mean(rf_scores), np.std(rf_scores)))

Random Forest Scores : mean 35499.2621, std 5033.3159


In [29]:
#Gradient Boosting Regression; loss='huber' makes it robust to outliers
#GBoost Scores : mean 28809.2752, std 5330.3700

GBoost_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 5,
    'min_samples_leaf': 25,
    'loss': 'huber',
    'random_state': 42,
}
GBoost_reg = GradientBoostingRegressor(**GBoost_params)

GBoost_scores = rmse_score(GBoost_reg, X, y)
print("GBoost Scores : mean {:.4f}, std {:.4f}".format(np.mean(GBoost_scores), np.std(GBoost_scores)))

GBoost Scores : mean 28809.2752, std 5330.3700


In [31]:
#XGBoost
#XGB Scores : mean 27965.0031, std 3293.8520

XGB_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 5,
    'random_state': 42,
}
XGB_reg = xgb.XGBRegressor(**XGB_params)

XGB_scores = rmse_score(XGB_reg, X, y)
print("XGB Scores : mean {:.4f}, std {:.4f}".format(np.mean(XGB_scores), np.std(XGB_scores)))

XGB Scores : mean 27965.0031, std 3293.8520


In [30]:
#Light GBM
#Light GBM Scores : mean 27267.6667, std 4409.2111

lgb_params = {
    'objective': 'regression',
    'n_estimators': 500,
    'num_leaves': 5,
    'learning_rate': 0.05,
    'min_data_in_leaf': 25,
    'random_state': 42,
}
lgb_reg = lgb.LGBMRegressor(**lgb_params)

lgb_scores = rmse_score(lgb_reg, X, y)
print("Light GBM Scores : mean {:.4f}, std {:.4f}".format(np.mean(lgb_scores), np.std(lgb_scores)))

Light GBM Scores : mean 27267.6667, std 4409.2111


In [56]:
#Ensemble Method: average the predictions of the random forest, gradient boosting and LightGBM models
#Voting Scores : mean 28856.6040, std 4827.6064

voting_members = [
    ('rf_reg', rf_reg),
    ('GBoost_reg', GBoost_reg),
    ('LGBM_reg', lgb_reg),
]
voting_reg = VotingRegressor(estimators=voting_members)

voting_scores = rmse_score(voting_reg, X, y)
print("Voting Scores : mean {:.4f}, std {:.4f}".format(np.mean(voting_scores), np.std(voting_scores)))

Voting Scores : mean 28856.6040, std 4827.6064


In [70]:
#XGBoost gave the lowest std of the base models (its mean was second only to Light GBM).
#Let's choose this model and use GridSearch to see if we can improve it.
#The grid below is 3 * 4 * 5 * 5 = 300 combinations, i.e. 900 fits with cv=3:
#[Parallel(n_jobs=1)]: Done 900 out of 900 | elapsed: 122.0min finished

XGB_reg = xgb.XGBRegressor(learning_rate=0.05, random_state=42)

xgb_param_grid = [
    {
        'n_estimators': [250, 500, 1000],
        'max_depth': [3, 5, 7, 10],
        'reg_alpha': [0, 0.25, 0.5, 0.75, 1],
        'reg_lambda': [0, 0.25, 0.5, 0.75, 1],
    }
]

#search left commented out because of its run time; uncomment to re-run
#grid_search = GridSearchCV(XGB_reg, xgb_param_grid, cv=3, scoring='neg_mean_squared_error', return_train_score=True, verbose=3)

#grid_search.fit(X, y)

#XGB_reg_best = grid_search.best_estimator_


=-976788753.689), total= 5.8min
[CV] max_depth=10, n_estimators=500, reg_alpha=0.75, reg_lambda=0.25 .
[CV]  max_depth=10, n_estimators=500, reg_alpha=0.75, reg_lambda=0.25, score=(train=-7041.057, test=-912506714.785), total=  12.9s
[CV] max_depth=10, n_estimators=500, reg_alpha=0.75, reg_lambda=0.5 ..
[CV]  max_depth=10, n_estimators=500, reg_alpha=0.75, reg_lambda=0.5, score=(train=-16707.606, test=-636031638.071), total=  12.0s
[CV] max_depth=10, n_estimators=500, reg_alpha=0.75, reg_lambda=0.5 ..
[CV]  max_depth=10, n_estimators=500, reg_alpha=0.75, reg_lambda=0.5, score=(train=-16418.267, test=-1060212014.042), total=  10.6s
[CV] max_depth=10, n_estimators=500, reg_alpha=0.75, reg_lambda=0.5 ..
[CV]  max_depth=10, n_estimators=500, reg_alpha=0.75, reg_lambda=0.5, score=(train=-23189.709, test=-849774735.448), total=   9.8s
[CV] max_depth=10, n_estimators=500, reg_alpha=0.75, reg_lambda=0.75 .
[CV]  max_depth=10, n_estimators=500, reg_alpha=0.75, reg_lambda=0.75, score=(train=-344

GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.05, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    num_parallel_tree=None, random_state=42,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_paramete

In [37]:
#Best hyper-parameters picked with the grid search, hard-coded here to save running the search again
XGB_best_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 3,
    'reg_alpha': 0,
    'reg_lambda': 0.25,
    'random_state': 42,
}
XGB_reg_best = xgb.XGBRegressor(**XGB_best_params)

#train on the full training set, then predict the test set
XGB_reg_best.fit(X, y)
final_predictions = XGB_reg_best.predict(X_test)

In [38]:
#pair every test Id with its predicted SalePrice
submission_columns = {
    'Id': test_ID,
    'SalePrice': final_predictions,
}
submission_df = pd.DataFrame(submission_columns)

In [40]:
#write the submission file without the index column
#NOTE(review): absolute, machine-specific output path
submission_csv_path = r"C:\Users\markc\OneDrive\Documents\Python\100Hours\Regression\House Prices\Submission.csv"
submission_df.to_csv(submission_csv_path, index=False)

In [39]:
#display the submission frame (Id, SalePrice) as the cell output
submission_df

Unnamed: 0,Id,SalePrice
0,1461,125328.109375
1,1462,161754.453125
2,1463,176564.250000
3,1464,190734.437500
4,1465,195412.546875
...,...,...
1454,2915,82865.367188
1455,2916,81452.359375
1456,2917,154342.500000
1457,2918,120713.257812
