### Import necessary packages

In [1]:
import numpy as np
from math import sqrt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

%matplotlib inline

### Load data 

In [2]:
# Set data path and load training data
file_path = "../data/input"

train_full = pd.read_csv(f'{file_path}/train.csv')

### Feature Engineering 

In [3]:
# Get a count of missing values for each variable
for var in train_full.columns:
    total_na = train_full[var].isnull().sum()
    print(f'{var}: {total_na/len(train_full)}')

Id: 0.0
MSSubClass: 0.0
MSZoning: 0.0
LotFrontage: 0.1773972602739726
LotArea: 0.0
Street: 0.0
Alley: 0.9376712328767123
LotShape: 0.0
LandContour: 0.0
Utilities: 0.0
LotConfig: 0.0
LandSlope: 0.0
Neighborhood: 0.0
Condition1: 0.0
Condition2: 0.0
BldgType: 0.0
HouseStyle: 0.0
OverallQual: 0.0
OverallCond: 0.0
YearBuilt: 0.0
YearRemodAdd: 0.0
RoofStyle: 0.0
RoofMatl: 0.0
Exterior1st: 0.0
Exterior2nd: 0.0
MasVnrType: 0.005479452054794521
MasVnrArea: 0.005479452054794521
ExterQual: 0.0
ExterCond: 0.0
Foundation: 0.0
BsmtQual: 0.025342465753424658
BsmtCond: 0.025342465753424658
BsmtExposure: 0.026027397260273973
BsmtFinType1: 0.025342465753424658
BsmtFinSF1: 0.0
BsmtFinType2: 0.026027397260273973
BsmtFinSF2: 0.0
BsmtUnfSF: 0.0
TotalBsmtSF: 0.0
Heating: 0.0
HeatingQC: 0.0
CentralAir: 0.0
Electrical: 0.0006849315068493151
1stFlrSF: 0.0
2ndFlrSF: 0.0
LowQualFinSF: 0.0
GrLivArea: 0.0
BsmtFullBath: 0.0
BsmtHalfBath: 0.0
FullBath: 0.0
HalfBath: 0.0
BedroomAbvGr: 0.0
KitchenAbvGr: 0.0
KitchenQual

In [None]:
# LotFrontage can be imputed using a simple mean strategy
# Alley NA values correspond to no alley. Should convert "Na" to "No" alley
# MasVnrType and MasVnArea missing values can be dropped 
# For all basement variables, NA values correspond to there being no basement. Should convert "Na" to "No" basement
# Electrical missing values can be dropped
# FireplaceQu missing means no fireplace in the house. Should convert "Na" to "No" fireplace. 
# For all garage variables, Na values correspond to there being no garage. Should convert "Na" to "No" garage. 
# For pool, fence and miscfeature variables, NA values correspond to there being no feature. Should convert "Na" to "No" feature. 

In [4]:
# Converting "Nas" to "No" for variables where it makes sense 

# Create a list for categorical variables to convert
vars_to_convert = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 
                   'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
                   'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']

# Loop through the list and convert variable value 
for var in vars_to_convert:
    train_full[var] = train_full[var].fillna('feature_missing')

In [5]:
# Drop rows that have a small proportion of missing values for categorical variables that we cannot impute

vars_to_drop = ['MasVnrType', 'MasVnrArea', 'Electrical']
train_full = train_full.dropna(subset = vars_to_drop)

In [6]:
# Drop 'ID' as that is not useful 
train_full = train_full.drop('Id', axis=1)

In [7]:
# Make sure all values of object columns are recast as strings
object_cols = [col for col in train_full.columns if train_full[col].dtype == 'object']

for col in object_cols:
    train_full[col] = train_full[col].astype(str)

### Model Building Data Preparation

In [8]:
# Create X and y objects
X = train_full.drop('SalePrice', axis=1)
y = train_full['SalePrice']

In [9]:
# Create objects for numerical and categorical columns
object_cols = [col for col in X.columns if X[col].dtype == 'object']
numerical_cols = [col for col in X.columns if X[col].dtype in ['int64', 'float64']]

In [10]:
# Split into training and validation set
X_train, X_valid, y_train, y_valid = train_test_split(X,y, random_state=100)

In [11]:
# Impute LotFrontage variable in train and validation set
imp = SimpleImputer(strategy='mean')
X_train['LotFrontage'] = imp.fit_transform(X_train[['LotFrontage']])
X_valid['LotFrontage'] = imp.transform(X_valid[['LotFrontage']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [12]:
# For categorical variables in train and validation, convert to dummies

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_train.columns = OH_encoder.get_feature_names()
OH_cols_valid.index = X_valid.index
OH_cols_valid.columns = OH_encoder.get_feature_names()

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

In [13]:
# For numerical variables in train and test, standardize the variables
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_valid[numerical_cols] = scaler.transform(X_valid[numerical_cols])

### Model Building

In [14]:
# Create function to calculate root mean squared error
def root_mse(y_test,y_preds):
    error = mean_squared_error(y_test,y_preds)
    return sqrt(error)

In [15]:
# Simple linear regression with no regularization
lm = LinearRegression()
lm_model = lm.fit(X_train,y_train)
lm_preds = lm_model.predict(X_valid)
print(f'Root Mean Squared Error: {root_mse(y_valid, lm_preds)}')

Root Mean Squared Error: 20197107555668.18


In [16]:
# Ridge regression with cross-validation
alphas = [0.001,0.01,0.1,1.0,10.0]
ridge = RidgeCV(alphas = alphas, scoring = 'neg_mean_squared_error', cv =5)
ridge_model = ridge.fit(X_train,y_train)
ridge_preds = ridge_model.predict(X_valid)
print(f'Root Mean Squared Error: {root_mse(y_valid, ridge_preds)}')

Root Mean Squared Error: 26555.665762069588


In [17]:
# Lasso regression with cross-validation
lasso = LassoCV(alphas=alphas, cv=10, max_iter=10000)
lasso_model = lasso.fit(X_train,y_train)
lasso_preds = lasso_model.predict(X_valid)
print(f'Root Mean Squared Error: {root_mse(y_valid, lasso_preds)}')

  tol, rng, random, positive)


Root Mean Squared Error: 34632.26494565253


In [18]:
# Polynomial regression with cross-validation
# Only run on numerical columns

X_train_num = X_train[numerical_cols]
X_valid_num = X_valid[numerical_cols]

# initialize empty dict
scores = {}

# Cross val with polynomial with degree 2 through 4
for d in range(2,5):
    poly = PolynomialFeatures(degree=d)
    X_train_poly = poly.fit_transform(X_train_num)
    X_valid_poly = poly.transform(X_valid_num)
    lm = LinearRegression()
    lm_model = lm.fit(X_train_poly,y_train)
    lm_preds = lm_model.predict(X_valid_poly)
    score = root_mse(y_valid,lm_preds)
    scores[d] = score
    
minval = min(scores.values())
mindegree = [k for k,v in scores.items() if v == minval]
    
print(scores)
print(f'Optimal polynomial degree: {mindegree}')

{2: 2.4146377870429336e+16, 3: 50427.33190209553, 4: 46756.11647712288}
Optimal polynomial degree: [4]


In [20]:
# Random forest regression with cross-validation
rf_params = {'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)],
             'max_depth': [10,20,40,80,100] }

rf = RandomForestRegressor(random_state=100)

rf_cv = GridSearchCV(rf, rf_params)
rf_cv.fit(X_train,y_train)

print(f'Best parameters: {rf_cv.best_params_}') 
rf_cv_preds = rf_cv.predict(X_valid) 
print(f'Root Mean Squared Error from Optimized Random Forest: {root_mse(y_valid, rf_cv_preds)}')

Best parameters: {'max_depth': 20, 'n_estimators': 600}
Root Mean Squared Error from optimized random forest: 27219.474693756893


In [19]:
# Gradient boosting regressor with cross-validation
gb_params = {'n_estimators': [100,200,400,800,1000], 'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3]}

gb = GradientBoostingRegressor(random_state=100)

gb_cv = GridSearchCV(gb, gb_params)
gb_cv.fit(X_train,y_train)

print(f'Best parameters: {gb_cv.best_params_}') 
gb_cv_preds = gb_cv.predict(X_valid) 
print(f'Root Mean Squared Error from Optimized GB: {root_mse(y_valid, gb_cv_preds)}')

Best parameters: {'learning_rate': 0.2, 'n_estimators': 100}
Score from optimized GB: 24718.41572118271


#### GBM with 0.2 learning rate and 100 estimators provides the lowest RMSE