# ENGSCI 762 - AS1 - Part I
## Noel D'Souza - ndso092 - 449609993

## 4. Prepare the data

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import sklearn
sklearn.__version__

%matplotlib inline
%load_ext autoreload
%autoreload 2

'0.20.3'

In [32]:
# Import the raw data as a pandas dataframe
train = pd.read_csv('../data/train.csv', index_col=0)
test = pd.read_csv('../data/test.csv', index_col=0)

# Separate the label (SalePrice column) from the training data
housing = train.drop('SalePrice', axis=1)
y_train = train['SalePrice'].copy()
housing_test = test

In [33]:
# Edit all occurrences of string "NA", convert them to "None" to avoid syntax confusion when applying transformations.
for item in ('Alley','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
             'FireplaceQu','GarageType','GarageFinish','GarageCond','PoolQC','Fence',
             'MiscFeature','Utilities','MasVnrType','GarageQual','MSZoning','Electrical','Exterior1st','PoolQC','Exterior2nd','KitchenQual','SaleType','Functional'):
    housing[item]=housing[item].fillna('None')
    housing_test[item]=housing_test[item].fillna('None')
    
# For any "numerical" variables (e.g. areas etc) if missing, set value to zero
for item in ('LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF',
             '1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFullBath','BsmtHalfBath',
             'FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces',
             'GarageYrBlt','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch',
             '3SsnPorch','ScreenPorch','PoolArea','MiscVal','LotFrontage'):
    housing[item]=housing[item].fillna(0)
    housing_test[item]=housing_test[item].fillna(0)
    
# For the following categorical variables, 
# for col in ('MSZoning','Electrical','Exterior1st','PoolQC','Exterior2nd','KitchenQual','SaleType','Functional'):
#     housing[col]=housing[col].fillna(housing[col].mode()[0])
#     housing_test[col]=housing_test[col].fillna(housing[col].mode()[0])

# Replace missing values with mean 
# housing['LotFrontage']=housing['LotFrontage'].fillna(housing['LotFrontage'].mean())
# housing_test['LotFrontage']=housing_test['LotFrontage'].fillna(housing['LotFrontage'].mean())

In [34]:
# Get numerical features
housing_num = housing.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64'])
housing_test_num = housing_test[housing_num.columns]
# housing_num.info()

# Get categorical features
housing_cat = housing.select_dtypes(include='object')
housing_test_cat = housing_test[housing_cat.columns]
# housing_cat.info()

### 4.1 Identify suitable transformations

#### 4.1.1 Transforming numerical features

In [35]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(housing_num)
X_test_scaled = scaler.transform(housing_test_num)

X_train_scaled.shape, X_test_scaled.shape

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


((1460, 36), (1459, 36))

#### 4.1.2 Decomposing categorical features

In [36]:
from sklearn.preprocessing import OneHotEncoder

# Encodes the categorical columns into binary (unique) features
encoder = OneHotEncoder(handle_unknown='ignore')

X_train_cat_encoded = encoder.fit_transform(housing_cat)
X_test_cat_encoded = encoder.transform(housing_test_cat)

# The method .toarray() converts the sparse array representation into a matrix
X_train_cat_encoded = X_train_cat_encoded.toarray()
X_test_cat_encoded = X_test_cat_encoded.toarray()

X_train_cat_encoded.shape, X_test_cat_encoded.shape

((1460, 267), (1459, 267))

#### 4.1.3 Merging numerical and categorical features manually

In [37]:
# Combine numerical and categorical feature matrices for train and test set
X_train = np.c_[X_train_scaled, X_train_cat_encoded]
X_test = np.c_[X_test_scaled, X_test_cat_encoded]

X_train.shape, X_test.shape

((1460, 303), (1459, 303))

### 4.2 Create new features

In [38]:
# Create new features (ratios and fractions) which are combinations of existing variables.
# Equivaelnt "full size" bathrooms
train_num_bathrooms = housing.FullBath + housing.BsmtFullBath + 0.5 * housing.HalfBath + 0.5 * housing.BsmtHalfBath
test_num_bathrooms = housing_test.FullBath + housing_test.BsmtFullBath + 0.5 * housing_test.HalfBath + 0.5 * housing_test.BsmtHalfBath

# Proportion of total rooms that are bedrooms (above grade and excluding bathrooms)
train_proportion_bedrooms = housing.BedroomAbvGr / housing.TotRmsAbvGrd
test_proportion_bedrooms = housing_test.BedroomAbvGr / housing_test.TotRmsAbvGrd

# Total indoor SF area
train_total_SF = housing.TotalBsmtSF + housing['1stFlrSF'] + housing['2ndFlrSF']
test_total_SF = housing_test.TotalBsmtSF + housing_test['1stFlrSF'] + housing_test['2ndFlrSF']

# Total above grade living SF area as a fraction of total lot area
train_livingSF_to_lot = housing.GrLivArea / housing.LotArea
test_livingSF_to_lot = housing_test.GrLivArea / housing_test.LotArea

# Proportion of total rooms that are kitchen(s)
train_proportion_kitchens = housing.KitchenAbvGr / housing.TotRmsAbvGrd
test_proportion_kitchens = housing_test.KitchenAbvGr / housing_test.TotRmsAbvGrd

# Total porch and/or deck SF area
train_total_porch_SF = housing.WoodDeckSF + housing.OpenPorchSF + housing.EnclosedPorch + housing['3SsnPorch'] + housing.ScreenPorch
test_total_porch_SF = housing_test.WoodDeckSF + housing_test.OpenPorchSF + housing_test.EnclosedPorch + housing_test['3SsnPorch'] + housing_test.ScreenPorch

# Create boolean features that denote whether a house exhibits the characteristic being considered.
# This is done for both the test and training set.

# Does the house have have a 2nd floor?
train_bool_2ndfloor = housing['2ndFlrSF'].apply(lambda x: 1 if x>0 else 0)
test_bool_2ndfloor = housing_test['2ndFlrSF'].apply(lambda x: 1 if x>0 else 0)

# Does the house have a pool?
train_bool_pool = housing.PoolArea.apply(lambda x: 1 if x>0 else 0)
test_bool_pool = housing_test.PoolArea.apply(lambda x: 1 if x>0 else 0)

# Does the house have a porch?
train_bool_porch = train_total_porch_SF.apply(lambda x: 1 if x>0 else 0)
test_bool_porch = test_total_porch_SF.apply(lambda x: 1 if x>0 else 0)

# Does the house have one or more fireplaces?
train_bool_fireplace = housing.Fireplaces.apply(lambda x: 1 if x>0 else 0)
test_bool_fireplace = housing_test.Fireplaces.apply(lambda x: 1 if x>0 else 0)

# Does the house have a basement
train_bool_basement = housing.TotalBsmtSF.apply(lambda x: 1 if x>0 else 0)
test_bool_basement = housing_test.TotalBsmtSF.apply(lambda x: 1 if x>0 else 0)

# Does the house have a garage?
train_bool_garage = housing.GarageArea.apply(lambda x: 1 if x>0 else 0)
test_bool_garage = housing_test.GarageArea.apply(lambda x: 1 if x>0 else 0)

In [39]:
# Add new feature columns to train and test feature matrices 
X_train_final = np.c_[X_train, 
                      train_num_bathrooms,
                      train_proportion_bedrooms,
                      train_total_SF,
                      train_livingSF_to_lot,
                      train_proportion_kitchens,
                      train_total_porch_SF,
                      train_bool_2ndfloor,
                      train_bool_pool,
                      train_bool_porch,
                      train_bool_fireplace,
                      train_bool_basement,
                      train_bool_garage]
X_test_final = np.c_[X_test, 
                     test_num_bathrooms, 
                     test_proportion_bedrooms,
                     test_total_SF,
                     test_livingSF_to_lot,
                     test_proportion_kitchens,
                     test_total_porch_SF,
                     test_bool_2ndfloor,
                     test_bool_pool,
                     test_bool_porch,
                     test_bool_fireplace,
                     test_bool_basement,
                     test_bool_garage]

X_train_final.shape, X_test_final.shape

((1460, 315), (1459, 315))

## 5. Evaluate three different regression algorithms

### 5.1 Linear Regression

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

param_grid = {'fit_intercept': [True, False],
             'normalize': [True, False]}

lin_reg = LinearRegression()
lin_search = GridSearchCV(lin_reg, param_grid, cv=10, 
                          scoring = 'neg_mean_squared_error')

In [41]:
lin_search.fit(X_train_final, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'fit_intercept': [True, False], 'normalize': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [42]:
np.sqrt(-lin_search.best_score_)

5662666801.594317

In [43]:
lin_search.best_params_

{'fit_intercept': True, 'normalize': False}

In [44]:
from sklearn.metrics import mean_squared_error

in_sample_mse = mean_squared_error(y_train, 
                           lin_search.predict(X_train_final))

In [45]:
lin_reg_rmse = pd.Series({'in sample': np.sqrt(in_sample_mse),
                         'out of sample': np.sqrt(-lin_search.best_score_)})
lin_reg_rmse

in sample        2.026626e+04
out of sample    5.662667e+09
dtype: float64

The out-of-sample performance is significantly worse compared to the in-sample performace, hence the linear regression model does not generalise very well. The model's performance itself is not acceptable either.

### 5.2 RandomForrest

In [46]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [2,4,6,8]},
              {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
             ]
forest_reg = RandomForestRegressor()
rfr_search = GridSearchCV(forest_reg, param_grid, cv=10, scoring='neg_mean_squared_error')

rfr_search.fit(X_train_final, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [47]:
rfr_search.best_params_ 

{'max_features': 8, 'n_estimators': 30}

In [48]:
rfr_in_sample_mse = mean_squared_error(y_train, 
                           rfr_search.predict(X_train_final))

In [49]:
# Out-of-sample
np.sqrt(-rfr_search.best_score_)

32595.02730897675

In [50]:
rfr_reg_rmse = pd.Series({'in sample': np.sqrt(rfr_in_sample_mse),
                         'out of sample': np.sqrt(-rfr_search.best_score_)})
rfr_reg_rmse 

in sample        12045.383538
out of sample    32595.027309
dtype: float64

The RandomForrest Regressor outperforms the linear regression model. While RF generalises better than the linear model, it still does not exhibit a good enough generalisation since the out-of-sample performance is much worse than the in-sample performance.

### 5.3 Support Vector Machines

#### 5.3.1 Support Vector Regression

In [51]:
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error

svr = LinearSVR(max_iter=100000)
svr.fit(X_train_final, y_train)

svr_in_sample_mse = mean_squared_error(y_train,
                                      svr.predict(X_train_final))

In [52]:
svr_reg_rmse = pd.Series({'in sample': np.sqrt(svr_in_sample_mse),
                         'out of sample': np.nan})
svr_reg_rmse 

in sample        47091.801893
out of sample             NaN
dtype: float64

In [53]:
rmse = pd.DataFrame({'LinReg': lin_reg_rmse,
                    'RFR': rfr_reg_rmse,
                    'SVR': svr_reg_rmse}
                   )
rmse

Unnamed: 0,LinReg,RFR,SVR
in sample,20266.26,12045.383538,47091.801893
out of sample,5662667000.0,32595.027309,


SVR's in-sample performance indicates sever overfitting.

#### 5.3.2 Grid Search

In [54]:
X_test_final = np.nan_to_num(X_test_final)

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# The following parameter grid has been adapted from 
# https://scikit-learn.org/stable/auto_examples/svm/plot_svm_regression.html
param_grid = [dict(kernel=['rbf'], gamma=[0.1], 
               epsilon=[0.1]),
              dict(kernel=['linear'], gamma=['auto']),
              dict(kernel=['poly'], gamma=['auto'], degree=[3], 
                   epsilon=[.1])
             ]

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_final_mm = scaler.fit_transform(X_train_final)
X_test_final_mm = scaler.transform(X_test_final)

sv_reg = SVR(max_iter=100000, C=100)

svr_search = GridSearchCV(sv_reg, param_grid, 
                          cv=10, scoring='neg_mean_squared_error')

svr_search.fit(X_train_final_mm, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=100000, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.1], 'epsilon': [0.1]}, {'kernel': ['linear'], 'gamma': ['auto']}, {'kernel': ['poly'], 'gamma': ['auto'], 'degree': [3], 'epsilon': [0.1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [55]:
svr_search.best_params_

{'gamma': 'auto', 'kernel': 'linear'}

In [56]:
# Out-of-sample
np.sqrt(-svr_search.best_score_)

48597.9999035758

In [57]:
svr_in_sample_mse = mean_squared_error(y_train, 
                           svr_search.predict(X_train_final))

In [58]:
svr_reg_rmse = pd.Series({'in sample': np.sqrt(svr_in_sample_mse),
                         'out of sample': np.sqrt(-svr_search.best_score_)})
svr_reg_rmse 

in sample        1.276237e+07
out of sample    4.859800e+04
dtype: float64

In [59]:
rmse = pd.DataFrame({'LinReg': lin_reg_rmse,
                    'RFR': rfr_reg_rmse,
                    'SVR': svr_reg_rmse}
                   )
rmse

Unnamed: 0,LinReg,RFR,SVR
in sample,20266.26,12045.383538,12762370.0
out of sample,5662667000.0,32595.027309,48598.0


It is clear that the RandomForrest Regressor has the best fitting of the three options. It outperforms both Linear Regression and the Support Vector Regressor and hence will be used for the purposes of prediction.

## 6. Submit predictions to Kaggle

In [60]:
y_test = pd.Series(rfr_search.predict(X_test_final),
                   index = housing_test.index,
                  name='SalePrice')
y_test.head()

Id
1461    124303.600000
1462    147443.333333
1463    176201.166667
1464    182556.666667
1465    189180.000000
Name: SalePrice, dtype: float64

In [61]:
y_test.to_csv('prediction.csv',
             header=True)

![image.png](attachment:image.png)