# ENGSCI 762 - AS1 - Part I
## Noel D'Souza - ndso092 - 449609993

## 4. Prepare the data

In [9]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Load in train and test data
df = pd.read_csv('../data/train.csv', index_col=0)
df1 = pd.read_csv('../data/test.csv', index_col=0)

In [11]:
# # Removing outliers found from looking at visualisations and contributions from Kaggle Discussions
# df.drop(df[(df['OverallQual']<5) & (df['SalePrice']>200000)].index, inplace=True)
# df.drop(df[(df['GrLivArea']>4000) & (df['SalePrice']<300000)].index, inplace=True)
# df.reset_index(drop=True, inplace=True)

In [12]:
# Format relevant dataframes
housing = df.drop('SalePrice', axis=1)
y_train = df['SalePrice'].copy()

housing_test = df1

In [13]:
# Edit missing values to mean that there is none for relevant categories
for col in ('Alley','Utilities','MasVnrType','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1',
            'BsmtFinType2','FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond',
           'Fence','MiscFeature'):
    housing[col]=housing[col].fillna('None')
    housing_test[col]=housing_test[col].fillna('None')

In [14]:
# Missing values not available, so replaced with mode
for col in ('MSZoning','Electrical','Exterior1st','PoolQC','Exterior2nd','KitchenQual','SaleType','Functional'):
    housing[col]=housing[col].fillna(housing[col].mode()[0])
    housing_test[col]=housing_test[col].fillna(housing[col].mode()[0])

In [15]:
# Replace missing values with 0 (indicating this is none)
for col in ('MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','GarageYrBlt','GarageCars','GarageArea'):
    housing[col]=housing[col].fillna(0)
    housing_test[col]=housing_test[col].fillna(0)

In [16]:
# Replace missing values with mean 
housing['LotFrontage']=housing['LotFrontage'].fillna(housing['LotFrontage'].mean())
housing_test['LotFrontage']=housing_test['LotFrontage'].fillna(housing['LotFrontage'].mean())

In [17]:
# Seperating data types that could be numeric
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

In [18]:
# Get numerical features from the relevant columns
housing_num = housing.select_dtypes(include=numeric_dtypes)
housing_test_num = housing_test[housing_num.columns]
# housing_num.info()

In [19]:
# Get categorical features from the relevant columns
housing_cat = housing.select_dtypes(include='object')
housing_test_cat = housing_test[housing_cat.columns]

In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(housing_num)
X_test_scaled = scaler.transform(housing_test_num)

X_train_scaled.shape, X_test_scaled.shape

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  """


((1460, 36), (1459, 36))

In [21]:
from sklearn.preprocessing import OneHotEncoder

# Encoding to derive the column categories based on the unique values
encoder = OneHotEncoder(handle_unknown='ignore')

X_train_cat_encoded = encoder.fit_transform(housing_cat)
X_test_cat_encoded = encoder.transform(housing_test_cat)

# The method .toarray() converts the sparse array representation into a matrix
X_train_cat_encoded = X_train_cat_encoded.toarray()
X_test_cat_encoded = X_test_cat_encoded.toarray()

X_train_cat_encoded.shape, X_test_cat_encoded.shape

((1460, 265), (1459, 265))

In [22]:
# Combine numerical and categorical feature matrices for train and test set
X_train = np.c_[X_train_scaled, X_train_cat_encoded]
X_test = np.c_[X_test_scaled, X_test_cat_encoded]

X_train.shape, X_test.shape

((1460, 301), (1459, 301))

In [23]:
# Adding for features for both Train and Test data

#ratio of the ground living area and the total lot area
train_livingarea_lot_ratio = housing.GrLivArea / housing.LotArea
test_livingarea_lot_ratio = housing_test.GrLivArea / housing_test.LotArea

#number of bedrooms per rooms (excluding bathrooms)
train_bedrooms_per_room = housing.BedroomAbvGr / housing.TotRmsAbvGrd
test_bedrooms_per_room = housing_test.BedroomAbvGr / housing_test.TotRmsAbvGrd

#total bathrooms
train_bathrooms = housing.FullBath + (0.5 * housing.HalfBath) + housing.BsmtFullBath + (0.5 * housing.BsmtHalfBath)
test_bathrooms = housing_test.FullBath + (0.5 * housing_test.HalfBath) + housing_test.BsmtFullBath + (0.5 * housing_test.BsmtHalfBath)

#total SF indoors area
train_totalSF=housing.TotalBsmtSF + housing['1stFlrSF'] + housing['2ndFlrSF']
test_totalSF=housing_test.TotalBsmtSF + housing_test['1stFlrSF'] + housing_test['2ndFlrSF']

#total porch area
train_total_porch_sf = (housing.OpenPorchSF + housing['3SsnPorch'] + housing.EnclosedPorch + housing.ScreenPorch + housing.WoodDeckSF)
test_total_porch_sf = (housing_test.OpenPorchSF + housing_test['3SsnPorch'] + housing_test.EnclosedPorch + housing_test.ScreenPorch + housing_test.WoodDeckSF)
# indicating if a house has certain aspects (Train data)
# e.g. if PoolArea = 0 , Then HasPool = 0 too

train_haspool = housing['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
train_has2ndfloor = housing['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
train_hasgarage = housing['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
train_hasbsmt = housing['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
train_hasfireplace = housing['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
# indicating if a house has certain aspects (Test data)
# e.g. if PoolArea = 0 , Then HasPool = 0 too

test_haspool = housing_test['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
test_has2ndfloor = housing_test['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
test_hasgarage = housing_test['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
test_hasbsmt = housing_test['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
test_hasfireplace = housing_test['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [24]:
# Add all new feature columns to train and test feature matrices 
X_train_final = np.c_[X_train, 
                    train_livingarea_lot_ratio, 
                    train_bedrooms_per_room,
                    train_bathrooms,
                    train_totalSF,
                    train_total_porch_sf,
                    train_haspool,
                    train_has2ndfloor,
                    train_hasgarage,
                    train_hasbsmt,
                    train_hasfireplace]
X_test_final = np.c_[X_test, 
                    test_livingarea_lot_ratio, 
                    test_bedrooms_per_room,
                    test_bathrooms,
                    test_totalSF,
                    test_total_porch_sf,
                    test_haspool,
                    test_has2ndfloor,
                    test_hasgarage,
                    test_hasbsmt,
                    test_hasfireplace]

X_train_final.shape, X_test_final.shape

((1460, 311), (1459, 311))

# LR

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

# Infer param_grid from 
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
# However, for simple linear regression this is kind of over-the-top.
param_grid = {'fit_intercept': [True, False],
             'normalize': [True, False]}

lin_reg = LinearRegression()
lin_search = GridSearchCV(lin_reg, param_grid, cv=10, 
                          scoring = 'neg_mean_squared_error')

In [13]:
lin_search.fit(X_train_final, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'fit_intercept': [True, False], 'normalize': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [14]:
np.sqrt(-lin_search.best_score_)

25072860313.592373

In [15]:
lin_search.best_params_

{'fit_intercept': True, 'normalize': False}

In [16]:
from sklearn.metrics import mean_squared_error

in_sample_mse = mean_squared_error(y_train, 
                           lin_search.predict(X_train_final))

In [17]:
lin_reg_rmse = pd.Series({'in sample': np.sqrt(in_sample_mse),
                         'out of sample': np.sqrt(-lin_search.best_score_)})
lin_reg_rmse # The linear regression model is severly overfitted.

in sample        1.889630e+04
out of sample    2.507286e+10
dtype: float64

# RF

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

param_grid = [{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
              {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
             ]
forest_reg = RandomForestRegressor()
rfr_search = GridSearchCV(forest_reg, param_grid, cv=10, scoring='neg_mean_squared_error')

rfr_search.fit(X_train_final, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [19]:
# At least one of the best performing parameters is at the boundary of the param_grid,
# therefore you should increase the respective search region
rfr_search.best_params_ 

{'max_features': 8, 'n_estimators': 30}

In [20]:
rfr_in_sample_mse = mean_squared_error(y_train, 
                           rfr_search.predict(X_train_final))


In [28]:
# Out-of-sample
np.sqrt(-rfr_search.best_score_)

NameError: name 'rfr_search' is not defined

In [22]:
rfr_reg_rmse = pd.Series({'in sample': np.sqrt(rfr_in_sample_mse),
                         'out of sample': np.sqrt(-rfr_search.best_score_)})
rfr_reg_rmse 

in sample        11758.493637
out of sample    30272.930026
dtype: float64

# SVR

In [25]:
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error

svr = LinearSVR(max_iter=100000)
svr.fit(X_train_final, y_train)

svr_in_sample_mse = mean_squared_error(y_train,
                                      svr.predict(X_train_final))

In [26]:
svr_reg_rmse = pd.Series({'in sample': np.sqrt(svr_in_sample_mse),
                         'out of sample': np.nan})
svr_reg_rmse 

in sample        47045.653397
out of sample             NaN
dtype: float64

In [25]:
rmse = pd.DataFrame({'LinReg': lin_reg_rmse,
                    'RFR': rfr_reg_rmse,
                    'SVR': svr_reg_rmse}
                   )
rmse

Unnamed: 0,LinReg,RFR,SVR
in sample,18896.3,11758.493637,42595.262621
out of sample,25072860000.0,30272.930026,


## Grid

In [27]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# The following parameter grid has been adapted from 
# https://scikit-learn.org/stable/auto_examples/svm/plot_svm_regression.html
param_grid = [dict(kernel=['rbf'], gamma=[0.1], 
               epsilon=[0.1]),
              dict(kernel=['linear'], gamma=['auto']),
              dict(kernel=['poly'], gamma=['auto'], degree=[3], 
                   epsilon=[.1])
             ]

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_final_mm = scaler.fit_transform(X_train_final)
X_test_final_mm = scaler.transform(X_test_final)

sv_reg = SVR(max_iter=100000, C=100)

svr_search = GridSearchCV(sv_reg, param_grid, 
                          cv=10, scoring='neg_mean_squared_error')

svr_search.fit(X_train_final_mm, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=100000, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.1], 'epsilon': [0.1]}, {'kernel': ['linear'], 'gamma': ['auto']}, {'kernel': ['poly'], 'gamma': ['auto'], 'degree': [3], 'epsilon': [0.1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [21]:
svr_search.best_params_

{'gamma': 'auto', 'kernel': 'linear'}

In [22]:
np.sqrt(-svr_search.best_score_)

48670.01203967182

In [23]:
svr_in_sample_mse = mean_squared_error(y_train, 
                           svr_search.predict(X_train_final))

In [24]:
svr_reg_rmse = pd.Series({'in sample': np.sqrt(svr_in_sample_mse),
                         'out of sample': np.sqrt(-svr_search.best_score_)})
svr_reg_rmse 

in sample        1.281932e+07
out of sample    4.867001e+04
dtype: float64

In [25]:
rmse = pd.DataFrame({'LinReg': lin_reg_rmse,
                    'RFR': rfr_reg_rmse,
                    'SVR': svr_reg_rmse}
                   )
rmse

NameError: name 'lin_reg_rmse' is not defined