# House Prices - Advanced Regression Techniques

## Imports and Data Loading

In [1]:
import kaggle
import matplotlib.pyplot as plt
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

In [2]:
api.competition_download_files('house-prices-advanced-regression-techniques')

In [3]:
import zipfile
with zipfile.ZipFile('house-prices-advanced-regression-techniques.zip', 'r') as zip_ref:
    zip_ref.extractall('house-prices-advanced-regression-techniques')

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [5]:
import pandas as pd
train = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('house-prices-advanced-regression-techniques/test.csv')

## Data Preprocessing

In [6]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [8]:
train.isna().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

### Selecting Features

In [9]:
numeric_features = train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = train.select_dtypes(include=['object']).columns
#Dropping the target variable from the list of numeric features
numeric_features = numeric_features.drop('SalePrice')

In [10]:
categorical_features

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [11]:
#Deleting id element from the list of numeric features
numeric_features = numeric_features.drop('Id')
numeric_features

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

### Binary Encoding and Scaling

In [12]:
ohe = OneHotEncoder(sparse_output=False)
ohe.fit(train[categorical_features])
temp = ohe.transform(train[categorical_features]).astype(int)
feature_names = ohe.get_feature_names_out(categorical_features)

train = pd.concat([train, pd.DataFrame(temp, columns=feature_names)], axis=1)
train.drop(categorical_features, axis=1, inplace=True)
train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [13]:
#Doing the same for the test set
ohe.fit(test[categorical_features])
temp = ohe.transform(test[categorical_features]).astype(int)
feature_names = ohe.get_feature_names_out(categorical_features)
test = pd.concat([test, pd.DataFrame(temp, columns=feature_names)], axis=1)
test.drop(categorical_features, axis=1, inplace=True)
test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,1,0,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,1,0,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,1,0,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,1,0,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,1,0,0,0,0,0,1,0


In [14]:
#Checking the missing columns in the test set
missing_cols = set(train.columns) - set(test.columns)
missing_cols

{'Condition2_RRAe',
 'Condition2_RRAn',
 'Condition2_RRNn',
 'Electrical_Mix',
 'Electrical_nan',
 'Exterior1st_ImStucc',
 'Exterior1st_Stone',
 'Exterior2nd_Other',
 'GarageQual_Ex',
 'Heating_Floor',
 'Heating_OthW',
 'HouseStyle_2.5Fin',
 'MiscFeature_TenC',
 'PoolQC_Fa',
 'RoofMatl_ClyTile',
 'RoofMatl_Membran',
 'RoofMatl_Metal',
 'RoofMatl_Roll',
 'SalePrice',
 'Utilities_NoSeWa'}

In [15]:
#Adding the missing columns to the test set
for col in missing_cols:
    test[col] = 0

test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,Condition2_RRAn,Electrical_nan,Exterior1st_ImStucc,RoofMatl_Roll,RoofMatl_ClyTile,RoofMatl_Membran,Exterior1st_Stone,MiscFeature_TenC,Heating_OthW,SalePrice
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,0,0,0,0,0,0,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,0,0,0,0,0,0,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,0,0,0,0,0,0,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,0,0,0,0,0,0,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
missing_cols = set(test.columns) - set(train.columns)
missing_cols

{'Exterior1st_nan',
 'Exterior2nd_nan',
 'Functional_nan',
 'KitchenQual_nan',
 'MSZoning_nan',
 'SaleType_nan',
 'Utilities_nan'}

In [17]:
test.drop(missing_cols, axis=1, inplace=True)

In [18]:
scaler = MinMaxScaler(feature_range=(0, 10))
scaler.fit(train[numeric_features])
temp = scaler.fit_transform(train[numeric_features])

train[numeric_features] = temp
train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,2.352941,1.506849,0.334198,6.666667,5.0,9.492754,8.833333,1.225,1.250886,...,0,0,0,1,0,0,0,0,1,0
1,2,0.0,2.020548,0.38795,5.555556,8.75,7.536232,4.333333,0.0,1.732814,...,0,0,0,1,0,0,0,0,1,0
2,3,2.352941,1.609589,0.465073,6.666667,5.0,9.347826,8.666667,1.0125,0.861091,...,0,0,0,1,0,0,0,0,1,0
3,4,2.941176,1.335616,0.385613,6.666667,5.0,3.115942,3.333333,0.0,0.382707,...,0,0,0,1,1,0,0,0,0,0
4,5,2.352941,2.157534,0.605763,7.777778,5.0,9.275362,8.333333,2.1875,1.160524,...,0,0,0,1,0,0,0,0,1,0


In [19]:
#same for the test set
scaler.fit(test[numeric_features])
temp = scaler.fit_transform(test[numeric_features])

test[numeric_features] = temp
test.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,Condition2_RRAn,Electrical_nan,Exterior1st_ImStucc,RoofMatl_Roll,RoofMatl_ClyTile,RoofMatl_Membran,Exterior1st_Stone,MiscFeature_TenC,Heating_OthW,SalePrice
0,1461,0.0,3.296089,1.841466,4.444444,6.25,6.259542,1.833333,0.0,1.167082,...,0,0,0,0,0,0,0,0,0,0
1,1462,0.0,3.351955,2.321241,5.555556,6.25,6.030534,1.333333,0.837209,2.301746,...,0,0,0,0,0,0,0,0,0,0
2,1463,2.352941,2.960894,2.241974,4.444444,5.0,9.007634,8.0,0.0,1.972569,...,0,0,0,0,0,0,0,0,0,0
3,1464,2.352941,3.184358,1.543261,5.555556,6.25,9.083969,8.0,0.155039,1.501247,...,0,0,0,0,0,0,0,0,0,0
4,1465,5.882353,1.22905,0.641212,7.777778,5.0,8.625954,7.0,0.0,0.65586,...,0,0,0,0,0,0,0,0,0,0


### Handling Missing Values

In [20]:
#LotFrontage with 0
train['LotFrontage'] = train['LotFrontage'].fillna(0)
test['LotFrontage'] = test['LotFrontage'].fillna(0)

In [21]:
#GarageYrBlt with the median
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(train['GarageYrBlt'].median())
test['GarageYrBlt'] = test['GarageYrBlt'].fillna(train['GarageYrBlt'].median())

In [22]:
#MasVnrArea with 0
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
test['MasVnrArea'] = test['MasVnrArea'].fillna(0)

In [23]:
#Removing and adding the SalePrice column to the end of the dataframe (to keep the order of the columns)
train = train[[col for col in train.columns if col != 'SalePrice'] + ['SalePrice']]
train.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,1,2.352941,1.506849,0.334198,6.666667,5.0,9.492754,8.833333,1.225,1.250886,...,0,0,1,0,0,0,0,1,0,208500
1,2,0.0,2.020548,0.38795,5.555556,8.75,7.536232,4.333333,0.0,1.732814,...,0,0,1,0,0,0,0,1,0,181500
2,3,2.352941,1.609589,0.465073,6.666667,5.0,9.347826,8.666667,1.0125,0.861091,...,0,0,1,0,0,0,0,1,0,223500
3,4,2.941176,1.335616,0.385613,6.666667,5.0,3.115942,3.333333,0.0,0.382707,...,0,0,1,1,0,0,0,0,0,140000
4,5,2.352941,2.157534,0.605763,7.777778,5.0,9.275362,8.333333,2.1875,1.160524,...,0,0,1,0,0,0,0,1,0,250000


In [24]:
#Removing the SalePrice column from the test set
test.drop('SalePrice', axis=1, inplace=True)

## Model Building

In [25]:
#Splitting the data into train and test sets
X = train.drop(['SalePrice', 'Id'], axis=1)
y = train['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [26]:
import numpy as np
from sklearn.metrics import mean_squared_error
#Random forest regressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
#Predict
y_pred = rf.predict(X_test)
#Mean squared error (root)
print('Root Mean Squared Error:',np.sqrt(mean_squared_error(y_test, y_pred)))
print('Score:',rf.score(X_test, y_test))

Root Mean Squared Error: 34918.263082410085
Score: 0.8234417047795894


In [27]:
#Decision tree regressor
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print('Root Mean Squared Error:',np.sqrt(mean_squared_error(y_test, y_pred)))
print('Score:',dt.score(X_test, y_test))

Root Mean Squared Error: 43478.96639157205
Score: 0.7262580959138724


In [28]:
#Ridge regressor
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print('Root Mean Squared Error:',np.sqrt(mean_squared_error(y_test, y_pred)))
print('Score:',ridge.score(X_test, y_test))

Root Mean Squared Error: 49662.94142891686
Score: 0.6428523984407326


In [29]:
#Lasso regressor
from sklearn.linear_model import Lasso
lasso = Lasso(max_iter=10000)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print('Root Mean Squared Error:',np.sqrt(mean_squared_error(y_test, y_pred)))
print('Score:',lasso.score(X_test, y_test))

Root Mean Squared Error: 56828.99415654628
Score: 0.5323479957313617


In [30]:
#Creating a dataframe for the scores of the models. (ordered the scores in descending order)
scores = pd.DataFrame({'Model':['Random Forest', 'Decision Tree', 'Ridge', 'Lasso'],
                       'Score':[rf.score(X_test, y_test), dt.score(X_test, y_test),
                                ridge.score(X_test, y_test), lasso.score(X_test, y_test)]})
scores.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
0,Random Forest,0.823442
1,Decision Tree,0.726258
2,Ridge,0.642852
3,Lasso,0.532348


In [31]:
#5-fold cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X_train, y_train, cv=5)
print('Scores:',scores)
print('Mean Score:',scores.mean())

Scores: [0.89621584 0.77123759 0.89390455 0.83887065 0.88606573]
Mean Score: 0.8572588706297684


### Quick Hyperparameter Tuning

In [32]:
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}

In [33]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
grid_search = GridSearchCV(RandomForestRegressor(),
                           param_grid=param_grid,
                           refit=True,
                           error_score=0,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
print('Best Parameters:',grid_search.best_params_, 'Best Score:',grid_search.best_score_)

Best Parameters: {'max_depth': 9, 'max_features': None, 'max_leaf_nodes': 9, 'n_estimators': 150} Best Score: 0.7732417345534854


In [34]:
grid_search.best_estimator_.fit(X_train, y_train)
y_pred = grid_search.best_estimator_.predict(X_test)
print('Root Mean Squared Error:',np.sqrt(mean_squared_error(y_test, y_pred)))
print('Score:',grid_search.best_estimator_.score(X_test, y_test))

Root Mean Squared Error: 40609.1225873857
Score: 0.7612023362926817


In [35]:
#5-fold cross validation
scores = cross_val_score(grid_search.best_estimator_, X_train, y_train, cv=5)
print('Scores:',scores)
print('Mean Score:',scores.mean())

Scores: [0.80612016 0.69437242 0.80634141 0.73453317 0.78196108]
Mean Score: 0.7646656492272588


## Predictions

In [36]:
#Last check for the columns
X_train.columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       ...
       'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=303)

In [37]:
temp_id = test['Id']

In [38]:
test = test[X_train.columns]

In [39]:
#Predicting the test set and generating the submission file using forest regressor
y_pred = rf.predict(test)
y_pred

array([133827.33, 158772.  , 185152.79, ..., 154208.72, 125144.  ,
       281691.3 ])

In [40]:
submission = pd.DataFrame({'Id':temp_id, 'SalePrice':y_pred})
submission.to_csv('submission.csv', index=False)