In [None]:
#Capstone Project - TWO
#House Price Prediction

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

from library.sb_utils import save_file

In [2]:
# Loading data to use in this excercies.
housing_data = pd.read_csv('/data/new_housing_data_cleaned.csv')

In [12]:
#Calling the info method on housing_data to see a summary of the data
housing_data.info

<bound method DataFrame.info of       MSSubClass MSZoning  LotArea LotConfig BldgType  OverallCond  YearBuilt  \
0             60       RL     8450    Inside     1Fam            5       2003   
1             20       RL     9600       FR2     1Fam            8       1976   
2             60       RL    11250    Inside     1Fam            5       2001   
3             70       RL     9550    Corner     1Fam            5       1915   
4             60       RL    14260       FR2     1Fam            5       2000   
...          ...      ...      ...       ...      ...          ...        ...   
2908         160       RM     1936    Inside    Twnhs            7       1970   
2909         160       RM     1894    Inside   TwnhsE            5       1970   
2910          20       RL    20000    Inside     1Fam            7       1960   
2911          85       RL    10441    Inside     1Fam            5       1992   
2912          60       RL     9627    Inside     1Fam            5       1993

In [4]:
from sklearn.preprocessing import OneHotEncoder

s = (housing_data.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables:")
print(object_cols)
print('No. of categorical features:' , len(object_cols))

Categorical variables:
['MSZoning', 'LotConfig', 'BldgType', 'Exterior1st']
No. of categorical features: 4


In [5]:
#Applying OneHotEncoding
OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
OH_cols = pd.DataFrame(OH_encoder.fit_transform(housing_data[object_cols]))
OH_cols.index = housing_data.index
OH_columns = OH_encoder.get_feature_names_out()
df_final = housing_data.drop(object_cols, axis=1)
df_final = pd.concat([df_final, OH_cols], axis=1)

In [6]:
#Splitting data set into training and testing
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

X = df_final.drop(['SalePrice'], axis=1)
y = df_final['SalePrice']

X_train, X_valid, y_train, y_valid, = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)


In [7]:
X_train.shape

(2330, 37)

In [16]:
y_train.shape

(2330,)

In [8]:
#Check the `dtypes` attribute of `X_train` to verify all features are numeric
X_train.dtypes

MSSubClass        int64
LotArea           int64
OverallCond       int64
YearBuilt         int64
YearRemodAdd      int64
BsmtFinSF2      float64
TotalBsmtSF     float64
0               float64
1               float64
2               float64
3               float64
4               float64
5               float64
6               float64
7               float64
8               float64
9               float64
10              float64
11              float64
12              float64
13              float64
14              float64
15              float64
16              float64
17              float64
18              float64
19              float64
20              float64
21              float64
22              float64
23              float64
24              float64
25              float64
26              float64
27              float64
28              float64
29              float64
dtype: object

In [19]:
#Repeat this check for the test split in `X_test`
X_train.dtypes

MSSubClass        int64
LotArea           int64
OverallCond       int64
YearBuilt         int64
YearRemodAdd      int64
BsmtFinSF2      float64
TotalBsmtSF     float64
0               float64
1               float64
2               float64
3               float64
4               float64
5               float64
6               float64
7               float64
8               float64
9               float64
10              float64
11              float64
12              float64
13              float64
14              float64
15              float64
16              float64
17              float64
18              float64
19              float64
20              float64
21              float64
22              float64
23              float64
24              float64
25              float64
26              float64
27              float64
28              float64
29              float64
dtype: object

In [9]:
#Model Training and Accuracy
#SVM - Support vector Machine
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import mean_absolute_percentage_error

#Converting feature column to string
X_train.columns = X_train.columns.astype(str)
X_valid.columns = X_valid.columns.astype(str)


model_SVR = svm.SVR()
model_SVR.fit(X_train, y_train)
y_pred = model_SVR.predict(X_valid)

print(mean_absolute_percentage_error(y_valid, y_pred))

0.1870512931818556


In [10]:
X_valid.dtypes

MSSubClass        int64
LotArea           int64
OverallCond       int64
YearBuilt         int64
YearRemodAdd      int64
BsmtFinSF2      float64
TotalBsmtSF     float64
0               float64
1               float64
2               float64
3               float64
4               float64
5               float64
6               float64
7               float64
8               float64
9               float64
10              float64
11              float64
12              float64
13              float64
14              float64
15              float64
16              float64
17              float64
18              float64
19              float64
20              float64
21              float64
22              float64
23              float64
24              float64
25              float64
26              float64
27              float64
28              float64
29              float64
dtype: object

In [11]:
#Random Forest Regression
from sklearn.ensemble import RandomForestRegressor

model_RFR = RandomForestRegressor(n_estimators=10)
model_RFR.fit(X_train, y_train)
y_pred = model_RFR.predict(X_valid)

mean_absolute_percentage_error(y_valid, y_pred)

0.18348950037777664

In [11]:
#Linear Regression
from sklearn.linear_model import LinearRegression

model_LR = LinearRegression()
model_LR.fit(X_train, y_train)
y_pred = model_LR.predict(X_valid)

print(mean_absolute_percentage_error(y_valid, y_pred))

0.18741683841600093


In [None]:
Random Forest Regression model is giving the better accuracy as mean absolute error is the least among all other models mentioned above.