# Exercise: House Price Prediction

Goal: Build a multiple linear regression model to predict house prices using multiple features including categorical data.

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the house data into a DataFrame; the relative path means the CSV is
# looked up in the current working directory.
dataset = pd.read_csv('houses_dataset.csv')

In [3]:
# Print the loaded DataFrame to inspect its columns and rows.
print(dataset)

    House_ID  Size_sqft  Bedrooms  Age_years Neighborhood   Price
0          1       1500         3          5            A  300000
1          2       1800         3         10            B  320000
2          3       2400         4         15            A  380000
3          4       3000         4         20            C  420000
4          5       1200         2          2            B  280000
5          6       2000         3          8            C  350000
6          7       1700         3         12            A  310000
7          8       2200         4         18            B  370000
8          9       1900         3          7            C  340000
9         10       2600         4         25            A  400000
10        11       1400         2          3            B  290000
11        12       2800         4         22            C  410000


## Creating the matrix of features

In [4]:
# Feature matrix: every column except the first (House_ID) and the last (Price),
# converted to a NumPy array.
X = dataset.iloc[:, 1:-1].to_numpy()

In [5]:
# Print the feature matrix to check which columns were selected.
print(X)

[[1500 3 5 'A']
 [1800 3 10 'B']
 [2400 4 15 'A']
 [3000 4 20 'C']
 [1200 2 2 'B']
 [2000 3 8 'C']
 [1700 3 12 'A']
 [2200 4 18 'B']
 [1900 3 7 'C']
 [2600 4 25 'A']
 [1400 2 3 'B']
 [2800 4 22 'C']]


## Creating the dependent variable vector

In [6]:
# Target vector: the last column (Price), converted to a NumPy array.
y = dataset.iloc[:, -1].to_numpy()

In [7]:
# Print the target vector.
print(y)

[300000 320000 380000 420000 280000 350000 310000 370000 340000 400000
 290000 410000]


## Encoding categorical data

### Encoding the independent variable

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode column index 3 (the Neighborhood letters) and pass the other
# columns through unchanged; the encoded columns come first in the result.
# Note: this cell overwrites X with its encoded form, so it should be run once
# per fresh X -- re-running it would encode the already-transformed matrix.
ct = ColumnTransformer(
    remainder='passthrough',
    transformers=[('encoder', OneHotEncoder(), [3])],
)
X = np.array(ct.fit_transform(X))

In [9]:
# Print X after encoding to check the new column layout.
print(X)

[[1.0 0.0 0.0 1500 3 5]
 [0.0 1.0 0.0 1800 3 10]
 [1.0 0.0 0.0 2400 4 15]
 [0.0 0.0 1.0 3000 4 20]
 [0.0 1.0 0.0 1200 2 2]
 [0.0 0.0 1.0 2000 3 8]
 [1.0 0.0 0.0 1700 3 12]
 [0.0 1.0 0.0 2200 4 18]
 [0.0 0.0 1.0 1900 3 7]
 [1.0 0.0 0.0 2600 4 25]
 [0.0 1.0 0.0 1400 2 3]
 [0.0 0.0 1.0 2800 4 22]]


## Splitting the dataset into training set and test set

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as the test set; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [11]:
# Print the training-set features.
print(X_train)

[[0.0 0.0 1.0 1900 3 7]
 [0.0 0.0 1.0 2000 3 8]
 [1.0 0.0 0.0 2400 4 15]
 [0.0 1.0 0.0 1800 3 10]
 [0.0 0.0 1.0 2800 4 22]
 [0.0 1.0 0.0 1200 2 2]
 [0.0 1.0 0.0 2200 4 18]
 [0.0 0.0 1.0 3000 4 20]
 [1.0 0.0 0.0 1700 3 12]]


In [12]:
# Print the test-set features.
print(X_test)

[[0.0 1.0 0.0 1400 2 3]
 [1.0 0.0 0.0 2600 4 25]
 [1.0 0.0 0.0 1500 3 5]]


In [13]:
# Print the training-set targets.
print(y_train)

[340000 350000 380000 320000 410000 280000 370000 420000 310000]


In [14]:
# Print the test-set targets.
print(y_test)

[290000 400000 300000]


## Building the model

In [15]:
from sklearn.linear_model import LinearRegression
# Create the linear regression estimator with its default settings.
mlr = LinearRegression()

## Training the model

In [16]:
# Fit the regression on the training features and targets.
mlr.fit(X_train, y_train)

## Predicting the test set results

In [17]:
# Predict prices for the held-out test features.
y_pred = mlr.predict(X_test)

In [18]:
# Print the raw predictions for the test set.
print(y_pred)

[285871.17825854 378951.36778115 307745.84301806]


## Displaying the predicted and actual test set results

In [19]:
# Show two decimals when printing arrays from here on.
np.set_printoptions(precision=2)
# Side-by-side table: left column = predicted price, right column = actual price.
print(np.column_stack((y_pred, y_test)))

[[285871.18 290000.  ]
 [378951.37 400000.  ]
 [307745.84 300000.  ]]


## Making a single prediction

In [20]:
# Make a single prediction for a house with Size: 2000 sqft, Bedrooms: 3, Age: 8 years, Neighborhood: 'B'.
# The raw features are given in the original column order (Size_sqft, Bedrooms,
# Age_years, Neighborhood) and run through the fitted ColumnTransformer, so the
# one-hot encoding and column order always match what the model was trained on
# instead of being hand-coded as [0, 1, 0, ...].
# dtype=object keeps the numbers numeric next to the string label; without it
# NumPy would turn the whole row into strings.
new_house = np.array([[2000, 3, 8, 'B']], dtype=object)
print(mlr.predict(ct.transform(new_house)))

[342250.13]


## Calculating the values of the coefficients

### Calculating b<sub>0</sub> (y-intercept)

In [21]:
# Print the fitted intercept term (b0).
print(mlr.intercept_)

152666.2792776687


### Calculating b<sub>i</sub> (slope coefficients)

In [22]:
# Print the fitted slope coefficients, one per column of the encoded feature matrix.
print(mlr.coef_)

[-6262.29   204.27  6058.02    62.68 24274.99 -1100.93]


### Calculate the prediction manually using the formula: y = b<sub>0</sub> + b<sub>1</sub>*x<sub>1</sub> + b<sub>2</sub>*x<sub>2</sub> + ... and verify that it matches the model's prediction.

In [23]:
# Manual prediction for a house with Size 2000 sqft, 3 bedrooms, age 8 years,
# Neighborhood B. Inputs follow the encoded column order:
# [Neighborhood A, Neighborhood B, Neighborhood C, Size_sqft, Bedrooms, Age_years].
b0 = mlr.intercept_
b = mlr.coef_
manual_prediction = b0 + 0*b[0] + 1*b[1] + 0*b[2] + 2000*b[3] + 3*b[4] + 8*b[5]
print(manual_prediction)

# Actually perform the verification: the hand-computed value must agree with
# the model's own prediction for the same encoded inputs (up to float rounding).
model_prediction = mlr.predict([[0, 1, 0, 2000, 3, 8]])[0]
assert np.isclose(manual_prediction, model_prediction), (manual_prediction, model_prediction)

342250.1340961916
