# Multiple Linear Regression
## Housing Case Study

#### Problem Statement:

Consider a real estate company that has a dataset containing the prices of properties in the Delhi region. It wishes to use the data to optimise the sale prices of the properties based on important factors such as area, bedrooms, parking, etc.

Essentially, the company wants —


- To identify the variables affecting house prices, e.g. area, number of rooms, bathrooms, etc.

- To create a linear model that quantitatively relates house prices with variables such as number of rooms, area, number of bathrooms, etc.

- To know the accuracy of the model, i.e. how well these variables can predict house prices.

**So interpretation is important!**

In [1]:
# Import the numpy and pandas package

import numpy as np
import pandas as pd
import openpyxl as exl

In [2]:
house = pd.read_excel("Housing.xlsx")
house.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
5,10850000,7500,3,3,1,yes,no,yes,no,yes,2,yes,semi-furnished
6,10150000,8580,4,3,4,yes,no,no,no,yes,2,yes,semi-furnished
7,10150000,16200,5,3,2,yes,no,no,no,no,0,no,unfurnished
8,9870000,8100,4,1,2,yes,yes,yes,no,yes,2,yes,furnished
9,9800000,5750,3,2,4,yes,yes,no,no,yes,1,yes,unfurnished


In [3]:
house.shape

(545, 13)

In [4]:
house.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [5]:
#info and describe work the same

house.describe()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
count,545.0,545.0,545.0,545.0,545.0,545.0
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,0.693578
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,0.861586
min,1750000.0,1650.0,1.0,1.0,1.0,0.0
25%,3430000.0,3600.0,2.0,1.0,1.0,0.0
50%,4340000.0,4600.0,3.0,1.0,2.0,0.0
75%,5740000.0,6360.0,3.0,2.0,2.0,1.0
max,13300000.0,16200.0,6.0,4.0,4.0,3.0


In [6]:
house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


## Step 2: Visualising the Data

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
house.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [16]:
## Converting all the yes to 1 and no too 0

house= house.replace({'yes': 1, 'no': 0})
house.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,1.0,0.396564,4,2,3,1,0,0,0,1,2,1,furnished,False,False
1,0.909091,0.502405,4,4,4,1,0,0,0,1,3,0,furnished,False,False
2,0.909091,0.571134,3,2,2,1,0,1,0,0,2,1,semi-furnished,True,False
3,0.906061,0.402062,4,2,2,1,0,1,0,1,3,1,furnished,False,False
4,0.836364,0.396564,4,1,2,1,1,1,0,1,2,0,furnished,False,False


In [10]:
## Dummy Values

status = pd.get_dummies(house['furnishingstatus'])


In [17]:
status.head()

Unnamed: 0,semi-furnished,unfurnished
0,False,False
1,False,False
2,True,False
3,False,False
4,False,False


In [12]:
#Drop 
status = pd.get_dummies(house['furnishingstatus'], drop_first = True)

# Add the results to the original housing dataframe
house = pd.concat([house, status], axis = 1)

house.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,False,False
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,False,False
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,True,False
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,False,False
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,False,False


In [13]:
## Converting the values of price from taht big to the range of 1 - 0 

min_price = house['price'].min()
max_price = house['price'].max()

house['price'] = (house['price'] - min_price) / (max_price - min_price)


In [14]:
##Converting area in the range 1 - 0

min_area = house['area'].min()
max_area = house['area'].max()

house['area'] = (house['area'] - min_area) / (max_area - min_area)


In [15]:
##Corelatipon matrix for all values

plt.figure(figsize = (10, 6))
sns.heatmap(house.corr(), annot = True, cmap="YlGnBu")
plt.show()

correlation_matrix = house.corr()

# Display the correlation matrix
print("\nCorrelation Matrix:")
print(correlation_matrix)

ValueError: could not convert string to float: 'furnished'

<Figure size 1000x600 with 0 Axes>

In [None]:
# Splitting data in train and validation dataset
y= house.pop('price')
x= house

In [None]:
y.head()

In [None]:
x.head()

In [None]:
#splitting data

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y, train_size = 0.8, test_size = 0.2, random_state = 50)

In [None]:
#Building model

import statsmodels.api as sm

In [None]:
#Adding intercept

x_train_inter=sm.add_constant(x_train[['area']])

In [None]:
#Adding regression line

lr=sm.OLS(y_train,x_train_inter).fit()

print(lr.summary())

In [None]:
#Soo we get our exuation as 

# price = 0.1294+0.5455* area 
#  y    =    c  +  m1  * x1

In [None]:
#Plotting graph for linear regression

plt.figure(figsize=(10, 6)) 
plt.scatter(x_train_inter.iloc[:, 1], y_train)
plt.plot(x_train_inter.iloc[:, 1], 0.1294 + 0.5455*x_train_inter.iloc[:, 1], 'k')
plt.show()

In [None]:
#Plotting graph regression , this is by default with calculating the coefficients

plt.figure(figsize=(10, 6)) 
sns.regplot(x=x_train_inter.iloc[:, 1], y=y_train, scatter=True)
plt.show()

In [None]:
#Adding more features for multiple linear regression

x_train_inter = x_train[['area', 'bathrooms']]


x_train_inter = sm.add_constant(x_train_inter)

lr = sm.OLS(y_train, x_train_inter).fit()


print(lr.summary())


In [None]:
house.columns

In [None]:
#converting all the values to numeric

x_train = x_train.apply(pd.to_numeric, errors='coerce')
y_train = pd.to_numeric(y_train, errors='coerce')


In [None]:
x_train_inter = sm.add_constant(x_train)

lr= sm.OLS(y_train, x_train_inter).fit()

print(lr.summary())

In [None]:
#getting only the parameters

lr.params

In [None]:
#Removing the values which have p value more than 0.05

# Dropping highly correlated variables and insignificant variables
x = x_train.drop('bedrooms', axis=1)

In [None]:
x_train_inter = sm.add_constant(x)

lr= sm.OLS(y_train, x_train_inter).fit()

print(lr.summary())

In [None]:
#all this again for furnishing status

#Removing the values which have p value more than 0.05 after changing bedroom we get to see that guestroom has also grown

# Dropping highly correlated variables and insignificant variables
x = x_train.drop(['furnishingstatus','guestroom','bedrooms'], axis=1)

x_train_inter = sm.add_constant(x)

lr= sm.OLS(y_train, x_train_inter).fit()

print(lr.summary())

In [None]:
#Checking if the error is distributed equally or not

y_train_predict=lr.predict(x_train_inter)
res=(y_train - y_train_predict )

fig = plt.figure()
sns.distplot(res,bins=15)
plt.show()

In [None]:
#Checking if our model is predicting correct results or not
#now converting all values to numeric

x_test = x_test.apply(pd.to_numeric, errors='coerce')

x_test.info()

In [None]:
x_test_inter=sm.add_constant(x_test)
y_predict=lr.predict(x_test_inter)

In [None]:
#It is time for Evaluation of our model

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [None]:
#R2 score
r2_score(y_test,y_predict)

In [None]:
#Returns the mean squared error; we'll take a square root
#RMSE For test model
np.sqrt(mean_squared_error(y_test, y_predict))

## 2nd way to create a linear Regression Model


In [None]:
from sklearn.model_selection import train_test_split
np.random.seed(0)
x_train_lm, x_test_lm, y_train_lm, y_test_lm = train_test_split(x, y, train_size = 0.8, test_size = 0.2, random_state = 50)

In [None]:
print(x_train_lm.shape)
print(y_train_lm.shape)
print(x_test_lm.shape)
print(y_test_lm.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Representing LinearRegression as lr(Creating LinearRegression Object)
lm = LinearRegression()

# Fit the model using lr.fit()
lm.fit(x_train_lm, y_train_lm)

In [None]:
print(lm.intercept_)
print(lm.coef_)