# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LinearRegression

## Importing the dataset

In [5]:
# Load the housing data; the relative path is resolved against the current working directory.
data_set = pd.read_csv("housing.csv")
# Plain-text preview of the frame; pandas abbreviates the middle rows with "..".
print(data_set)

        price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0    13300000  7420         4          2        3      yes        no       no   
1    12250000  8960         4          4        4      yes        no       no   
2    12250000  9960         3          2        2      yes        no      yes   
3    12215000  7500         4          2        2      yes        no      yes   
4    11410000  7420         4          1        2      yes       yes      yes   
..        ...   ...       ...        ...      ...      ...       ...      ...   
540   1820000  3000         2          1        1      yes        no      yes   
541   1767150  2400         3          1        1       no        no       no   
542   1750000  3620         2          1        1      yes        no       no   
543   1750000  2910         3          1        1       no        no       no   
544   1750000  3850         3          1        2      yes        no       no   

    hotwaterheating aircond

In [6]:
# Split the frame by column position: every column after the first becomes the
# feature matrix, and the first column (price in the printout above) is the target.
x = data_set.iloc[:, 1:].to_numpy()
y = data_set.iloc[:, 0].to_numpy()

In [7]:
# Inspect the raw feature matrix (NumPy abbreviates large arrays with "...").
print(x)

[[7420 4 2 ... 2 'yes' 'furnished']
 [8960 4 4 ... 3 'no' 'furnished']
 [9960 3 2 ... 2 'yes' 'semi-furnished']
 ...
 [3620 2 1 ... 0 'no' 'unfurnished']
 [2910 3 1 ... 0 'no' 'furnished']
 [3850 3 1 ... 0 'no' 'unfurnished']]


In [8]:
# Inspect the target vector taken from the first column of the data set.
print(y)

[13300000 12250000 12250000 12215000 11410000 10850000 10150000 10150000
  9870000  9800000  9800000  9681000  9310000  9240000  9240000  9100000
  9100000  8960000  8890000  8855000  8750000  8680000  8645000  8645000
  8575000  8540000  8463000  8400000  8400000  8400000  8400000  8400000
  8295000  8190000  8120000  8080940  8043000  7980000  7962500  7910000
  7875000  7840000  7700000  7700000  7560000  7560000  7525000  7490000
  7455000  7420000  7420000  7420000  7350000  7350000  7350000  7350000
  7343000  7245000  7210000  7210000  7140000  7070000  7070000  7035000
  7000000  6930000  6930000  6895000  6860000  6790000  6790000  6755000
  6720000  6685000  6650000  6650000  6650000  6650000  6650000  6650000
  6629000  6615000  6615000  6580000  6510000  6510000  6510000  6475000
  6475000  6440000  6440000  6419000  6405000  6300000  6300000  6300000
  6300000  6300000  6293000  6265000  6230000  6230000  6195000  6195000
  6195000  6160000  6160000  6125000  6107500  6090

## Taking care of missing data

In [9]:
# Replace NaN entries in the first four feature columns (indices 0-3) with the
# mean of the respective column. fit_transform learns the means and applies them
# in one call.
# NOTE(review): the means are learned from all rows, i.e. before the train/test
# split performed later in the notebook - confirm this is acceptable.
numeric_slice = slice(0, 4)
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
x[:, numeric_slice] = imputer.fit_transform(x[:, numeric_slice])
print(x)

[[7420.0 4.0 2.0 ... 2 'yes' 'furnished']
 [8960.0 4.0 4.0 ... 3 'no' 'furnished']
 [9960.0 3.0 2.0 ... 2 'yes' 'semi-furnished']
 ...
 [3620.0 2.0 1.0 ... 0 'no' 'unfurnished']
 [2910.0 3.0 1.0 ... 0 'no' 'furnished']
 [3850.0 3.0 1.0 ... 0 'no' 'unfurnished']]


## Encoding categorical data

In [10]:
# Integer-encode the categorical columns at the positions listed below, refitting
# the same encoder on each column in turn. Column 9 is skipped and column 11 is
# not touched in this cell.
le = LabelEncoder()
label_encoded_columns = [4, 5, 6, 7, 8, 10]
for col in label_encoded_columns:
    x[:, col] = le.fit_transform(x[:, col])

# Wrap the matrix in a labelled frame purely to make the result readable.
feature_names = [
    'area',
    'bedrooms',
    'bathrooms',
    'stories',
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'parking',
    'prefarea',
    'furnishingstatus',
]
df2 = pd.DataFrame(x, columns=feature_names)
print(df2)

       area bedrooms bathrooms stories mainroad guestroom basement  \
0    7420.0      4.0       2.0     3.0        1         0        0   
1    8960.0      4.0       4.0     4.0        1         0        0   
2    9960.0      3.0       2.0     2.0        1         0        1   
3    7500.0      4.0       2.0     2.0        1         0        1   
4    7420.0      4.0       1.0     2.0        1         1        1   
..      ...      ...       ...     ...      ...       ...      ...   
540  3000.0      2.0       1.0     1.0        1         0        1   
541  2400.0      3.0       1.0     1.0        0         0        0   
542  3620.0      2.0       1.0     1.0        1         0        0   
543  2910.0      3.0       1.0     1.0        0         0        0   
544  3850.0      3.0       1.0     2.0        1         0        0   

    hotwaterheating airconditioning parking prefarea furnishingstatus  
0                 0               1       2        1        furnished  
1              

In [11]:
# One-hot encode the last column and pass every other column through unchanged.
# NOTE(review): x is overwritten with the encoded matrix, so running this cell a
# second time would feed the already-encoded matrix back into the transformer.
one_hot_last_column = ('encoder', OneHotEncoder(), [-1])
ct = ColumnTransformer(
    transformers=[one_hot_last_column],
    remainder='passthrough',
)
x = ct.fit_transform(x)

In [12]:
# Inspect the feature matrix returned by the column transformer.
print(x)

[[1.0 0.0 0.0 ... 1 2 1]
 [1.0 0.0 0.0 ... 1 3 0]
 [0.0 1.0 0.0 ... 0 2 1]
 ...
 [0.0 0.0 1.0 ... 0 0 0]
 [1.0 0.0 0.0 ... 0 0 0]
 [0.0 0.0 1.0 ... 0 0 0]]


## Splitting the dataset into the Training set and Test set

In [13]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

In [14]:
# Linear regression estimator created with its default settings.
Regressor = LinearRegression()
# fit() is the last expression of the cell, so the notebook echoes the estimator's repr.
Regressor.fit(x_train,y_train)

LinearRegression()

## Predicting the Test set results

In [15]:
# Predict the target for the held-out test rows and print the raw predictions.
y_predict = Regressor.predict(x_test)
print(y_predict)

[ 3950288.61876186  6173868.81883097  4483635.98836257  7258732.75105262
  2836727.58490489  7032947.09749069  3203851.47112398  3270994.00904059
  3472554.03645921  8289978.32623712  6605321.62954622  3723366.23684097
  3812376.95976089  4548966.84544606  4020476.34849665  1969836.22090145
  4057262.98087852  3704586.86711738  3282767.93188797  4609423.64909565
  5968243.7363715   6363698.62063812  4751300.32389     2659595.2763304
  5305573.24662114  5680819.58784466  5404106.90027136  5543050.52192533
  5768360.47982213  5801753.70839278  3389277.9611061   6399092.02678432
  7081030.31411766  2913042.40387674  4498664.01335428  5210561.68059363
  5013457.84122307  3707596.71347581  2916603.45485344  3937761.75634076
  8041334.20180906  4942174.61142058  6432605.21981749  3511338.78156424
  3813475.39540802  6434856.19540023  4447687.02885143  2696243.71724909
  4180018.7062579   6455973.25779219  4056226.34306795  7124571.30073162
  2530661.67791769  3033278.46419633  3500830.320628

In [16]:
# Side-by-side comparison of the true test targets and the model's predictions.
# Predictions are rounded to two decimals once and reused for the difference column.
# NOTE(review): "Acctual" is misspelled; the label is kept unchanged because it is
# the column key that other cells may look up.
predicted_rounded = np.round(y_predict, 2)
df = pd.DataFrame(
    {
        "Acctual value": y_test,
        "predicted value": predicted_rounded,
        "difference": y_test - predicted_rounded,
    }
)

In [19]:
# Show the actual-vs-predicted comparison table (pandas abbreviates the middle rows).
print(df)

     Acctual value  predicted value  difference
0          4585000       3950288.62   634711.38
1          6083000       6173868.82   -90868.82
2          4007500       4483635.99  -476135.99
3          6930000       7258732.75  -328732.75
4          2940000       2836727.58   103272.42
..             ...              ...         ...
104        6650000       7340959.28  -690959.28
105        5810000       5235408.60   574591.40
106        4123000       4134159.03   -11159.03
107        3080000       5058911.23 -1978911.23
108        5530000       6279957.32  -749957.32

[109 rows x 3 columns]
