# Multiple Linear Regression using Scikit Learn

## Multiple Linear Regression Equation

### y = a0 + a1*X 1 + a2*X2 + a3*X3 + a4*X4 +a5*X5

In [1]:
#Import Libraries
import numpy as np
import pandas as pd

In [2]:
#Read Restaurant Profit Dara .csv file and divide the data into dependent and independent variables.
data = pd.read_csv('Restaurant_Profit_Data.csv')
data.head()

Unnamed: 0,Miscellaneous_Expenses,Food_Innovation_Spend,Advertising,City,Profit
0,138671.8,167497.2,475918.1,Chicago,202443.83
1,153151.59,164745.7,448032.53,Mumbai,201974.06
2,102919.55,155589.51,412068.54,Tokyo,201232.39
3,120445.85,146520.41,387333.62,Chicago,193083.99
4,93165.77,144255.34,370302.42,Tokyo,176369.94


In [3]:
#Check the shape of the data
data.shape

(50, 5)

In [4]:
#Create Feature Matrix and Dependent Variable Vector
X = data.iloc[:, :-1].values
y = data.iloc[:, 4].values

In [5]:
#Didplay value of Feature Matrix
X

array([[138671.8, 167497.2, 475918.1, 'Chicago'],
       [153151.59, 164745.7, 448032.53, 'Mumbai'],
       [102919.55, 155589.51, 412068.54, 'Tokyo'],
       [120445.85, 146520.41, 387333.62, 'Chicago'],
       [93165.77, 144255.34, 370302.42, 'Tokyo'],
       [101588.71, 134024.9, 366995.36, 'Chicago'],
       [148972.87, 136763.46, 131850.82, 'Mumbai'],
       [147304.06, 132446.13, 328010.68, 'Tokyo'],
       [150492.95, 122690.52, 315747.29, 'Chicago'],
       [110453.17, 125482.88, 309115.62, 'Mumbai'],
       [112368.11, 104061.08, 233294.95, 'Tokyo'],
       [93564.61, 102819.96, 253878.55, 'Mumbai'],
       [129094.38, 96011.75, 253973.44, 'Tokyo'],
       [137269.07, 94140.39, 256798.93, 'Mumbai'],
       [158321.42, 122091.24, 260646.92, 'Tokyo'],
       [124390.84, 116671.61, 265910.23, 'Chicago'],
       [123371.55, 80161.11, 268480.06, 'Mumbai'],
       [146851.58, 96805.16, 286708.31, 'Chicago'],
       [115949.79, 93897.16, 299053.57, 'Tokyo'],
       [155288.11, 88567.

In [6]:
#Display value of Dependent Variable vector
y

array([ 202443.83,  201974.06,  201232.39,  193083.99,  176369.94,
        167173.12,  166304.51,  165934.6 ,  162393.77,  159941.96,
        156303.95,  154441.4 ,  151767.52,  144489.35,  142784.65,
        140099.04,  137174.93,  135552.37,  134448.9 ,  132958.86,
        128656.03,  121495.02,  120534.25,  118915.99,  118734.04,
        117586.34,  115915.54,  115190.31,  113464.38,  111186.64,
        110119.59,  107665.56,  107609.84,  106960.92,  106894.8 ,
        106661.51,  100890.19,  100131.14,   91411.06,   91187.76,
         88421.91,   87980.83,   81680.49,   79940.98,   75382.33,
         75108.08,   59672.75,   52741.73,   45855.41,   24863.4 ])

In [7]:
# One Hot Encoding of categorical column called City
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
lblencoder = LabelEncoder()
X[:, 3] = lblencoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [8]:
#Display feature matrix once the one hot encoding is done
X.astype(int)

array([[     1,      0,      0, 138671, 167497, 475918],
       [     0,      1,      0, 153151, 164745, 448032],
       [     0,      0,      1, 102919, 155589, 412068],
       [     1,      0,      0, 120445, 146520, 387333],
       [     0,      0,      1,  93165, 144255, 370302],
       [     1,      0,      0, 101588, 134024, 366995],
       [     0,      1,      0, 148972, 136763, 131850],
       [     0,      0,      1, 147304, 132446, 328010],
       [     1,      0,      0, 150492, 122690, 315747],
       [     0,      1,      0, 110453, 125482, 309115],
       [     0,      0,      1, 112368, 104061, 233294],
       [     0,      1,      0,  93564, 102819, 253878],
       [     0,      0,      1, 129094,  96011, 253973],
       [     0,      1,      0, 137269,  94140, 256798],
       [     0,      0,      1, 158321, 122091, 260646],
       [     1,      0,      0, 124390, 116671, 265910],
       [     0,      1,      0, 123371,  80161, 268480],
       [     1,      0,      0,

In [9]:
# Get rid of the Dummy Variable (Column called Chicago) 
X = X[:, 1:]

In [11]:
#Diaplay the value of feature matrix after dummy variable is removed
X.astype(int)

array([[     0,      0, 138671, 167497, 475918],
       [     1,      0, 153151, 164745, 448032],
       [     0,      1, 102919, 155589, 412068],
       [     0,      0, 120445, 146520, 387333],
       [     0,      1,  93165, 144255, 370302],
       [     0,      0, 101588, 134024, 366995],
       [     1,      0, 148972, 136763, 131850],
       [     0,      1, 147304, 132446, 328010],
       [     0,      0, 150492, 122690, 315747],
       [     1,      0, 110453, 125482, 309115],
       [     0,      1, 112368, 104061, 233294],
       [     1,      0,  93564, 102819, 253878],
       [     0,      1, 129094,  96011, 253973],
       [     1,      0, 137269,  94140, 256798],
       [     0,      1, 158321, 122091, 260646],
       [     0,      0, 124390, 116671, 265910],
       [     1,      0, 123371,  80161, 268480],
       [     0,      0, 146851,  96805, 286708],
       [     0,      1, 115949,  93897, 299053],
       [     0,      0, 155288,  88567,   4134],
       [     1,     

In [13]:
#Split the data into training and test datasets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [14]:
len(X_train)

35

In [16]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
MLR = LinearRegression()
MLR.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [24]:
#Print the constant and coeffcients
print(f'constant = {MLR.intercept_}')
print(f'coefficients = {MLR.coef_}')

constant = 51010.161006789844
coefficients = [ -3.04799573e+02   1.58229418e+02   3.01968165e-02   7.90840255e-01
   3.10148566e-02]


In [25]:
# Predicting the Test set results
y_predict = MLR.predict(X_test)

In [26]:
# Display actual and predicted values side by side
df = pd.DataFrame(data=y_test, columns=['y_test'])
df['y_predict'] = y_predict
df

Unnamed: 0,y_test,y_predict
0,113464.38,114464.764722
1,154441.4,142718.884992
2,156303.95,144092.850078
3,87980.83,82766.774894
4,201232.39,190102.927619
5,115190.31,124731.310792
6,91411.06,76626.432613
7,107665.56,108586.968401
8,120534.25,124681.828086
9,176369.94,179549.506399


In [27]:
# Predicting the sigle observation results. Here 1,0 represents that the city is Mumbai
singl_obs = [1,0,160349,134321,401400]
feature_array = np.array(singl_obs)
feature_array = feature_array.reshape(1, -1)
y_pred_single_obs = MLR.predict(feature_array)
round(float(y_pred_single_obs), 2)

174223.21

In [28]:
#Model Evaluation using R-Square
from sklearn import metrics
r_square = metrics.r2_score(y_test, y_predict)
print('R-Square Error:', r_square)

R-Square Error: 0.935868097005


In [29]:
# We have 5 column in Multiple Linear Regression Equation : Profit = a0+a1*Miscellaneous_Expenses+a2*Food_Innovation_Spend+
# a3*Advertising+a4*Mumbai+a5*Tokyo
#Model Evaluation using Adjusted R-Square. 
# Here n = no. of observations and p = no. of independent variables
n = 50
p = 5
Adj_r_square = 1-(1-r_square)*(n-1)/(n-p-1)
print('Adjusted R-Square Error:', Adj_r_square)

Adjusted R-Square Error: 0.928580380755
