# Multiple Linear Regression

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [2]:
# Load the startup dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('50_Startups.csv')

In [3]:
# Preview the first rows to check the columns: three spend features, State, and Profit.
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Feature matrix: every column except the last one (still holds the raw 'State' strings).
# Target vector: the last column of the frame.
X, Y = data.iloc[:, :-1].to_numpy(), data.iloc[:, -1].to_numpy()

## Encoding categorical data

In [5]:
# List the distinct categories in 'State'; each one becomes a dummy column when encoded.
data['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the 'State' column (index 3 of the feature matrix) and pass the
# numeric columns through unchanged. In the result the dummy columns come first,
# followed by the passthrough columns in their original order.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')

# Encode from the raw frame rather than from X itself, so that re-running this
# cell does not try to encode a matrix that has already been encoded.
X = ct.fit_transform(data.iloc[:, :-1].values)
X[:5, :]

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42]], dtype=object)

## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

In [8]:
from sklearn.linear_model import LinearRegression
# Linear regression estimator created with its default settings (not fitted yet).
regressor = LinearRegression()


In [9]:
# Fit on the training split: three state dummy columns followed by the three spend columns.
regressor.fit(x_train, y_train)

## Predicting the Test set results

In [10]:
# Show floats with two decimals in the array printouts that follow.
np.set_printoptions(precision=2)
# Predicted profit for every row of the held-out test split.
y_predict = regressor.predict(x_test)

In [11]:
# Predictions (left column) next to the actual test targets (right column).
np.column_stack((y_predict, y_test))

array([[103015.2 , 103282.38],
       [132582.28, 144259.4 ],
       [132447.74, 146121.95],
       [ 71976.1 ,  77798.83],
       [178537.48, 191050.39],
       [116161.24, 105008.31],
       [ 67851.69,  81229.06],
       [ 98791.73,  97483.56],
       [113969.44, 110352.25],
       [167921.07, 166187.94]])

## Making a Single Prediction

Making a single prediction (for example the profit of a startup with R&D Spend = 160000, Administration Spend = 130000, Marketing Spend = 300000 and State = 'California')

In [12]:
# One hand-built observation in the column order the model was trained on:
# state dummies (California, Florida, New York), then R&D, Administration, Marketing.
# It is converted to a plain array because the regressor was fitted on an array
# without feature names; passing a DataFrame makes scikit-learn warn about the
# mismatch. This also matches how the later single-prediction cell prepares its input.
singel_data = pd.DataFrame({'California':[1.0],
                            'Florida':[0.0],
                            'New York':[0.0],
                            'R&D spend': [160000],
                            'Administration': [130000],
                            'Marketing Spend': [300000]}).to_numpy()

In [13]:
# Predicted profit for the single hand-built observation.
regressor.predict(singel_data)



array([181566.92])

In [14]:
# Fitted coefficients (one per input column) on the first line, intercept on the second.
print(regressor.coef_, regressor.intercept_, sep='\n')

[ 8.66e+01 -8.73e+02  7.86e+02  7.73e-01  3.29e-02  3.66e-02]
42467.529248548686


Therefore, the equation of our multiple linear regression model is:

$$\textrm{Profit} = 86.6 \times \textrm{Dummy State 1} - 873 \times \textrm{Dummy State 2} + 786 \times \textrm{Dummy State 3} + 0.773 \times \textrm{R D Spend} + 0.0329 \times \textrm{Administration} + 0.0366 \times \textrm{Marketing Spend} + 42467.53$$

## Backward Elimination

In [15]:
import statsmodels.api as sm

In [16]:
# Append a column of ones as the LAST column of X; it serves as the intercept term
# for the statsmodels fit below. The row count is read from X instead of being
# hard-coded to 50, so the cell keeps working if the dataset size changes.
# NOTE: this rebinds X, so running the cell again appends a further column.
X = np.append(arr=X, values=np.ones((X.shape[0], 1)).astype(int), axis=1)

In [17]:
# Columns kept for this fit: 0-2 are the state dummies, 3 is R&D Spend, 4 is
# Administration and 6 is the appended column of ones. Column 5 (Marketing Spend)
# is left out.
# NOTE(review): all three state dummies are kept together with the constant column,
# so the design matrix is collinear (the summary reports a very large condition
# number) — consider dropping one dummy.
kept_columns = [0, 1, 2, 3, 4, 6]
x_opt = X[:, kept_columns]
# X is an object array after encoding, hence the explicit float casts.
ols_model = sm.OLS(endog=Y.astype(float), exog=x_opt.astype(float))
regressor_OlS = ols_model.fit()

In [18]:
# Display the fit report; the P>|t| column is what backward elimination inspects.
regressor_OlS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.948
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,205.0
Date:,"Thu, 08 Sep 2022",Prob (F-statistic):,2.9e-28
Time:,15:23:53,Log-Likelihood:,-526.75
No. Observations:,50,AIC:,1064.0
Df Residuals:,45,BIC:,1073.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.339e+04,2421.500,5.529,0.000,8511.111,1.83e+04
x2,1.448e+04,2518.987,5.748,0.000,9405.870,1.96e+04
x3,1.335e+04,2459.306,5.428,0.000,8395.623,1.83e+04
x4,0.8609,0.031,27.665,0.000,0.798,0.924
x5,-0.0527,0.050,-1.045,0.301,-0.154,0.049
const,4.122e+04,4607.941,8.945,0.000,3.19e+04,5.05e+04

0,1,2,3
Omnibus:,14.275,Durbin-Watson:,1.197
Prob(Omnibus):,0.001,Jarque-Bera (JB):,19.26
Skew:,-0.953,Prob(JB):,6.57e-05
Kurtosis:,5.369,Cond. No.,1.16e+21


In [19]:
# Re-split using the reduced feature matrix, with the same ratio and seed as the
# first split. This rebinds the earlier x_train / x_test / y_train / y_test.
x_train, x_test, y_train, y_test = train_test_split(
    x_opt,
    Y,
    test_size=0.2,
    random_state=0,
)

In [20]:
from sklearn.linear_model import LinearRegression
# Fresh, unfitted estimator; rebinding `regressor` discards the model trained on all features.
regressor = LinearRegression()

In [21]:
# Fit on the reduced training split (state dummies, R&D, Administration, constant column).
regressor.fit(x_train, y_train)

In [22]:
# Keep array printouts at two decimals.
np.set_printoptions(precision=2)
# Predicted profit for the held-out rows using the reduced feature set.
y_predict = regressor.predict(x_test)

In [23]:
# Reduced-model predictions (left column) next to the actual test targets (right column).
np.column_stack((y_predict, y_test))

array([[104869.72, 103282.38],
       [134022.39, 144259.4 ],
       [135532.17, 146121.95],
       [ 72539.51,  77798.83],
       [179430.51, 191050.39],
       [109808.57, 105008.31],
       [ 65733.62,  81229.06],
       [100423.19,  97483.56],
       [111735.69, 110352.25],
       [169794.73, 166187.94]])

## Making a single prediction (for example the profit of a startup with R&D Spend = 160000, Administration Spend = 130000, Marketing Spend = 300000 and State = 'California')

In [24]:
# One hand-built observation for the reduced model. The model was trained on
# x_opt = X[:, [0, 1, 2, 3, 4, 6]], i.e. state dummies, R&D, Administration and the
# appended column of ones, so the sixth input must be that constant (1), not
# Marketing Spend, which was dropped from the feature set.
singel_data = pd.DataFrame({'California':[1.0],
                            'Florida':[0.0],
                            'New York':[0.0],
                            'R&D spend': [160000],
                            'Administration': [130000],
                            'const': [1]})
# Plain array: the regressor was fitted on an array without feature names.
singel_data = singel_data.values

In [25]:
# Predicted profit for the single observation using the reduced model.
regressor.predict(singel_data)

array([184482.12])

In [26]:
# Coefficients of the reduced model on the first line, intercept on the second.
# The last coefficient belongs to the constant column of x_opt.
print(regressor.coef_, regressor.intercept_, sep='\n')

[-2.19e+02  2.65e+02 -4.61e+01  8.52e-01 -1.70e-03  0.00e+00]
48663.83653587549


Therefore, the equation of our multiple linear regression model is:

$$\textrm{Profit} = - 219 \times \textrm{Dummy State 1} + 265 \times \textrm{Dummy State 2} - 46.1 \times \textrm{Dummy State 3} + 0.852 \times \textrm{R D Spend} - 0.0017 \times \textrm{Administration} + 0.0 \times \textrm{Constant Column} + 48663.84$$