# Multiple Linear Regression

## Importing all the libraries

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only provides the legacy module
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as sm
import statsmodels.api as sm_api



## Load the dataset

In [2]:
# Path of the 50-startups CSV. Set the STARTUPS_CSV environment variable to point
# at your own copy of the file; when it is unset, the original author's location
# is used, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    'STARTUPS_CSV',
    '/Users/omkarmutreja/Downloads/Multiple_Linear_Regression/50_Startups.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# Create arrays for dependent and independent variables.
# X holds every column except the last one; y is the last column ('Profit').
# Both selections are expressed relative to the last column, so they stay
# consistent with each other instead of mixing -1 with a hard-coded position.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
print(X.shape)
print(y.shape)

(50, 4)
(50,)


## Data Preprocessing

In [4]:
# Converting categorical variable State into dummy variables.
# The features are re-read from df rather than transformed in place, so this
# cell produces the same X no matter how many times it is run.
# (OneHotEncoder's `categorical_features` argument is avoided on purpose: it was
# removed from scikit-learn in 0.22.)
STATE_COL = 3  # position of the State column among the feature columns
features = df.iloc[:, :-1].values

# Integer-code the state names; LabelEncoder numbers its classes in sorted order.
labelEncoder_X = LabelEncoder()
state_codes = labelEncoder_X.fit_transform(features[:, STATE_COL])

# One-hot encode by indexing an identity matrix: row k is the indicator of class k.
state_dummies = np.eye(len(labelEncoder_X.classes_))[state_codes]

# Drop the first dummy column (avoids the dummy-variable trap) and place the
# remaining dummies in front of the numeric columns, matching the previous layout.
numeric_features = np.delete(features, STATE_COL, axis=1).astype(float)
X = np.hstack([state_dummies[:, 1:], numeric_features])
print(X.shape)
X[:5,]

(50, 5)


array([[0.0000000e+00, 1.0000000e+00, 1.6534920e+05, 1.3689780e+05,
        4.7178410e+05],
       [0.0000000e+00, 0.0000000e+00, 1.6259770e+05, 1.5137759e+05,
        4.4389853e+05],
       [1.0000000e+00, 0.0000000e+00, 1.5344151e+05, 1.0114555e+05,
        4.0793454e+05],
       [0.0000000e+00, 1.0000000e+00, 1.4437241e+05, 1.1867185e+05,
        3.8319962e+05],
       [1.0000000e+00, 0.0000000e+00, 1.4210734e+05, 9.1391770e+04,
        3.6616842e+05]])

## Splitting the data into training and testing datasets

In [5]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
for feature_part in (X_train, X_test):
    print(feature_part.shape)

(40, 5)
(10, 5)


## Fitting a multiple regression model to the training set and predicting the values for the test set

In [6]:
# Fit the linear model on the training rows (fit returns the estimator itself),
# then predict the target for the held-out rows and display the predictions.
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred

array([126362.87908252,  84608.45383643,  99677.49425155,  46357.46068582,
       128750.48288497,  50912.41741905, 109741.350327  , 100643.24281644,
        97599.275746  , 113097.42524437])

## Optimizing our model using Backward Elimination

In [7]:
# Multiple Linear Regression formula : Y = b0X0 + b1X1 + b2X2 + ... + bnXn where X0=1
# statsmodels' OLS does not add the intercept term b0 on its own, so a column of
# ones (X0) has to be prepended to the data set.
# add_constant sizes that column from X itself (no hard-coded row count) and,
# with has_constant='skip', returns X unchanged when a constant column is already
# present -- re-running this cell therefore cannot stack a second column of ones.
X = sm_api.add_constant(X, prepend=True, has_constant='skip')
# Start backward elimination from the full model: intercept plus all five predictors.
X_opt = X[:,[0,1,2,3,4,5]]
# OLS is taken from statsmodels.api; newer statsmodels releases no longer expose
# the OLS class through statsmodels.formula.api.
reg_OLS = sm_api.OLS(endog=y,exog=X_opt).fit()
reg_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Thu, 13 Sep 2018",Prob (F-statistic):,1.34e-27
Time:,03:51:24,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


### Variable x2 has the highest p-value (0.990), so we remove that variable and repeat the process

In [8]:
# Backward elimination, continued. Column indices refer to X after the column of
# ones was prepended: 0 = intercept, 1 and 2 = State dummies (Florida, New York,
# per the preprocessing output), 3 = R&D Spend, 4 = Administration,
# 5 = Marketing Spend.
# Each entry lists the columns still kept after removing the variable that had
# the highest p-value in the previous fit.
elimination_steps = [
    [0,1,3,4,5],  # column 2 removed (highest p-value in the full model)
    [0,3,4,5],    # this time column 1 has the highest p-value and is removed
    [0,3,5],      # this time column 4 has the highest p-value and is removed
    [0,3],        # this time column 5 has the highest p-value and is removed
]
for kept_columns in elimination_steps:
    X_opt = X[:,kept_columns]
    # OLS comes from statsmodels.api; newer statsmodels releases no longer expose
    # it through statsmodels.formula.api.
    reg_OLS = sm_api.OLS(endog=y,exog=X_opt).fit()
    # Print the p-values of every step so the removal decisions above are
    # visible; a bare summary() call in the middle of a cell is never displayed.
    print(kept_columns, reg_OLS.pvalues.round(3))
# Full summary of the final model (intercept + R&D Spend only).
reg_OLS.summary()
# Hence, we see that R&D Spend turns out to be the most significant independent variable for predicting the profit

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Thu, 13 Sep 2018",Prob (F-statistic):,3.5000000000000004e-32
Time:,03:51:24,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
