In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

# Splitting the dataset into test set and training set
from sklearn.model_selection import train_test_split



# Feature scaling: in practice, algorithms converge much faster when the data is scaled properly

# Standardization: x_scaled = (x - mean(x)) / sd(x)
# Normalization:   x_scaled = (x - min(x)) / (max(x) - min(x))

from sklearn.preprocessing import  StandardScaler


# Load the startup data; the last column is the value to predict
dataset = pd.read_csv('50_Startups.csv')

# Feature matrix: every column except the last one
X = dataset.iloc[:, :-1].values

# Target vector: the last column, selected by position from the end so it
# stays consistent with the slice used for X
Y = dataset.iloc[:, -1].values

# Show the column names as the cell output
dataset.columns

Index([u'R&D Spend', u'Administration', u'Marketing Spend', u'State',
       u'Profit'],
      dtype='object')

In [93]:
# Encode the categorical (non-numeric) column -- State, at index 3 -- as dummy variables

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

labelEncoder = LabelEncoder()

# print(...) with a single argument behaves the same on Python 2 and Python 3
print(X[0:5, :])
print('_' * 40)

# OneHotEncoder is applied to numeric codes here, so the State strings are
# first mapped to integers with LabelEncoder
X[:, 3] = labelEncoder.fit_transform(X[:, 3])

# One-hot encode column 3; in the printed result the dummy columns come first,
# followed by the remaining numeric columns
# NOTE(review): `categorical_features` was deprecated in later scikit-learn
# releases (ColumnTransformer replaces it) -- confirm the installed version
oneHotEncoder = OneHotEncoder(categorical_features=[3])
X = oneHotEncoder.fit_transform(X).toarray()

# Print floats without scientific notation
np.set_printoptions(suppress=True)
print(X[0:5, :])
print('_' * 40)

# Avoid the dummy variable trap: drop the first dummy column
X = X[:, 1:]
print(X[0:5, :])

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']]
________________________________________
[[      0.         0.         1.    165349.2   136897.8   471784.1 ]
 [      1.         0.         0.    162597.7   151377.59  443898.53]
 [      0.         1.         0.    153441.51  101145.55  407934.54]
 [      0.         0.         1.    144372.41  118671.85  383199.62]
 [      0.         1.         0.    142107.34   91391.77  366168.42]]
________________________________________
[[      0.         1.    165349.2   136897.8   471784.1 ]
 [      0.         0.    162597.7   151377.59  443898.53]
 [      1.         0.    153441.51  101145.55  407934.54]
 [      0.         1.    144372.41  118671.85  383199.62]
 [      1.         0.    142107.34   91391.77  366168.42]]


In [94]:
# Hold out 20% of the rows as the test set. The fixed random_state only makes
# the split repeatable from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Display the training targets
Y_train

array([  96778.92,   96479.51,  105733.54,   96712.8 ,  124266.9 ,
        155752.6 ,  132602.65,   64926.08,   35673.41,  101004.64,
        129917.04,   99937.59,   97427.84,  126992.93,   71498.49,
        118474.03,   69758.98,  152211.77,  134307.35,  107404.34,
        156991.12,  125370.37,   78239.91,   14681.4 ,  191792.06,
        141585.52,   89949.14,  108552.04,  156122.51,  108733.99,
         90708.19,  111313.02,  122776.86,  149759.96,   81005.76,
         49490.75,  182901.99,  192261.83,   42559.73,   65200.33])

In [95]:
# Fit a multiple linear regression on the training set

from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained
regressor = LinearRegression().fit(X_train, Y_train)

# Display the fitted estimator as the cell output
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [96]:
# Predict the target for the held-out test rows with the fitted linear regressor

Y_pred = regressor.predict(X_test)

In [97]:
# Building the optimal model using backward elimination

# OLS lives in statsmodels.api; newer statsmodels releases no longer expose it
# through statsmodels.formula.api (which is for the formula interface)
import statsmodels.api as sm

# Significance level against which the p-values in the summary are compared
sigVal = 0.05

# statsmodels' OLS does not add an intercept on its own, so a column of ones
# (x0) is placed in front of the features. axis=1 joins the arrays column-wise;
# putting the ones first keeps the constant at index 0.
# NOTE: this rebinds X, so re-running the cell prepends another column of ones.
X = np.append(arr=np.ones((X.shape[0], 1)), values=X, axis=1)

# 1st Iteration
# Start with every column: the constant plus all five predictors
X_opt = X[:, [0, 1, 2, 3, 4, 5]]

# endog is the dependent variable, exog is the matrix of independent variables
regressor_OLS = sm.OLS(endog=Y, exog=X_opt).fit()

# Detailed statistical report of the fit (coefficients, p-values, R-squared, ...)
regressor_OLS.summary()

# In the summary "const" is x0, i.e. the column with index 0
# x2 has the highest p-value, so remove it and retrain the model


0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Tue, 09 Jan 2018",Prob (F-statistic):,1.34e-27
Time:,00:26:37,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [98]:

# Backward elimination, 2nd iteration:
# column 2 of X is left out, every other column is kept
X_opt = X[:, [0, 1, 3, 4, 5]]

# First argument is the dependent variable (endog), second the matrix of
# independent variables (exog)
regressor_OLS = sm.OLS(Y, X_opt).fit()

# Show the full statistical report for this fit
regressor_OLS.summary()

# "const" in the summary is x0, i.e. column 0 of X
# x1 (column 1 of X) has the highest p-value, so it is removed before retraining

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Tue, 09 Jan 2018",Prob (F-statistic):,8.49e-29
Time:,00:26:38,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [99]:

# Backward elimination, 3rd iteration:
# column 1 of X is left out as well
X_opt = X[:, [0, 3, 4, 5]]

# First argument is the dependent variable (endog), second the matrix of
# independent variables (exog)
regressor_OLS = sm.OLS(Y, X_opt).fit()

# Show the full statistical report for this fit
regressor_OLS.summary()

# "const" in the summary is x0, i.e. column 0 of X
# x2 in the summary (column 4 of X) has the highest p-value, so it is removed
# before retraining

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Tue, 09 Jan 2018",Prob (F-statistic):,4.53e-30
Time:,00:27:56,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [100]:

# Backward elimination, 4th iteration:
# column 4 of X is left out as well
X_opt = X[:, [0, 3, 5]]

# First argument is the dependent variable (endog), second the matrix of
# independent variables (exog)
regressor_OLS = sm.OLS(Y, X_opt).fit()

# Show the full statistical report for this fit
regressor_OLS.summary()

# "const" in the summary is x0, i.e. column 0 of X
# x2 in the summary (column 5 of X) has the highest p-value, so it is removed
# before retraining

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Tue, 09 Jan 2018",Prob (F-statistic):,2.1600000000000003e-31
Time:,00:29:55,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [102]:

# Backward elimination, 5th iteration:
# column 5 of X is left out, keeping only the constant and column 3
X_opt = X[:, [0, 3]]

# First argument is the dependent variable (endog), second the matrix of
# independent variables (exog)
regressor_OLS = sm.OLS(Y, X_opt).fit()

# Show the full statistical report for this fit
regressor_OLS.summary()

# Every p-value is now below sigVal, so this is the final model

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Tue, 09 Jan 2018",Prob (F-statistic):,3.5000000000000004e-32
Time:,00:31:26,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
