# Problem statement

A VC firm has a list of startups and their spending in three areas: R&D, administration, and marketing. Each startup's profit is also given. We need to build a model that captures the linear relationship between spending in these areas and profit, so that we can identify which area matters most for achieving high profit. The VC firm can then invest in companies that plan to spend more in the areas the model identifies as important. In addition, the model lets us predict a startup's profit from its spending in these areas.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the startups data set from its relative path and show it.
csv_path = '../datasets/002_multiple_linear_regression/50_Startups.csv'
dataset = pd.read_csv(csv_path)
print(dataset)

    R&D Spend  Administration  Marketing Spend       State     Profit
0   165349.20       136897.80        471784.10    New York  192261.83
1   162597.70       151377.59        443898.53  California  191792.06
2   153441.51       101145.55        407934.54     Florida  191050.39
3   144372.41       118671.85        383199.62    New York  182901.99
4   142107.34        91391.77        366168.42     Florida  166187.94
5   131876.90        99814.71        362861.36    New York  156991.12
6   134615.46       147198.87        127716.82  California  156122.51
7   130298.13       145530.06        323876.68     Florida  155752.60
8   120542.52       148718.95        311613.29    New York  152211.77
9   123334.88       108679.17        304981.62  California  149759.96
10  101913.08       110594.11        229160.95     Florida  146121.95
11  100671.96        91790.61        249744.55  California  144259.40
12   93863.75       127320.38        249839.44     Florida  141585.52
13   91992.39       

In [2]:
# Prepare dataset
#
# Features are every column except the last one; the target (Profit) is the
# last column. Both selections are expressed relative to the end of the frame
# so they cannot drift apart if a column is ever inserted before Profit.
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
print(x)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

In [3]:
# Encode categorical data
#
# `OneHotEncoder(categorical_features=...)` was removed in scikit-learn 0.22.
# ColumnTransformer is the supported way to one-hot encode a single column
# while leaving the other columns untouched, and it handles string categories
# directly, so no LabelEncoder pass is needed.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

STATE_COLUMN = 3  # position of the categorical 'State' column in x

state_encoder = ColumnTransformer(
    transformers=[('state', OneHotEncoder(), [STATE_COLUMN])],
    remainder='passthrough',  # keep the numeric spend columns as they are
    sparse_threshold=0,       # always return a dense array
)

# The dummy columns come first (categories in sorted order), followed by the
# passthrough columns. The input is an object array, so cast to float.
# NOTE: x is overwritten here; re-run the "Prepare dataset" cell before
# re-running this one.
x = np.asarray(state_encoder.fit_transform(x), dtype=float)
np.set_printoptions(suppress=True)
print(x)

[[      0.         0.         1.    165349.2   136897.8   471784.1 ]
 [      1.         0.         0.    162597.7   151377.59  443898.53]
 [      0.         1.         0.    153441.51  101145.55  407934.54]
 [      0.         0.         1.    144372.41  118671.85  383199.62]
 [      0.         1.         0.    142107.34   91391.77  366168.42]
 [      0.         0.         1.    131876.9    99814.71  362861.36]
 [      1.         0.         0.    134615.46  147198.87  127716.82]
 [      0.         1.         0.    130298.13  145530.06  323876.68]
 [      0.         0.         1.    120542.52  148718.95  311613.29]
 [      1.         0.         0.    123334.88  108679.17  304981.62]
 [      0.         1.         0.    101913.08  110594.11  229160.95]
 [      1.         0.         0.    100671.96   91790.61  249744.55]
 [      0.         1.         0.     93863.75  127320.38  249839.44]
 [      1.         0.         0.     91992.39  135495.07  252664.93]
 [      0.         1.         0.  

In [4]:
# Avoiding dummy variable trap
# Keep every column from index 1 onward, i.e. discard the first dummy column.
all_but_first = slice(1, None)
x = x[:, all_but_first]
print(x)

[[      0.         1.    165349.2   136897.8   471784.1 ]
 [      0.         0.    162597.7   151377.59  443898.53]
 [      1.         0.    153441.51  101145.55  407934.54]
 [      0.         1.    144372.41  118671.85  383199.62]
 [      1.         0.    142107.34   91391.77  366168.42]
 [      0.         1.    131876.9    99814.71  362861.36]
 [      0.         0.    134615.46  147198.87  127716.82]
 [      1.         0.    130298.13  145530.06  323876.68]
 [      0.         1.    120542.52  148718.95  311613.29]
 [      0.         0.    123334.88  108679.17  304981.62]
 [      1.         0.    101913.08  110594.11  229160.95]
 [      0.         0.    100671.96   91790.61  249744.55]
 [      1.         0.     93863.75  127320.38  249839.44]
 [      0.         0.     91992.39  135495.07  252664.93]
 [      1.         0.    119943.24  156547.42  256512.92]
 [      0.         1.    114523.61  122616.84  261776.23]
 [      0.         0.     78013.11  121597.55  264346.06]
 [      0.    

In [6]:
# split test and train
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Report the split sizes (training rows, then test rows) so the recorded cell
# output is actually produced by the code in this cell.
print(len(x_train))
print(len(x_test))

40
10


In [7]:
from sklearn.linear_model import LinearRegression

# Fit a multiple linear regression on the training split, then display the
# fitted estimator as the cell result.
regressor = LinearRegression().fit(x_train, y_train)
regressor


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
# test predictions on test set
# Predict for the held-out rows and print the predictions above the actual
# values so they can be compared by eye.
y_predict = regressor.predict(x_test)
print(y_predict, y_test, sep='\n')

[ 103015.20159796  132582.27760815  132447.73845175   71976.09851258
  178537.48221056  116161.24230166   67851.69209676   98791.73374687
  113969.43533013  167921.06569551]
[ 103282.38  144259.4   146121.95   77798.83  191050.39  105008.31
   81229.06   97483.56  110352.25  166187.94]


In [10]:
# build optimal model using backward elimination
#
# `statsmodels.api` is the module that exposes the array-based `OLS` class used
# by the following cells. `statsmodels.formula.api` only provides the formula
# interface (lowercase `ols`), so `sm.OLS` fails there on current releases.
import statsmodels.api as sm
print(x)
# OLS does not add an intercept term by itself, so prepend a column of ones.
# `add_constant` sizes the column from x (no hard-coded row count) and, by
# default, leaves x unchanged when it already contains a constant column, so
# re-running this cell does not stack a second intercept.
x = sm.add_constant(x, prepend=True)
print(x)

[[      0.         1.    165349.2   136897.8   471784.1 ]
 [      0.         0.    162597.7   151377.59  443898.53]
 [      1.         0.    153441.51  101145.55  407934.54]
 [      0.         1.    144372.41  118671.85  383199.62]
 [      1.         0.    142107.34   91391.77  366168.42]
 [      0.         1.    131876.9    99814.71  362861.36]
 [      0.         0.    134615.46  147198.87  127716.82]
 [      1.         0.    130298.13  145530.06  323876.68]
 [      0.         1.    120542.52  148718.95  311613.29]
 [      0.         0.    123334.88  108679.17  304981.62]
 [      1.         0.    101913.08  110594.11  229160.95]
 [      0.         0.    100671.96   91790.61  249744.55]
 [      1.         0.     93863.75  127320.38  249839.44]
 [      0.         0.     91992.39  135495.07  252664.93]
 [      1.         0.    119943.24  156547.42  256512.92]
 [      0.         1.    114523.61  122616.84  261776.23]
 [      0.         0.     78013.11  121597.55  264346.06]
 [      0.    

In [13]:
# Backward elimination, step 1: start from the full model, i.e. the intercept
# column (0) plus all five predictor columns (1-5).
all_columns = list(range(6))
x_opt = x[:, all_columns]
print(x_opt)
regressor_OLS = sm.OLS(y, x_opt).fit()

[[      1.         0.         1.    165349.2   136897.8   471784.1 ]
 [      1.         0.         0.    162597.7   151377.59  443898.53]
 [      1.         1.         0.    153441.51  101145.55  407934.54]
 [      1.         0.         1.    144372.41  118671.85  383199.62]
 [      1.         1.         0.    142107.34   91391.77  366168.42]
 [      1.         0.         1.    131876.9    99814.71  362861.36]
 [      1.         0.         0.    134615.46  147198.87  127716.82]
 [      1.         1.         0.    130298.13  145530.06  323876.68]
 [      1.         0.         1.    120542.52  148718.95  311613.29]
 [      1.         0.         0.    123334.88  108679.17  304981.62]
 [      1.         1.         0.    101913.08  110594.11  229160.95]
 [      1.         0.         0.    100671.96   91790.61  249744.55]
 [      1.         1.         0.     93863.75  127320.38  249839.44]
 [      1.         0.         0.     91992.39  135495.07  252664.93]
 [      1.         1.         0.  

In [14]:
# Show the fit statistics of the full model; the P>|t| column is what decides
# which predictor is removed in the next elimination step.
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sat, 23 Sep 2017",Prob (F-statistic):,1.34e-27
Time:,12:13:47,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [15]:
# Backward elimination, step 2: drop column 2 (the second state dummy, which
# had the highest p-value in the full model) and refit.
kept_columns = [0, 1, 3, 4, 5]
x_opt = x[:, kept_columns]
regressor_OLS = sm.OLS(y, x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Sat, 23 Sep 2017",Prob (F-statistic):,8.49e-29
Time:,12:17:06,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [16]:
# Backward elimination, step 3: drop column 1 (the remaining state dummy),
# leaving the intercept and the three spend columns.
kept_columns = [0, 3, 4, 5]
x_opt = x[:, kept_columns]
regressor_OLS = sm.OLS(y, x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Sat, 23 Sep 2017",Prob (F-statistic):,4.53e-30
Time:,12:21:41,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [17]:
# Backward elimination, step 4: drop column 4 (Administration), keeping the
# intercept, R&D Spend (3) and Marketing Spend (5).
kept_columns = [0, 3, 5]
x_opt = x[:, kept_columns]
regressor_OLS = sm.OLS(y, x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Sat, 23 Sep 2017",Prob (F-statistic):,2.1600000000000003e-31
Time:,12:23:12,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [18]:
# Backward elimination, step 5: drop column 5 (Marketing Spend), leaving only
# the intercept and R&D Spend (3).
kept_columns = [0, 3]
x_opt = x[:, kept_columns]
regressor_OLS = sm.OLS(y, x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sat, 23 Sep 2017",Prob (F-statistic):,3.5000000000000004e-32
Time:,12:28:43,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
