In [1]:
# Step 1: Importing the important libraries
"""
1. NumPy: is a general-purpose array-processing package and scientific computing package.
2. Pandas: is an open source package widely used for data science/data analysis and machine learning tasks.
3. Matplotlib: is a cross-platform, data visualization and graphical plotting library and its numerical extension NumPy.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')

In [3]:
# Step 2.1: Exploration of the dataset
# Check the head of the dataset
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Step 2.1: Exploration of the dataset
# Check the tail of the dataset
dataset.tail()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
45,1000.23,124153.04,1903.93,New York,64926.08
46,1315.46,115816.21,297114.46,Florida,49490.75
47,0.0,135426.92,0.0,California,42559.73
48,542.05,51743.15,0.0,New York,35673.41
49,0.0,116983.8,45173.06,California,14681.4


In [5]:
# Step 2.1: Exploration of the dataset
# Check the decoration of the dataset
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [6]:
# Step 2.1: Exploration of the dataset
# Check the decoration of the dataset
dataset.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [7]:
# Step 3: Seperating the dataset into the dependent (y) and independent (x) varialbes
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4].values

In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:, 3] = le.fit_transform(X[:, 3])
print(X)

[[165349.2 136897.8 471784.1 2]
 [162597.7 151377.59 443898.53 0]
 [153441.51 101145.55 407934.54 1]
 [144372.41 118671.85 383199.62 2]
 [142107.34 91391.77 366168.42 1]
 [131876.9 99814.71 362861.36 2]
 [134615.46 147198.87 127716.82 0]
 [130298.13 145530.06 323876.68 1]
 [120542.52 148718.95 311613.29 2]
 [123334.88 108679.17 304981.62 0]
 [101913.08 110594.11 229160.95 1]
 [100671.96 91790.61 249744.55 0]
 [93863.75 127320.38 249839.44 1]
 [91992.39 135495.07 252664.93 0]
 [119943.24 156547.42 256512.92 1]
 [114523.61 122616.84 261776.23 2]
 [78013.11 121597.55 264346.06 0]
 [94657.16 145077.58 282574.31 2]
 [91749.16 114175.79 294919.57 1]
 [86419.7 153514.11 0.0 2]
 [76253.86 113867.3 298664.47 0]
 [78389.47 153773.43 299737.29 2]
 [73994.56 122782.75 303319.26 1]
 [67532.53 105751.03 304768.73 1]
 [77044.01 99281.34 140574.81 2]
 [64664.71 139553.16 137962.62 0]
 [75328.87 144135.98 134050.07 1]
 [72107.6 127864.55 353183.81 2]
 [66051.52 182645.56 118148.2 1]
 [65605.48 153032.0

In [9]:
# Step 4: Data Preprocessing: State Varialbe in dataset having text data values. Here we needs to be convert the data from categorical text into categorical numerical datavalues.
# Step 4.1: Lable encoder: is a encoding technique for handling categorical variables. In this technique, each label is assigned a unique integer based on alphabetical ordering. (Encoding the categorical data)
# Step 4.2: One Hot Encoder: is a representation of categorical variables as binary vectors. (Encoding the independent variables)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelencoder_X = LabelEncoder()
col_trans = ColumnTransformer([('One_hot_encoder', OneHotEncoder(),[3])], remainder ='passthrough')
X = np.array(col_trans.fit_transform(X))

In [10]:
# Avoiding the dummy variable trap
X = X[:,1:]
X

array([[0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 1.0, 94657.16, 145077.58, 282574.31],
       [1.0, 0.0, 91749.16, 114175.79, 294919.57],
       [0.0, 1.0, 86419.7

In [11]:
# Step 5: Splitting the dataset into training and test subsets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state = 0)

In [12]:
# Step 6: Fitting the Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train,y_train)

LinearRegression()

In [13]:
# Step 7: Predicting the test set results
y_pred = reg.predict(X_test)

In [14]:
# Building the opitmal model using Backward Eliminaiton
import statsmodels.api as sm
X = np.append(arr = np.ones((50,1)).astype(int),values = X, axis = 1)
X_opt = np.array(X[:,[0,1,2,3,4,5]], dtype=float)
reg_ols = sm.OLS(endog = y, exog = X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sun, 13 Jun 2021",Prob (F-statistic):,1.34e-27
Time:,23:35:14,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [15]:
# Removing the highest p value attributes i.e. X3
X_opt = np.array(X[:,[0,1,2,4,5]], dtype=float)
reg_ols = sm.OLS(endog = y, exog = X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.613
Model:,OLS,Adj. R-squared:,0.579
Method:,Least Squares,F-statistic:,17.83
Date:,"Sun, 13 Jun 2021",Prob (F-statistic):,7.78e-09
Time:,23:35:14,Log-Likelihood:,-576.91
No. Observations:,50,AIC:,1164.0
Df Residuals:,45,BIC:,1173.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.903e+04,1.84e+04,1.033,0.307,-1.81e+04,5.61e+04
x1,-1703.7028,9337.989,-0.182,0.856,-2.05e+04,1.71e+04
x2,3875.7625,9002.603,0.431,0.669,-1.43e+04,2.2e+04
x3,0.3239,0.133,2.426,0.019,0.055,0.593
x4,0.2507,0.031,7.997,0.000,0.188,0.314

0,1,2,3
Omnibus:,5.729,Durbin-Watson:,1.266
Prob(Omnibus):,0.057,Jarque-Bera (JB):,5.349
Skew:,-0.461,Prob(JB):,0.0689
Kurtosis:,4.311,Cond. No.,1340000.0


In [16]:
# Removing the highest p value attributes i.e. X5
X_opt = np.array(X[:,[0,1,2,4]], dtype=float)
reg_ols = sm.OLS(endog = y, exog = X_opt).fit()
reg_ols.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.063
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,1.035
Date:,"Sun, 13 Jun 2021",Prob (F-statistic):,0.386
Time:,23:35:20,Log-Likelihood:,-599.02
No. Observations:,50,AIC:,1206.0
Df Residuals:,46,BIC:,1214.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.945e+04,2.66e+04,2.607,0.012,1.58e+04,1.23e+05
x1,1.458e+04,1.4e+04,1.039,0.304,-1.37e+04,4.28e+04
x2,9623.8728,1.38e+04,0.697,0.489,-1.82e+04,3.74e+04
x3,0.2853,0.205,1.390,0.171,-0.128,0.699

0,1,2,3
Omnibus:,0.337,Durbin-Watson:,0.139
Prob(Omnibus):,0.845,Jarque-Bera (JB):,0.465
Skew:,0.172,Prob(JB):,0.792
Kurtosis:,2.675,Cond. No.,597000.0


In [18]:
#A backward stochastic process can be viewed as a marked process attached to a failure event.