# Multiple Linear Regression

## Data Preprocessing

In [81]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [82]:
# Read the startup data set; all columns except the last are features,
# the last column is the value to predict.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].to_numpy()  # feature matrix (three numeric spends + the state name)
y = dataset.iloc[:, -1].to_numpy()   # target vector
print(X)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

One-hot encoding the categorical `State` column (column index 3) into dummy variables

In [83]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Replace the categorical column (index 3) with one dummy column per category.
# The dummy columns are placed first; the untouched numeric columns follow
# because of remainder='passthrough'.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

In [84]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Multiple Linear Regression model on the Training set

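Dummy variable trap handling: there is no need to manually drop one dummy vector from each set of category dummies here. scikit-learn's `LinearRegression` still fits and predicts correctly when all dummy columns are kept, because its least-squares solver tolerates the resulting collinearity. Note that the individual dummy coefficients are then not uniquely determined; if they need to be interpreted, drop one dummy explicitly (for example with `OneHotEncoder(drop='first')`).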

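Choosing the best features: we do not select statistically significant features here by applying "All-in / Backward Elimination / Forward Selection / Bidirectional Elimination / All Possible Models". `LinearRegression` does not perform feature selection itself; it simply fits every column it is given, which is sufficient for making predictions. A manual Backward Elimination with statsmodels is shown at the end of this notebook.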

In [85]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training split only.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

## Predicting Test set Results

In [86]:
# Predictions of the fitted model for the held-out test rows.
y_predicted = regressor.predict(X=X_test)



In [87]:
# Show two decimals when printing arrays (this setting is global and stays
# active for the cells that follow).
np.set_printoptions(precision=2)

# Left column: actual test targets, right column: model predictions.
print(np.column_stack((y_test, y_predicted)))

# Residuals (actual - predicted), printed as a single column.
print((y_test - y_predicted)[:, np.newaxis])


[[103282.38 103015.2 ]
 [144259.4  132582.28]
 [146121.95 132447.74]
 [ 77798.83  71976.1 ]
 [191050.39 178537.48]
 [105008.31 116161.24]
 [ 81229.06  67851.69]
 [ 97483.56  98791.73]
 [110352.25 113969.44]
 [166187.94 167921.07]]
[[   267.18]
 [ 11677.12]
 [ 13674.21]
 [  5822.73]
 [ 12512.91]
 [-11152.93]
 [ 13377.37]
 [ -1308.17]
 [ -3617.19]
 [ -1733.13]]


## Making a single Prediction (for example the profit of a startup with R&D Spend = 160000, Administration Spend = 130000, Marketing Spend = 300000 and State = 'California')

In [101]:
# Describe the new startup in the same raw column layout as the training data
# (R&D Spend, Administration Spend, Marketing Spend, State) and let the
# already-fitted ColumnTransformer build the dummy columns. This avoids
# hand-writing the one-hot layout ([1, 0, 0] for 'California'), which would
# silently produce a wrong prediction if the category order ever changed, and
# it raises an error for a state the encoder has never seen.
new_startup = np.array([[160000, 130000, 300000, 'California']], dtype=object)
print(regressor.predict(np.array(ct.transform(new_startup))))

[181566.92]


## Printing the coefficients and intercept of the final regression equation

In [102]:
# First line: one coefficient per feature column (dummy columns first, then the
# numeric columns); second line: the intercept.
print(regressor.coef_, regressor.intercept_, sep='\n')


[ 8.66e+01 -8.73e+02  7.86e+02  7.73e-01  3.29e-02  3.66e-02]
42467.52924853278


## Backward Elimination (one approach to select only statistically significant features)

In [88]:
import statsmodels.api as sm

# Regression model: y = b0*x0 + b1*x1 + b2*x2 + ... + bn*xn, where x0 is taken
# to be 1 so that b0 plays the role of the intercept.
# statsmodels does not add that x0 = 1 column on its own, so a column of ones
# is prepended to X as the new first column.
X = np.hstack((np.ones((len(X), 1), dtype=int), X))
X




array([[1, 0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1, 1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [1, 0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [1, 0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [1, 0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [1, 0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1, 1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [1, 0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [1, 0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1, 1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [1, 0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1, 1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [1, 0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1, 1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [1, 0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [1, 0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1, 1.0, 0.0, 0.0, 78013.11, 121597.55, 264

In [89]:
# Avoid the dummy variable trap: drop one of the dummy columns (column index 1,
# the first dummy after the constant) and keep the constant, the two remaining
# dummies and the three spend columns.
X = np.delete(X, 1, axis=1)
X

array([[1, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [1, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [1, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [1, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [1, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [1, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [1, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [1, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [1, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [1, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [1, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [1, 0.0, 1.0, 94657.16, 145077.58, 282574.31],
       [1, 1.0, 0.0, 9

In [91]:
# Backward elimination starts with every predictor included.
# Column map of X -> 0: constant, 1: dummy 1, 2: dummy 2,
#                    3: R&D Spend, 4: Admin Spend, 5: Marketing Spend
# X has object dtype, so cast the selection to float64 for statsmodels.
X_opt = X[:, [0, 1, 2, 3, 4, 5]].astype(np.float64)
X_opt


array([[1.00e+00, 0.00e+00, 1.00e+00, 1.65e+05, 1.37e+05, 4.72e+05],
       [1.00e+00, 0.00e+00, 0.00e+00, 1.63e+05, 1.51e+05, 4.44e+05],
       [1.00e+00, 1.00e+00, 0.00e+00, 1.53e+05, 1.01e+05, 4.08e+05],
       [1.00e+00, 0.00e+00, 1.00e+00, 1.44e+05, 1.19e+05, 3.83e+05],
       [1.00e+00, 1.00e+00, 0.00e+00, 1.42e+05, 9.14e+04, 3.66e+05],
       [1.00e+00, 0.00e+00, 1.00e+00, 1.32e+05, 9.98e+04, 3.63e+05],
       [1.00e+00, 0.00e+00, 0.00e+00, 1.35e+05, 1.47e+05, 1.28e+05],
       [1.00e+00, 1.00e+00, 0.00e+00, 1.30e+05, 1.46e+05, 3.24e+05],
       [1.00e+00, 0.00e+00, 1.00e+00, 1.21e+05, 1.49e+05, 3.12e+05],
       [1.00e+00, 0.00e+00, 0.00e+00, 1.23e+05, 1.09e+05, 3.05e+05],
       [1.00e+00, 1.00e+00, 0.00e+00, 1.02e+05, 1.11e+05, 2.29e+05],
       [1.00e+00, 0.00e+00, 0.00e+00, 1.01e+05, 9.18e+04, 2.50e+05],
       [1.00e+00, 1.00e+00, 0.00e+00, 9.39e+04, 1.27e+05, 2.50e+05],
       [1.00e+00, 0.00e+00, 0.00e+00, 9.20e+04, 1.35e+05, 2.53e+05],
       [1.00e+00, 1.00e+00, 0.00e+

In [92]:
# Fit an Ordinary Least Squares (OLS) model on the current predictor set.
# Summary legend -> x1: Dummy 1, x2: Dummy 2, x3: R&D, x4: Admin, x5: Marketing
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Thu, 29 Jun 2023",Prob (F-statistic):,1.34e-27
Time:,16:04:37,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [93]:
# Backward elimination step: drop column index 2 (dummy 2). It has the highest
# p-value (P>|t|) in the previous summary, and that p-value is above the chosen
# significance level (SL) of 0.05.
# Column map of X -> 0: constant, 1: dummy 1, 2: dummy 2,
#                    3: R&D Spend, 4: Admin Spend, 5: Marketing Spend
X_opt = X[:, [0, 1, 3, 4, 5]].astype(np.float64)
X_opt

array([[1.00e+00, 0.00e+00, 1.65e+05, 1.37e+05, 4.72e+05],
       [1.00e+00, 0.00e+00, 1.63e+05, 1.51e+05, 4.44e+05],
       [1.00e+00, 1.00e+00, 1.53e+05, 1.01e+05, 4.08e+05],
       [1.00e+00, 0.00e+00, 1.44e+05, 1.19e+05, 3.83e+05],
       [1.00e+00, 1.00e+00, 1.42e+05, 9.14e+04, 3.66e+05],
       [1.00e+00, 0.00e+00, 1.32e+05, 9.98e+04, 3.63e+05],
       [1.00e+00, 0.00e+00, 1.35e+05, 1.47e+05, 1.28e+05],
       [1.00e+00, 1.00e+00, 1.30e+05, 1.46e+05, 3.24e+05],
       [1.00e+00, 0.00e+00, 1.21e+05, 1.49e+05, 3.12e+05],
       [1.00e+00, 0.00e+00, 1.23e+05, 1.09e+05, 3.05e+05],
       [1.00e+00, 1.00e+00, 1.02e+05, 1.11e+05, 2.29e+05],
       [1.00e+00, 0.00e+00, 1.01e+05, 9.18e+04, 2.50e+05],
       [1.00e+00, 1.00e+00, 9.39e+04, 1.27e+05, 2.50e+05],
       [1.00e+00, 0.00e+00, 9.20e+04, 1.35e+05, 2.53e+05],
       [1.00e+00, 1.00e+00, 1.20e+05, 1.57e+05, 2.57e+05],
       [1.00e+00, 0.00e+00, 1.15e+05, 1.23e+05, 2.62e+05],
       [1.00e+00, 0.00e+00, 7.80e+04, 1.22e+05, 2.64e+05

In [94]:
# Refit the Ordinary Least Squares (OLS) model on the reduced predictor set.
# Summary legend -> x1: Dummy 1, x2: R&D, x3: Admin, x4: Marketing
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Thu, 29 Jun 2023",Prob (F-statistic):,8.49e-29
Time:,16:07:50,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [95]:
# Backward elimination step: drop column index 1 (dummy 1). It has the highest
# p-value (P>|t|) in the previous summary, and that p-value is above the chosen
# significance level (SL) of 0.05.
# Already removed in an earlier step: index 2.
# Column map of X -> 0: constant, 1: dummy 1, 2: dummy 2,
#                    3: R&D Spend, 4: Admin Spend, 5: Marketing Spend
X_opt = X[:, [0, 3, 4, 5]].astype(np.float64)
X_opt

array([[1.00e+00, 1.65e+05, 1.37e+05, 4.72e+05],
       [1.00e+00, 1.63e+05, 1.51e+05, 4.44e+05],
       [1.00e+00, 1.53e+05, 1.01e+05, 4.08e+05],
       [1.00e+00, 1.44e+05, 1.19e+05, 3.83e+05],
       [1.00e+00, 1.42e+05, 9.14e+04, 3.66e+05],
       [1.00e+00, 1.32e+05, 9.98e+04, 3.63e+05],
       [1.00e+00, 1.35e+05, 1.47e+05, 1.28e+05],
       [1.00e+00, 1.30e+05, 1.46e+05, 3.24e+05],
       [1.00e+00, 1.21e+05, 1.49e+05, 3.12e+05],
       [1.00e+00, 1.23e+05, 1.09e+05, 3.05e+05],
       [1.00e+00, 1.02e+05, 1.11e+05, 2.29e+05],
       [1.00e+00, 1.01e+05, 9.18e+04, 2.50e+05],
       [1.00e+00, 9.39e+04, 1.27e+05, 2.50e+05],
       [1.00e+00, 9.20e+04, 1.35e+05, 2.53e+05],
       [1.00e+00, 1.20e+05, 1.57e+05, 2.57e+05],
       [1.00e+00, 1.15e+05, 1.23e+05, 2.62e+05],
       [1.00e+00, 7.80e+04, 1.22e+05, 2.64e+05],
       [1.00e+00, 9.47e+04, 1.45e+05, 2.83e+05],
       [1.00e+00, 9.17e+04, 1.14e+05, 2.95e+05],
       [1.00e+00, 8.64e+04, 1.54e+05, 0.00e+00],
       [1.00e+00, 7.

In [96]:
# Refit the Ordinary Least Squares (OLS) model on the reduced predictor set.
# Summary legend -> x1: R&D, x2: Admin, x3: Marketing
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Thu, 29 Jun 2023",Prob (F-statistic):,4.53e-30
Time:,16:11:07,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [97]:
# Backward elimination step: drop column index 4 (Admin Spend). It has the
# highest p-value (P>|t|) in the previous summary, and that p-value is above the
# chosen significance level (SL) of 0.05.
# Already removed in earlier steps: indices 1 and 2.
# Column map of X -> 0: constant, 1: dummy 1, 2: dummy 2,
#                    3: R&D Spend, 4: Admin Spend, 5: Marketing Spend
X_opt = X[:, [0, 3, 5]].astype(np.float64)
X_opt

array([[1.00e+00, 1.65e+05, 4.72e+05],
       [1.00e+00, 1.63e+05, 4.44e+05],
       [1.00e+00, 1.53e+05, 4.08e+05],
       [1.00e+00, 1.44e+05, 3.83e+05],
       [1.00e+00, 1.42e+05, 3.66e+05],
       [1.00e+00, 1.32e+05, 3.63e+05],
       [1.00e+00, 1.35e+05, 1.28e+05],
       [1.00e+00, 1.30e+05, 3.24e+05],
       [1.00e+00, 1.21e+05, 3.12e+05],
       [1.00e+00, 1.23e+05, 3.05e+05],
       [1.00e+00, 1.02e+05, 2.29e+05],
       [1.00e+00, 1.01e+05, 2.50e+05],
       [1.00e+00, 9.39e+04, 2.50e+05],
       [1.00e+00, 9.20e+04, 2.53e+05],
       [1.00e+00, 1.20e+05, 2.57e+05],
       [1.00e+00, 1.15e+05, 2.62e+05],
       [1.00e+00, 7.80e+04, 2.64e+05],
       [1.00e+00, 9.47e+04, 2.83e+05],
       [1.00e+00, 9.17e+04, 2.95e+05],
       [1.00e+00, 8.64e+04, 0.00e+00],
       [1.00e+00, 7.63e+04, 2.99e+05],
       [1.00e+00, 7.84e+04, 3.00e+05],
       [1.00e+00, 7.40e+04, 3.03e+05],
       [1.00e+00, 6.75e+04, 3.05e+05],
       [1.00e+00, 7.70e+04, 1.41e+05],
       [1.00e+00, 6.47e+0

In [98]:
# Refit the Ordinary Least Squares (OLS) model on the reduced predictor set.
# Summary legend -> x1: R&D, x2: Marketing
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Thu, 29 Jun 2023",Prob (F-statistic):,2.1600000000000003e-31
Time:,16:23:48,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [99]:
# Backward elimination step: drop column index 5 (Marketing Spend). It has the
# highest p-value (P>|t|) in the previous summary, and that p-value is above the
# chosen significance level (SL) of 0.05.
# Already removed in earlier steps: indices 1, 2 and 4.
# Column map of X -> 0: constant, 1: dummy 1, 2: dummy 2,
#                    3: R&D Spend, 4: Admin Spend, 5: Marketing Spend
X_opt = X[:, [0, 3]].astype(np.float64)
X_opt

array([[1.00e+00, 1.65e+05],
       [1.00e+00, 1.63e+05],
       [1.00e+00, 1.53e+05],
       [1.00e+00, 1.44e+05],
       [1.00e+00, 1.42e+05],
       [1.00e+00, 1.32e+05],
       [1.00e+00, 1.35e+05],
       [1.00e+00, 1.30e+05],
       [1.00e+00, 1.21e+05],
       [1.00e+00, 1.23e+05],
       [1.00e+00, 1.02e+05],
       [1.00e+00, 1.01e+05],
       [1.00e+00, 9.39e+04],
       [1.00e+00, 9.20e+04],
       [1.00e+00, 1.20e+05],
       [1.00e+00, 1.15e+05],
       [1.00e+00, 7.80e+04],
       [1.00e+00, 9.47e+04],
       [1.00e+00, 9.17e+04],
       [1.00e+00, 8.64e+04],
       [1.00e+00, 7.63e+04],
       [1.00e+00, 7.84e+04],
       [1.00e+00, 7.40e+04],
       [1.00e+00, 6.75e+04],
       [1.00e+00, 7.70e+04],
       [1.00e+00, 6.47e+04],
       [1.00e+00, 7.53e+04],
       [1.00e+00, 7.21e+04],
       [1.00e+00, 6.61e+04],
       [1.00e+00, 6.56e+04],
       [1.00e+00, 6.20e+04],
       [1.00e+00, 6.11e+04],
       [1.00e+00, 6.34e+04],
       [1.00e+00, 5.55e+04],
       [1.00e+

In [100]:
# Refit the Ordinary Least Squares (OLS) model on the reduced predictor set.
# Summary legend -> x1: R&D
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Thu, 29 Jun 2023",Prob (F-statistic):,3.5000000000000004e-32
Time:,16:25:55,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0
