# Multiple Linear Regression in scikit-learn

In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Toy data set: every row of `x` carries two predictor values and `y` holds
# the matching response, both built directly as NumPy arrays.
x = np.array([
    [0, 1],
    [5, 1],
    [15, 2],
    [25, 5],
    [35, 11],
    [45, 15],
    [55, 34],
    [60, 35],
])
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])
print(x)

[[ 0  1]
 [ 5  1]
 [15  2]
 [25  5]
 [35 11]
 [45 15]
 [55 34]
 [60 35]]


In [2]:
# Show the response vector that pairs row-by-row with the predictor matrix x.
print(y)

[ 4  5 20 14 32 22 38 43]


In [3]:
# Create the estimator and fit it on the toy data in one step; the fitted
# estimator is kept in `model` for the score/coefficient/predict cells.
model = LinearRegression().fit(x, y)

In [4]:
# Score the fitted model on the same data it was trained on (in-sample fit).
r_sq = model.score(x, y)
print(f'coefficient of determination: {r_sq}')

coefficient of determination: 0.8615939258756776


In [5]:
# Intercept term of the fitted model.
print('intercept:', model.intercept_)

intercept: 5.52257927519819


In [6]:
# coef_ holds one coefficient per column of x, so two values are printed here.
print('slope:', model.coef_)

slope: [0.44706965 0.25502548]


In [7]:
# In-sample predictions: the label goes on its own line, the array below it.
y_pred = model.predict(x)
print('predicted response:')
print(y_pred)

predicted response:
[ 5.77760476  8.012953   12.73867497 17.9744479  23.97529728 29.4660957
 38.78227633 41.27265006]


# Advanced Linear Regression With statsmodels
You can also implement linear regression in Python fairly easily with the statsmodels package. This is typically preferable when you need more detailed results, such as standard errors, t-statistics, and confidence intervals for each coefficient.

In [8]:
import statsmodels.api as sm
# Add a constant column to the predictors so that the OLS fit can estimate an
# intercept. This cell needs the statsmodels package to be installed.
xc = sm.add_constant(x)
print(xc)

ModuleNotFoundError: No module named 'statsmodels'

In [10]:
# Ordinary least squares with statsmodels: response first, design matrix second.
# NOTE(review): the saved summary reports "R-squared (uncentered)" and only two
# coefficients, which suggests it was produced with a design matrix lacking the
# constant column -- re-run the notebook top to bottom to refresh it.
model2 = sm.OLS(y, xc)
result = model2.fit()
print(result.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.951
Model:                            OLS   Adj. R-squared (uncentered):              0.934
Method:                 Least Squares   F-statistic:                              57.86
Date:                Mon, 21 Feb 2022   Prob (F-statistic):                    0.000120
Time:                        07:59:04   Log-Likelihood:                         -25.398
No. Observations:                   8   AIC:                                      54.80
Df Residuals:                       6   BIC:                                      54.96
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------



In [12]:
# Headline fit statistics pulled from the statsmodels results object.
print(f'coefficient of determination: {result.rsquared}')
print(f'adjusted coefficient of determination: {result.rsquared_adj}')
print(f'regression coefficients: {result.params}')

coefficient of determination: 0.9507101076504841
adjusted coefficient of determination: 0.9342801435339788
regression coefficients: [ 0.69579339 -0.00477437]


# Let's try it on real data

In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Load the lab workbook and give the unlabeled first column a proper name,
# then preview the first few rows.
tseries = (
    pd.read_excel("Week 1 data forlab1.xlsx")
    .rename(columns={"Unnamed: 0": 'Date'})
)
tseries.head()

Unnamed: 0,Date,GDP,PCE,Inv,G
0,1980-03-01,5903.4,3796.7,778.3,1365.4
1,1980-06-01,5782.4,3710.5,708.1,1369.7
2,1980-09-01,5771.7,3750.3,654.1,1350.8
3,1980-12-01,5878.4,3800.3,720.6,1349.4
4,1981-03-01,6000.6,3821.1,792.2,1367.3


In [20]:
# Pull the two regressor columns out of the frame as standalone NumPy arrays.
x1, x2 = (np.array(tseries[column]) for column in ("PCE", "Inv"))
print(x1)

[3796.7 3710.5 3750.3 3800.3 3821.1 3821.1 3836.6 3807.6 3832.2 3845.9
 3875.4 3946.1 3984.8 4063.9 4135.7 4201.3 4237.3 4297.9 4331.1 4388.1
 4462.5 4503.2 4588.7 4598.8 4637.2 4686.6 4768.5 4797.2 4789.9 4854.
 4908.2 4920.  5002.2 5038.5 5078.3 5138.1 5156.9 5180.  5233.7 5259.3
 5300.9 5318.4 5338.6 5297.  5282.  5322.2 5342.6 5340.2 5432.  5464.2
 5524.6 5592.  5614.7 5668.6 5730.1 5781.1 5845.5 5888.8 5936.  5994.6
 6001.6 6050.8 6104.9 6147.8 6204.  6274.2 6311.8 6363.2 6427.3 6453.3
 6563.  6638.1 6704.1 6819.5 6909.9 7015.9 7085.1 7196.6 7283.1 7385.8
 7497.8 7568.3 7642.4 7710.  7740.8 7770.  7804.2 7926.4 7953.7 7994.1
 8048.3 8076.9 8117.7 8198.1 8308.5 8353.7 8427.6 8465.1 8539.1 8631.3
 8700.1 8786.2 8852.9 8874.9 8965.8 9019.8 9073.9 9158.3 9209.2 9244.5
 9285.2 9312.6 9289.1 9285.8 9196.  9076.  9040.9 8998.5 9050.3 9060.2
 9121.2 9186.9 9247.1 9328.4 9376.7 9392.7 9433.5 9482.1 9550.2]


In [21]:
# Show the Inv series that was extracted into x2.
print(x2)

[ 778.3  708.1  654.1  720.6  792.2  754.5  801.3  770.2  690.   689.4
  681.3  620.7  642.8  704.8  752.2  831.4  918.4  949.4  971.4  955.5
  924.   939.9  929.6  965.9  963.9  942.5  913.   914.4  942.3  943.6
  944.6 1018.3  960.9  984.3  990.6 1003.7 1042.2 1030.  1017.9 1007.4
 1017.3 1017.6  993.7  930.8  892.9  888.5  910.6  945.4  924.4  985.3
  995.5 1027.  1051.1 1059.4 1058.6 1114.5 1162.6 1230.4 1208.2 1264.6
 1277.4 1243.1 1231.1 1265.8 1282.4 1348.9 1416.9 1413.  1446.  1538.3
 1565.7 1590.7 1666.6 1646.7 1693.9 1748.4 1803.4 1797.1 1842.2 1907.6
 1880.9 2011.1 1979.7 1980.7 1876.1 1870.  1830.3 1724.5 1781.9 1803.4
 1808.  1808.3 1810.4 1821.8 1888.4 1959.9 1970.1 2055.7 2082.1 2124.9
 2170.  2131.3 2155.1 2232.8 2266.3 2263.1 2231.2 2166.7 2145.1 2193.
 2176.3 2123.6 2055.7 2024.  1934.7 1744.6 1490.4 1397.2 1407.3 1522.
 1630.  1728.3 1766.8 1734.5 1750.9 1778.4 1784.2 1875.7 1903. ]


In [27]:
# Build the regressor matrix: one row per observation, PCE in column 0 and
# Inv in column 1.
xs = np.column_stack((tseries["PCE"],tseries["Inv"]))
print(xs)

# You can also do it with dataframes directly, if you want, by selecting both
# columns at once instead of stacking NumPy arrays:
# xs = tseries[["PCE", "Inv"]]
# print(xs)

[[3796.7  778.3]
 [3710.5  708.1]
 [3750.3  654.1]
 [3800.3  720.6]
 [3821.1  792.2]
 [3821.1  754.5]
 [3836.6  801.3]
 [3807.6  770.2]
 [3832.2  690. ]
 [3845.9  689.4]
 [3875.4  681.3]
 [3946.1  620.7]
 [3984.8  642.8]
 [4063.9  704.8]
 [4135.7  752.2]
 [4201.3  831.4]
 [4237.3  918.4]
 [4297.9  949.4]
 [4331.1  971.4]
 [4388.1  955.5]
 [4462.5  924. ]
 [4503.2  939.9]
 [4588.7  929.6]
 [4598.8  965.9]
 [4637.2  963.9]
 [4686.6  942.5]
 [4768.5  913. ]
 [4797.2  914.4]
 [4789.9  942.3]
 [4854.   943.6]
 [4908.2  944.6]
 [4920.  1018.3]
 [5002.2  960.9]
 [5038.5  984.3]
 [5078.3  990.6]
 [5138.1 1003.7]
 [5156.9 1042.2]
 [5180.  1030. ]
 [5233.7 1017.9]
 [5259.3 1007.4]
 [5300.9 1017.3]
 [5318.4 1017.6]
 [5338.6  993.7]
 [5297.   930.8]
 [5282.   892.9]
 [5322.2  888.5]
 [5342.6  910.6]
 [5340.2  945.4]
 [5432.   924.4]
 [5464.2  985.3]
 [5524.6  995.5]
 [5592.  1027. ]
 [5614.7 1051.1]
 [5668.6 1059.4]
 [5730.1 1058.6]
 [5781.1 1114.5]
 [5845.5 1162.6]
 [5888.8 1230.4]
 [5936.  1208.

In [28]:
# Prepend a constant column (the leading 1.0 in each printed row) so the OLS
# fit estimates an intercept. Note: this rebinds `xc` from the toy example.
xc = sm.add_constant(xs)
print(xc)

[[1.0000e+00 3.7967e+03 7.7830e+02]
 [1.0000e+00 3.7105e+03 7.0810e+02]
 [1.0000e+00 3.7503e+03 6.5410e+02]
 [1.0000e+00 3.8003e+03 7.2060e+02]
 [1.0000e+00 3.8211e+03 7.9220e+02]
 [1.0000e+00 3.8211e+03 7.5450e+02]
 [1.0000e+00 3.8366e+03 8.0130e+02]
 [1.0000e+00 3.8076e+03 7.7020e+02]
 [1.0000e+00 3.8322e+03 6.9000e+02]
 [1.0000e+00 3.8459e+03 6.8940e+02]
 [1.0000e+00 3.8754e+03 6.8130e+02]
 [1.0000e+00 3.9461e+03 6.2070e+02]
 [1.0000e+00 3.9848e+03 6.4280e+02]
 [1.0000e+00 4.0639e+03 7.0480e+02]
 [1.0000e+00 4.1357e+03 7.5220e+02]
 [1.0000e+00 4.2013e+03 8.3140e+02]
 [1.0000e+00 4.2373e+03 9.1840e+02]
 [1.0000e+00 4.2979e+03 9.4940e+02]
 [1.0000e+00 4.3311e+03 9.7140e+02]
 [1.0000e+00 4.3881e+03 9.5550e+02]
 [1.0000e+00 4.4625e+03 9.2400e+02]
 [1.0000e+00 4.5032e+03 9.3990e+02]
 [1.0000e+00 4.5887e+03 9.2960e+02]
 [1.0000e+00 4.5988e+03 9.6590e+02]
 [1.0000e+00 4.6372e+03 9.6390e+02]
 [1.0000e+00 4.6866e+03 9.4250e+02]
 [1.0000e+00 4.7685e+03 9.1300e+02]
 [1.0000e+00 4.7972e+03 9.14

In [29]:
# Response variable for the regression: the GDP column as a NumPy array.
# Note: this rebinds `y`, replacing the toy response defined earlier.
y = np.array(tseries["GDP"])
print(y)

[ 5903.4  5782.4  5771.7  5878.4  6000.6  5952.7  6025.   5950.   5852.3
  5884.   5861.4  5866.   5938.9  6072.4  6192.2  6320.2  6442.8  6554.
  6617.7  6671.6  6734.5  6791.5  6897.6  6950.   7016.8  7045.   7112.9
  7147.3  7186.9  7263.3  7326.3  7451.7  7490.2  7586.4  7625.6  7727.4
  7799.9  7858.3  7920.6  7937.9  8020.8  8052.7  8052.6  7982.   7943.4
  7997.   8030.7  8062.2  8150.7  8237.3  8322.3  8409.8  8425.3  8479.2
  8523.8  8636.4  8720.5  8839.8  8896.7  8995.5  9017.6  9037.   9112.9
  9176.4  9239.3  9399.   9480.8  9584.3  9658.   9801.2  9924.2 10000.3
 10094.8 10185.6 10320.  10498.6 10592.1 10674.9 10810.7 11004.8 11033.6
 11248.8 11258.3 11325.  11287.8 11361.7 11330.4 11370.  11467.1 11528.1
 11586.6 11590.6 11638.9 11737.5 11930.7 12038.6 12117.9 12195.9 12286.7
 12387.2 12515.  12570.7 12670.5 12735.6 12896.4 12948.7 12950.4 13038.4
 13056.1 13173.6 13269.8 13326.  13266.8 13310.5 13186.9 12883.5 12663.2
 12641.3 12694.5 12813.5 12937.7 13058.5 13139.6 132

In [30]:
# Regress GDP on a constant, PCE and Inv with ordinary least squares and print
# the full summary table. Note: `result` is rebound here, so the earlier toy
# fit is no longer reachable through that name.
modelGDP = sm.OLS(y, xc)
result = modelGDP.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.998
Model:                            OLS   Adj. R-squared:                  0.998
Method:                 Least Squares   F-statistic:                 3.948e+04
Date:                Mon, 21 Feb 2022   Prob (F-statistic):          5.57e-177
Time:                        21:22:29   Log-Likelihood:                -778.37
No. Observations:                 129   AIC:                             1563.
Df Residuals:                     126   BIC:                             1571.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        942.0515     33.904     27.786      0.0