# Simple Linear Regression

In [2]:
# Import packages and classes
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [3]:
# Rocket propellant data from Montgomery's regression analysis book.
# y holds the observed response and stays one-dimensional (one value per observation).
y = np.array([
    2158.7, 1678.15, 2316, 2061.3, 2207.5,
    1708.3, 1784.7, 2575, 2357.9, 2256.7,
    2165.2, 2399.55, 1779.8, 2336.75, 1765.3,
    2053.5, 2414.4, 2200.5, 2654.2, 1753.7,
])
# x holds the single predictor; the extra axis turns it into a column of shape (20, 1).
x = np.array([
    15.5, 23.75, 8, 17, 5.5,
    19, 24, 2.5, 7.5, 11,
    13, 3.75, 25, 9.75, 22,
    18, 6, 12.5, 2, 21.5,
])[:, np.newaxis]

Now you have two arrays: the input `x` and the output `y`. `.reshape()` is called on `x` because scikit-learn expects the input to be two-dimensional — more precisely, to have one column and as many rows as necessary. That is exactly what the argument `(-1, 1)` of `.reshape()` specifies: `-1` lets NumPy infer the number of rows, and `1` fixes a single column.

In [4]:
# Check the number of dimensions: the predictor must be 2-D, the response 1-D.
print("x.ndim : ", np.ndim(x))
print("y.ndim : ", np.ndim(y))

x.ndim :  2
y.ndim :  1


In [5]:
# Display the predictor as a column: one row per observation, a single column.
x

array([[15.5 ],
       [23.75],
       [ 8.  ],
       [17.  ],
       [ 5.5 ],
       [19.  ],
       [24.  ],
       [ 2.5 ],
       [ 7.5 ],
       [11.  ],
       [13.  ],
       [ 3.75],
       [25.  ],
       [ 9.75],
       [22.  ],
       [18.  ],
       [ 6.  ],
       [12.5 ],
       [ 2.  ],
       [21.5 ]])

In [6]:
# Display the response values; y is a flat one-dimensional array.
y

array([2158.7 , 1678.15, 2316.  , 2061.3 , 2207.5 , 1708.3 , 1784.7 ,
       2575.  , 2357.9 , 2256.7 , 2165.2 , 2399.55, 1779.8 , 2336.75,
       1765.3 , 2053.5 , 2414.4 , 2200.5 , 2654.2 , 1753.7 ])

In [7]:
# Create a LinearRegression estimator with its default settings.
# Nothing is estimated yet; the coefficients are computed when .fit() is called.
model = LinearRegression()

In [8]:
# Fit the regression of y on x. fit() returns the estimator itself,
# which is why the cell output echoes LinearRegression().
model.fit(x, y)

LinearRegression()

In [9]:
# Goodness of fit, evaluated on the same data the model was fitted to.
r_squared = model.score(x, y)
print('coefficient of determination R^2:', r_squared)

coefficient of determination R^2: 0.9018414316763039


In [10]:
print('intercept:', model.intercept_) # the intercept is a single scalar value

intercept: 2627.8223590012963


In [11]:
print('slope:', model.coef_) # coef_ is an array; it holds one slope here because x has one column

slope: [-37.15359094]


In [12]:
# Fitted values for every observation in x, as returned by the estimator.
y_pred = model.predict(x)
print('predicted response:')
print(y_pred)
# The last expression shows the dimensionality of the predictions.
np.ndim(y_pred)

predicted response:
[2051.94169936 1745.42457406 2330.59363144 1996.21131294 2423.4776088
 1921.90413105 1736.13617632 2534.93838164 2349.17042691 2219.13285861
 2144.82567672 2488.49639296 1698.98258538 2265.57484729 1810.44335821
 1959.05772199 2404.90081333 2163.40247219 2553.51517711 1829.02015369]


1

In [13]:
# The same fitted values computed by hand from the estimated line.
# x is a column of shape (n, 1), so the broadcast result is also a 2-D column,
# unlike the array returned by model.predict(x).
y_pred = model.coef_ * x + model.intercept_
print('predicted response:')
print(y_pred)
# The last expression shows the dimensionality of the hand-computed predictions.
np.ndim(y_pred)

predicted response:
[[2051.94169936]
 [1745.42457406]
 [2330.59363144]
 [1996.21131294]
 [2423.4776088 ]
 [1921.90413105]
 [1736.13617632]
 [2534.93838164]
 [2349.17042691]
 [2219.13285861]
 [2144.82567672]
 [2488.49639296]
 [1698.98258538]
 [2265.57484729]
 [1810.44335821]
 [1959.05772199]
 [2404.90081333]
 [2163.40247219]
 [2553.51517711]
 [1829.02015369]]


2

In [14]:
# Side-by-side table of the predictor, the observed response and the fitted values.
# ravel() flattens to 1-D whether the array is a column (n, 1) or already flat (n,).
# The previous np.concatenate(...) only worked on 2-D input and raised
# "zero-dimensional arrays cannot be concatenated" when y_pred came straight
# from model.predict(), so the cell depended on which earlier cell ran last.
pd.DataFrame({"X": x.ravel(),
              "Actual": y,
              "Predicted": y_pred.ravel()})

Unnamed: 0,X,Actual,Predicted
0,15.5,2158.7,2051.941699
1,23.75,1678.15,1745.424574
2,8.0,2316.0,2330.593631
3,17.0,2061.3,1996.211313
4,5.5,2207.5,2423.477609
5,19.0,1708.3,1921.904131
6,24.0,1784.7,1736.136176
7,2.5,2575.0,2534.938382
8,7.5,2357.9,2349.170427
9,11.0,2256.7,2219.132859


In [15]:
# Predict for a single new value x = 18, passed as a 1x1 array (one row, one feature).
model.predict(np.array([[18]]))

array([1959.05772199])

In [16]:
# Same prediction using a nested list: the outer list is the rows, the inner list the features.
model.predict([[18]])

array([1959.05772199])

# Multiple Linear Regression

In [17]:
# Load the delivery-time data set from a path relative to the notebook's working directory.
# The columns used below are DeliveryTime, NumberOfCases and Distance.
data=pd.read_csv("data/DeliveryTimeData.csv")

In [18]:
# Number of observations = number of rows in the data frame.
n = len(data)
n

25

In [19]:
# Select the predictors and the response by column name rather than by position.
# Positional slicing (iloc[:, 2:] / iloc[:, 1]) silently picks the wrong columns
# if the CSV gains, loses or reorders a column.
X = data[["NumberOfCases", "Distance"]]  # predictor matrix, one column per regressor
y = data["DeliveryTime"]                 # response

In [20]:
# Fit a multiple linear regression of y on the columns of X.
# NOTE: this rebinds `model`, replacing the simple-regression estimator fitted earlier.
model = LinearRegression().fit(X, y)

In [21]:
# Report the fit quality and the estimated parameters, one labelled line each.
for label, value in (
    ('coefficient of determination:', model.score(X, y)),
    ('intercept:', model.intercept_),
    ('slope:', model.coef_),
):
    print(label, value)

coefficient of determination: 0.9595937494832257
intercept: 2.3412311451922
slope: [1.61590721 0.01438483]


In [22]:
# Fitted values for every row of X (this rebinds y_pred from the previous section).
y_pred = model.predict(X)
print('predicted response:')
print(y_pred)

predicted response:
[21.70808432 10.35361455 12.0797937   9.95564609 14.19439835 18.39957428
  7.15537645 16.67339513 71.8202938  19.12358708 38.09250698 21.5930409
 12.47299068 18.68246414 23.3287982  29.66292843 14.91363966 15.55137869
  7.70680652 40.88796994 20.51417893 56.00652789 23.35756786 24.4028535
 10.96258393]


In [27]:
# Predict the delivery time for a new observation: 20 cases and a distance of 300.
# The model was fitted on a DataFrame, so the new row is passed as a DataFrame
# with the same column names. A bare nested list carries no names, which makes
# recent scikit-learn versions warn about missing feature names and gives no
# protection against values supplied in the wrong order.
model.predict(pd.DataFrame([[20, 300]], columns=X.columns))

array([38.97482323])

In [28]:
# Preview the first five rows to check the column names and values.
data.head()

Unnamed: 0,Sr,DeliveryTime,NumberOfCases,Distance
0,1,16.68,7,560
1,2,11.5,3,220
2,3,12.03,3,340
3,4,14.88,4,80
4,5,13.75,6,150


In [29]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Specify the same regression with a formula: DeliveryTime explained by
# NumberOfCases and Distance, with columns looked up by name in `data`.
ols_spec = ols(formula='DeliveryTime~NumberOfCases+Distance', data=data)
# NOTE: `model` now refers to the fitted statsmodels results, no longer the
# scikit-learn estimator used in the cells above.
model = ols_spec.fit()

In [30]:
# Full regression report: fit statistics, the coefficient table with standard errors,
# t statistics, p-values and confidence intervals, and residual diagnostics.
model.summary()

0,1,2,3
Dep. Variable:,DeliveryTime,R-squared:,0.96
Model:,OLS,Adj. R-squared:,0.956
Method:,Least Squares,F-statistic:,261.2
Date:,"Mon, 18 Apr 2022",Prob (F-statistic):,4.69e-16
Time:,14:57:44,Log-Likelihood:,-63.415
No. Observations:,25,AIC:,132.8
Df Residuals:,22,BIC:,136.5
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.3412,1.097,2.135,0.044,0.067,4.616
NumberOfCases,1.6159,0.171,9.464,0.000,1.262,1.970
Distance,0.0144,0.004,3.981,0.001,0.007,0.022

0,1,2,3
Omnibus:,0.421,Durbin-Watson:,1.17
Prob(Omnibus):,0.81,Jarque-Bera (JB):,0.01
Skew:,0.032,Prob(JB):,0.995
Kurtosis:,3.073,Cond. No.,873.0


In [31]:
# Build the analysis-of-variance table for the fitted OLS model (anova_lm defaults).
anova_results = anova_lm(model)

In [32]:
# Display the ANOVA table: one row per model term plus the residual row.
anova_results

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
NumberOfCases,1.0,5382.408797,5382.408797,506.619363,1.112549e-16
Distance,1.0,168.402126,168.402126,15.850854,0.0006312469
Residual,22.0,233.731677,10.624167,,
