# Multiple Linear Regression
1. categorical data encoding
2. Backward Elimination
3. Statsmodel

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Load the startup dataset; the path is relative to the notebook's working directory.
dataset = pd.read_csv('data/50_Startups.csv')

In [3]:
# Preview the first rows to check the column layout: three spend columns, State, and Profit.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Feature matrix: every column except the last one. It still contains the
# textual State column at position 3, so it is not purely numeric yet.
X = dataset.iloc[:, :-1].values
# Target vector: the column at position 4 (Profit, per the dataset preview).
y = dataset.iloc[:, 4].values

__encoding categorical data -> State Column__

In [5]:
# OneHotEncoder's `categorical_features` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22; ColumnTransformer is the supported
# way to one-hot encode a single column and pass the others through.
from sklearn.compose import ColumnTransformer

# Map the State strings in column 3 to integer codes (alphabetical order).
# OneHotEncoder can consume strings directly, but the fitted label_encoder is
# kept so the code <-> state-name mapping stays available.
label_encoder = LabelEncoder()
X[:, 3] = label_encoder.fit_transform(X[:, 3])

# One-hot encode column 3 only. The encoded columns are placed first and the
# remaining (numeric) columns follow in their original order.
# Note: ColumnTransformer fits a clone of `onehot_encoder`; the fitted copy is
# available as column_transformer.named_transformers_["state"].
onehot_encoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    [("state", onehot_encoder, [3])],
    remainder="passthrough",
    sparse_threshold=0,  # always return a dense array
)
# The passthrough columns come from an object-dtype array, so cast the
# combined result to float to get a plain numeric matrix.
X = column_transformer.fit_transform(X).astype(float)

In [6]:
# Inspect the first row: the one-hot State columns come first, followed by the numeric features.
X[0]

array([0.000000e+00, 0.000000e+00, 1.000000e+00, 1.653492e+05,
       1.368978e+05, 4.717841e+05])

__The one-hot encoded State columns are now the first columns of X. To avoid the dummy variable trap, we remove one of these dummy columns.__

In [7]:
# Drop the first one-hot column so the remaining dummy columns are not
# perfectly collinear with the intercept (dummy variable trap).
# NOTE: this cell is not idempotent - each re-run removes one more column,
# so re-run from the encoding cell instead of re-running this cell alone.
X = X[:, 1:]

In [8]:
# Confirm that one dummy column is gone: the first row now has 5 values instead of 6.
X[0]

array([0.000000e+00, 1.000000e+00, 1.653492e+05, 1.368978e+05,
       4.717841e+05])

__Split dataset into train and test data__

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state pins the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Check the split sizes (number of training rows, number of test rows).
len(X_train), len(X_test)

(40, 10)

In [11]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split. fit() returns the
# estimator itself, so construction and fitting can be chained.
regressor = LinearRegression().fit(X_train, y_train)
# Display the fitted estimator (and its parameters) as the cell output.
regressor

  linalg.lstsq(X, y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [12]:
# Predict the target for the held-out test rows.
y_pred = regressor.predict(X_test)
# Bare name as the last expression so the predictions are shown as the cell output.
y_pred

array([103015.20159797, 132582.27760815, 132447.73845175,  71976.09851259,
       178537.48221053, 116161.24230163,  67851.69209676,  98791.73374689,
       113969.43533011, 167921.0656955 ])

In [13]:
# Show the actual test targets for a side-by-side comparison with y_pred.
y_test

array([103282.38, 144259.4 , 146121.95,  77798.83, 191050.39, 105008.31,
        81229.06,  97483.56, 110352.25, 166187.94])

__BACKWARD ELIMINATION__

Add a column for the intercept coefficient $b_0$ to X: statsmodels' OLS (Ordinary Least Squares) does not add an intercept automatically, so we need to prepend a constant column $x_0 = 1$ explicitly.

In [14]:
# Prepend a column of ones to X so the OLS fit below can estimate an intercept.
# NOTE: not idempotent - each re-run of this cell adds another column of ones.
X = np.concatenate((np.ones((len(X), 1), dtype=int), X), axis=1)

In [15]:
# Verify that the first column is now the constant 1 used for the intercept.
X[0]

array([1.000000e+00, 0.000000e+00, 1.000000e+00, 1.653492e+05,
       1.368978e+05, 4.717841e+05])

In [16]:
# Use statsmodels.api: the array-based OLS class lives there. Current
# statsmodels releases no longer expose it from statsmodels.formula.api,
# which only provides the lowercase formula interface (`ols`).
import statsmodels.api as sm

# Backward elimination, step 1: start from the full design matrix.
# Column 0 is the constant, columns 1-2 are the State dummies and
# columns 3-5 are the numeric spend features.
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
# summary() returns a Summary object; print() renders its text tables.
print(regressor_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     169.9
Date:                Fri, 28 Dec 2018   Prob (F-statistic):           1.34e-27
Time:                        23:21:08   Log-Likelihood:                -525.38
No. Observations:                  50   AIC:                             1063.
Df Residuals:                      44   BIC:                             1074.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.013e+04   6884.820      7.281      0.0

*Based on the p-values in the summary, eliminate the least significant column (the one with the highest p-value) and refit the model.*

In [17]:
# Backward elimination, next step: refit without column 2 (one of the two
# State dummy columns); every other column of the previous model is kept.
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.946
Method:                 Least Squares   F-statistic:                     217.2
Date:                Fri, 28 Dec 2018   Prob (F-statistic):           8.49e-29
Time:                        23:21:12   Log-Likelihood:                -525.38
No. Observations:                  50   AIC:                             1061.
Df Residuals:                      45   BIC:                             1070.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.011e+04   6647.870      7.537      0.0

In [18]:
# Backward elimination, next step: also remove column 1 (the remaining State
# dummy column), leaving the constant and the three numeric spend features.
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(y, X_opt).fit()
print(regressor_OLS.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.951
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     296.0
Date:                Fri, 28 Dec 2018   Prob (F-statistic):           4.53e-30
Time:                        23:21:15   Log-Likelihood:                -525.39
No. Observations:                  50   AIC:                             1059.
Df Residuals:                      46   BIC:                             1066.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.012e+04   6572.353      7.626      0.0