# Multiple Linear Regression
### Here with a train / test split, taking all the predictor variables

In [10]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import numpy as np

# Load the Boston housing data and separate predictors from the response.
# NOTE(review): load_boston was deprecated and later removed from scikit-learn
# (1.2) -- confirm the scikit-learn version this notebook is pinned to.
boston = datasets.load_boston()
X, y = boston.data, boston.target

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit ordinary least squares on every predictor column at once.
reg_all = linear_model.LinearRegression().fit(X_train, y_train)

# Predictions on the held-out rows (kept for later inspection).
y_pred = reg_all.predict(X_test)

# Score the fitted model on the held-out rows and report it.
test_r2 = reg_all.score(X_test, y_test)
print("R-squared: {}".format(test_r2))

R-squared: 0.7109203586326271


### Here with cross-validation instead of a single train/test split. This helps control for selection bias by splitting the data into some number of "folds"; each fold is held out in turn as the test set while the model is trained on the remaining folds, giving one score per fold.

In [11]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn import linear_model

# Reload the data so this cell does not depend on earlier cells' variables.
boston = datasets.load_boston()
X, y = boston.data, boston.target

reg = linear_model.LinearRegression()

# Five-fold cross-validation: one score per held-out fold.
cv_results = cross_val_score(reg, X, y, cv=5)

# Show the R-squared obtained on each of the five folds.
print(cv_results)

[ 0.63861069  0.71334432  0.58645134  0.07842495 -0.26312455]


## Improve the model - backward elimination

The following code comes from Kirill Eremenko's machine learning course (https://www.superdatascience.com/machine-learning/). It uses backward elimination to drop the less useful features, as judged by their p-values and by the model's adjusted R-squared.

In [9]:
import statsmodels.formula.api as sm
def backwardElimination(x, SL, target=None):
    """Select predictor columns by backward elimination on OLS p-values.

    An ordinary-least-squares model is fitted repeatedly. While the largest
    p-value exceeds ``SL``, the corresponding column is tentatively removed
    and the model refitted. The removal is kept only if it raises the adjusted
    R-squared; otherwise elimination stops and the column stays in the model.
    The summary of the final model is printed before returning.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Design matrix. The caller is expected to supply the intercept as a
        leading column of ones.
    SL : float
        Significance level; columns whose p-value is above it are candidates
        for removal.
    target : array-like, optional
        Response vector. When omitted, the notebook-level variable ``y`` is
        used, which is how this function has always been called.

    Returns
    -------
    numpy.ndarray
        The columns of ``x`` that survived elimination, in their original
        order and with their original values.

    Raises
    ------
    ValueError
        If ``x`` is not two-dimensional or has no columns.
    """
    # Imported here so the function works regardless of what the notebook-level
    # alias ``sm`` points at: ``statsmodels.formula.api`` does not expose the
    # array-based ``OLS`` class in current statsmodels releases.
    from statsmodels.api import OLS

    if target is None:
        target = y  # fall back to the notebook-global response vector

    x = np.asarray(x)
    if x.ndim != 2 or x.shape[1] == 0:
        raise ValueError("x must be a 2-D array with at least one column")

    model = OLS(target, x).fit()

    # Never remove the last remaining column: an empty design matrix cannot be fitted.
    while x.shape[1] > 1:
        pvalues = np.asarray(model.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))  # first column holding the largest p-value
        if pvalues[worst] <= SL:
            break  # every remaining predictor is significant at SL

        reduced = np.delete(x, worst, axis=1)
        reduced_model = OLS(target, reduced).fit()

        # Keep the column if dropping it does not improve adjusted R-squared.
        # ``x`` is still the untouched pre-removal matrix, so nothing has to be
        # reconstructed (no truncation, no column reshuffling).
        if float(model.rsquared_adj) >= float(reduced_model.rsquared_adj):
            break

        # Accept the removal and carry on with fresh p-values from the refit.
        x, model = reduced, reduced_model

    print(model.summary())
    return x
 
SL = 0.05
# This needs a column of ones on the front to work right.
# The matrix is built from the raw boston.data rather than from X itself, so
# re-running this cell does not keep prepending further columns of ones.
X = np.append(arr=np.ones((len(boston.data), 1)).astype(int), values=boston.data, axis=1)
X_Modeled = backwardElimination(X, SL)


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.735
Method:                 Least Squares   F-statistic:                     128.2
Date:                Fri, 06 Apr 2018   Prob (F-statistic):          5.74e-137
Time:                        14:50:13   Log-Likelihood:                -1498.9
No. Observations:                 506   AIC:                             3022.
Df Residuals:                     494   BIC:                             3073.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         18.1847      2.534      7.176      0.0