In [16]:
from pathlib import Path # to interact with file system.

import numpy as np # for working with arrays.
import pandas as pd # for working with data frames (tables).
import seaborn as sns # for plotting box plots to detect outliers

from sklearn.model_selection import train_test_split # for data partition.
from sklearn.metrics import r2_score # to identify r_square for regression model.
from sklearn.linear_model import LinearRegression # for linear regression model. 

%matplotlib inline 
import matplotlib.pylab as plt # for building and showing graphs.

import statsmodels.formula.api as sm

import dmba

from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score

In [17]:
!pip install dmba --quiet

In [18]:
# Load the Boston Housing table from the notebook's working directory.
boston_df = pd.read_csv(Path("BostonHousing.csv"))

In [19]:
# Report the table size as a (rows, columns) tuple.
shape_label = 'Dataframe dimensions, i.e. Number of rows and columns in data set:'
print(shape_label, boston_df.shape)

Dataframe dimensions, i.e. Number of rows and columns in data set: (506, 14)


In [20]:
# First five records of the raw table.
boston_df.head(n=5)

Unnamed: 0,CRIME,ZONE,INDUST,CHAR RIV,NIT OXIDE,ROOMS,AGE,DISTANCE,RADIAL,TAX,ST RATIO,LOW STAT,MVALUE,C MVALUE
0,0.00632,18.0,2.31,N,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0,No
1,0.02731,0.0,7.07,N,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6,No
2,0.02729,0.0,7.07,N,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7,Yes
3,0.03237,0.0,2.18,N,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4,Yes
4,0.06905,0.0,2.18,N,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2,Yes


In [21]:
# Last five records of the raw table.
boston_df.tail(n=5)

Unnamed: 0,CRIME,ZONE,INDUST,CHAR RIV,NIT OXIDE,ROOMS,AGE,DISTANCE,RADIAL,TAX,ST RATIO,LOW STAT,MVALUE,C MVALUE
501,0.06263,0.0,11.93,N,0.573,6.593,69.1,2.4786,1,273,21.0,9.67,22.4,No
502,0.04527,0.0,11.93,N,0.573,6.12,76.7,2.2875,1,273,21.0,9.08,20.6,No
503,0.06076,0.0,11.93,N,0.573,6.976,91.0,2.1675,1,273,21.0,5.64,23.9,No
504,0.10959,0.0,11.93,N,0.573,6.794,89.3,2.3889,1,273,21.0,6.48,22.0,No
505,0.04741,0.0,11.93,N,0.573,6.03,80.8,2.505,1,273,21.0,7.88,11.9,No


In [22]:
# Use the sample() function to retrieve a random sample of observations.
# Here we sample 5 observations without replacement. random_state pins the
# draw so the displayed rows are identical on every Restart & Run All.
boston_df.sample(n=5, random_state=1)

Unnamed: 0,CRIME,ZONE,INDUST,CHAR RIV,NIT OXIDE,ROOMS,AGE,DISTANCE,RADIAL,TAX,ST RATIO,LOW STAT,MVALUE,C MVALUE
422,12.0482,0.0,18.1,N,0.614,5.648,87.6,1.9512,24,666,20.2,14.1,20.8,No
190,0.09068,45.0,3.44,N,0.437,6.951,21.5,6.4798,5,398,15.2,5.1,37.0,Yes
205,0.13642,0.0,10.59,N,0.489,5.891,22.3,3.9454,4,277,18.6,10.87,22.6,No
365,4.55587,0.0,18.1,N,0.718,3.561,87.9,1.6132,24,666,20.2,7.12,27.5,No
378,23.6482,0.0,18.1,N,0.671,6.38,96.2,1.3861,24,666,20.2,23.69,13.1,No


In [23]:
# Show the column labels exactly as read from the CSV; several of them
# contain spaces (e.g. 'CHAR RIV', 'NIT OXIDE').
print('Original column titles:')
boston_df.columns

Original column titles:


Index(['CRIME', 'ZONE', 'INDUST', 'CHAR RIV', 'NIT OXIDE', 'ROOMS', 'AGE',
       'DISTANCE', 'RADIAL', 'TAX', 'ST RATIO', 'LOW STAT', 'MVALUE',
       'C MVALUE'],
      dtype='object')

In [24]:
# Normalise the headers: trim surrounding whitespace and replace inner
# spaces with underscores so every column label is a single word.
print('Modified column titles with no space and one word for titles:')
boston_df.columns = boston_df.columns.str.strip().str.replace(" ", "_", regex=False)
boston_df.columns

Modified column titles with no space and one word for titles:


Index(['CRIME', 'ZONE', 'INDUST', 'CHAR_RIV', 'NIT_OXIDE', 'ROOMS', 'AGE',
       'DISTANCE', 'RADIAL', 'TAX', 'ST_RATIO', 'LOW_STAT', 'MVALUE',
       'C_MVALUE'],
      dtype='object')

In [25]:
# Inspect the dtype pandas inferred for each column.
boston_df.dtypes

CRIME        float64
ZONE         float64
INDUST       float64
CHAR_RIV      object
NIT_OXIDE    float64
ROOMS        float64
AGE          float64
DISTANCE     float64
RADIAL         int64
TAX            int64
ST_RATIO     float64
LOW_STAT     float64
MVALUE       float64
C_MVALUE      object
dtype: object

In [26]:
# CHAR_RIV and C_MVALUE are plain 'object' columns at this point,
# i.e. they do not yet carry a 'category' definition.
categorical_cols = ['CHAR_RIV', 'C_MVALUE']
print('Original CHAR_RIV and C_MVALUE variables:')
print(*(boston_df[col].dtype for col in categorical_cols))

# Convert both columns to the 'category' dtype.
for col in categorical_cols:
    boston_df[col] = boston_df[col].astype('category')

# For each converted column, show its category levels and its new dtype.
for col in categorical_cols:
    print(' ')
    print('Category levels and changed variable type of ' + col + ' column:')
    print(boston_df[col].cat.categories)  # the levels the column can take
    print(boston_df[col].dtype)           # dtype is now 'category'

Original CHAR_RIV and C_MVALUE variables:
object object
 
Category levels and changed variable type of CHAR_RIV column:
Index(['N', 'Y'], dtype='object')
category
 
Category levels and changed variable type of C_MVALUE column:
Index(['No', 'Yes'], dtype='object')
category


In [29]:
# One-hot encode the categorical columns. drop_first=True keeps a single
# indicator per two-level category (CHAR_RIV_Y and C_MVALUE_Yes).
# dtype=int forces 0/1 integer indicators: recent pandas versions return
# booleans by default, which print as True/False and are left out of the
# numeric describe() summary further down.
boston_df = pd.get_dummies(boston_df, prefix_sep='_', drop_first=True, dtype=int)
print("Modified list of column variables:")
boston_df.columns

Modified list of column variables:


Index(['CRIME', 'ZONE', 'INDUST', 'NIT_OXIDE', 'ROOMS', 'AGE', 'DISTANCE',
       'RADIAL', 'TAX', 'ST_RATIO', 'LOW_STAT', 'MVALUE', 'CHAR_RIV_Y',
       'C_MVALUE_Yes'],
      dtype='object')

In [30]:
# Preview the two indicator columns produced by get_dummies().
# CHAR_RIV_Y == 0 means the original CHAR_RIV level was 'N';
# C_MVALUE_Yes == 0 means the original C_MVALUE level was 'No'.
dummy_cols = ['CHAR_RIV_Y', 'C_MVALUE_Yes']
print(boston_df[dummy_cols].head(10))

   CHAR_RIV_Y  C_MVALUE_Yes
0           0             0
1           0             0
2           0             1
3           0             1
4           0             1
5           0             0
6           0             0
7           0             0
8           0             0
9           0             0


In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns, rounded to 2 decimals.
des_df = boston_df.describe().round(2)
print("Descriptive statistics for all the columns in the modified data frame:")
print(" ")
print(des_df)
# Optional CSV export of the summary table (left disabled):
# filepath = Path("D:\BAN 620 Data MIning\Case studies\Case 1\out.csv")
# des_df.to_csv(filepath)

Descriptive statistics for all the columns in the modified data frame:
 
        CRIME    ZONE  INDUST  NIT_OXIDE   ROOMS     AGE  DISTANCE  RADIAL  \
count  506.00  506.00  506.00     506.00  506.00  506.00    506.00  506.00   
mean     3.61   11.36   11.14       0.55    6.28   68.57      3.80    9.55   
std      8.60   23.32    6.86       0.12    0.70   28.15      2.11    8.71   
min      0.01    0.00    0.46       0.38    3.56    2.90      1.13    1.00   
25%      0.08    0.00    5.19       0.45    5.89   45.02      2.10    4.00   
50%      0.26    0.00    9.69       0.54    6.21   77.50      3.21    5.00   
75%      3.68   12.50   18.10       0.62    6.62   94.07      5.19   24.00   
max     88.98  100.00   27.74       0.87    8.78  100.00     12.13   24.00   

          TAX  ST_RATIO  LOW_STAT  MVALUE  CHAR_RIV_Y  C_MVALUE_Yes  
count  506.00    506.00    506.00  506.00      506.00        506.00  
mean   408.24     18.46     12.65   22.53        0.07          0.17  
std    168.54 

PermissionError: [Errno 13] Permission denied: 'D:\\BAN 620 Data MIning\\Case studies\\Case 1\\out.csv'

In [35]:
# Non-null count per column: a column reporting fewer than the total
# number of rows would indicate missing values.
print("               Count")
print(boston_df.count())

               Count
CRIME           506
ZONE            506
INDUST          506
NIT_OXIDE       506
ROOMS           506
AGE             506
DISTANCE        506
RADIAL          506
TAX             506
ST_RATIO        506
LOW_STAT        506
MVALUE          506
CHAR_RIV_Y      506
C_MVALUE_Yes    506
dtype: int64


## Linear Regression Model

In [36]:
# Columns that must not be used as predictors; MVALUE is the outcome itself.
# NOTE(review): C_MVALUE_Yes looks like a Yes/No flag derived from MVALUE
# (in the displayed rows only the highest MVALUE records are 'Yes'). If so,
# keeping it as a predictor leaks the target and it should be listed here
# as well — confirm against the data dictionary.
excludeCol = ['MVALUE']

In [37]:
# Every column of boston_df that is not listed in excludeCol is a predictor.
predictors = [col for col in boston_df.columns if col not in excludeCol]

In [38]:
# Show the resulting predictor list for a visual check.
print(predictors)

['CRIME', 'ZONE', 'INDUST', 'NIT_OXIDE', 'ROOMS', 'AGE', 'DISTANCE', 'RADIAL', 'TAX', 'ST_RATIO', 'LOW_STAT', 'CHAR_RIV_Y', 'C_MVALUE_Yes']


In [39]:
# Name of the outcome (response) column the regression models predict.
outcome = 'MVALUE'

In [40]:
# Predictor matrix and outcome vector, followed by a 60% / 40%
# training / validation partition; random_state fixes the shuffle.
X, y = boston_df[predictors], boston_df[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.4, random_state=1)

In [41]:
# Fit the linear regression on the training partition. The bare name on
# the last line keeps the estimator repr as the cell's displayed output.
lin_model = LinearRegression().fit(train_X, train_y)
lin_model

LinearRegression()

In [45]:
# Display intercept and regression coefficients. Round them to 2 decimals.
print('Regression Model for Boston Housing Training Set')
print()
print('Intercept: ', np.round(lin_model.intercept_, 2))
print("")
# Label the coefficients with the columns lin_model was fitted on (train_X)
# rather than the global X: later cells rebind X to smaller predictor
# subsets, so re-running this cell afterwards would fail with a length
# mismatch between the labels and lin_model.coef_.
reg_coeff_df = pd.DataFrame({'Predictor': train_X.columns,
                             'Coefficient': np.round(lin_model.coef_, 2)})
print(reg_coeff_df)

Regression Model for Boston Housing Training Set

Intercept:  43.65

       Predictor  Coefficient
0          CRIME        -0.14
1           ZONE         0.01
2         INDUST         0.12
3      NIT_OXIDE       -16.47
4          ROOMS         0.89
5            AGE        -0.01
6       DISTANCE        -0.72
7         RADIAL         0.20
8            TAX        -0.01
9       ST_RATIO        -0.58
10      LOW_STAT        -0.45
11    CHAR_RIV_Y         2.11
12  C_MVALUE_Yes        10.99


In [46]:
# Mathematical equation for the linear regression model: the intercept on
# the first line, then one "coefficient * predictor" term per line.
# The terms are joined with ' +' so the last term is not followed by a
# dangling '+'.
equation_terms = [
    f'{coef} * {name}'
    for name, coef in zip(reg_coeff_df.Predictor, reg_coeff_df.Coefficient)
]
print((np.round(lin_model.intercept_, 2)), '+')
print(' +\n'.join(equation_terms))
    

43.65 +
-0.14 * CRIME +
0.01 * ZONE +
0.12 * INDUST +
-16.47 * NIT_OXIDE +
0.89 * ROOMS +
-0.01 * AGE +
-0.72 * DISTANCE +
0.2 * RADIAL +
-0.01 * TAX +
-0.58 * ST_RATIO +
-0.45 * LOW_STAT +
2.11 * CHAR_RIV_Y +
10.99 * C_MVALUE_Yes +


In [47]:
# Score the validation partition with the fitted full model.
valid_pred_y = lin_model.predict(valid_X)

# Tabulate actual MVALUE, predicted MVALUE and the residual
# (actual - predicted), rounded to 2 decimals; show the first 10 rows.
print('Actual, Predicted, and Residual Prices for Validation Set')
result = pd.DataFrame({
    'Actual': valid_y,
    'Predicted': valid_pred_y,
    'Residual': valid_y - valid_pred_y,
}).round(2)
print(result.head(10))

Actual, Predicted, and Residual Prices for Validation Set
     Actual  Predicted  Residual
307    28.2      25.53      2.67
343    23.9      22.95      0.95
47     16.6      17.89     -1.29
67     22.0      21.81      0.19
362    20.8      18.89      1.91
132    23.0      19.60      3.40
292    27.9      25.95      1.95
31     14.5      17.90     -3.40
218    21.5      22.40     -0.90
90     22.6      23.25     -0.65


In [48]:
# Score the training partition with the fitted full model.
train_pred_y = lin_model.predict(train_X)

# Tabulate actual MVALUE, predicted MVALUE and the residual
# (actual - predicted), rounded to 2 decimals; show the first 10 rows.
print('Actual, Predicted, and Residual Prices for Training Set')
result = pd.DataFrame({
    'Actual': train_y,
    'Predicted': train_pred_y,
    'Residual': train_y - train_pred_y,
}).round(2)
print(result.head(10))

Actual, Predicted, and Residual Prices for Training Set
     Actual  Predicted  Residual
452    16.1      17.10     -1.00
346    17.2      17.48     -0.28
295    28.6      26.13      2.47
88     23.6      24.93     -1.33
322    20.4      22.11     -1.71
131    19.6      19.00      0.60
124    18.8      20.34     -1.54
174    22.6      23.47     -0.87
461    17.7      18.43     -0.73
191    30.5      36.54     -6.04


In [49]:
# R-squared and adjusted R-squared on the training partition,
# each rounded to 3 decimals.
r2 = round(r2_score(train_y, train_pred_y), ndigits=3)
adj_r2 = round(adjusted_r2_score(train_y, train_pred_y, lin_model), ndigits=3)

In [50]:
# Print the r2 / adjusted r2 values currently held in r2 and adj_r2
# (set by the training-set cell directly above).
print('Prediction Performance Measures for Training Set')
for measure_label, measure_value in (('r2 : ', r2), ('Adjusted r2 : ', adj_r2)):
    print(measure_label, measure_value)

Prediction Performance Measures for Training Set
r2 :  0.839
Adjusted r2 :  0.832


In [51]:
# R-squared and adjusted R-squared on the validation partition,
# each rounded to 3 decimals. This overwrites the training-set values
# previously stored in r2 and adj_r2.
r2 = round(r2_score(valid_y, valid_pred_y), ndigits=3)
adj_r2 = round(adjusted_r2_score(valid_y, valid_pred_y, lin_model), ndigits=3)

In [53]:
# Print the r2 / adjusted r2 values currently held in r2 and adj_r2
# (set by the validation-set cell directly above).
print('Prediction Performance Measures for Validation Set')
for measure_label, measure_value in (('r2 : ', r2), ('Adjusted r2 : ', adj_r2)):
    print(measure_label, measure_value)

Prediction Performance Measures for Validation Set
r2 :  0.834
Adjusted r2 :  0.822


In [191]:
# Print the dmba regression summary (ME, RMSE, MAE, MPE, MAPE) for the
# training partition and then for the validation partition.
for position, (label, actual, predicted) in enumerate([
        ('Training', train_y, train_pred_y),
        ('Validation', valid_y, valid_pred_y)]):
    if position:
        print()  # blank line separating the two reports
    print('Common Accuracy Measures for ' + label + ' Set - All Variables')
    regressionSummary(actual, predicted)

Common Accuracy Measures for Training Set - All Variables

Regression statistics

                      Mean Error (ME) : 0.0000
       Root Mean Squared Error (RMSE) : 3.5845
            Mean Absolute Error (MAE) : 2.5961
          Mean Percentage Error (MPE) : -2.7127
Mean Absolute Percentage Error (MAPE) : 13.1715

Common Accuracy Measures for Validation Set - All Variables

Regression statistics

                      Mean Error (ME) : 0.4347
       Root Mean Squared Error (RMSE) : 3.8763
            Mean Absolute Error (MAE) : 2.7696
          Mean Percentage Error (MPE) : -2.2773
Mean Absolute Percentage Error (MAPE) : 13.3233


## Exhaustive Search Algorithm

In [54]:
# Model builder handed to exhaustive_search(): one model per predictor subset.
def train_model(variables):
    """Fit a linear regression on the training partition.

    variables: the column labels of train_X to use as predictors; the
    search calls this with subsets of different sizes.
    Returns the fitted LinearRegression.
    """
    return LinearRegression().fit(train_X[variables], train_y)

# Scoring function handed to exhaustive_search().
def score_model(model, variables):
    """Score a fitted model by its adjusted R-squared on the training set.

    model: a model returned by train_model() for `variables`.
    variables: the train_X column labels the model was fitted on.
    Returns the negated adjusted R-squared; the sign is flipped because
    the search is meant to keep the lowest score.
    """
    fitted_y = model.predict(train_X[variables])
    training_adj_r2 = adjusted_r2_score(train_y, fitted_y, model)
    return -training_adj_r2

# Candidate predictors: every column of the training partition train_X
# (the 13 predictors listed earlier, on the 60% training split).
allVariables = train_X.columns

# exhaustive_search() takes three arguments:
# - allVariables: all candidate variables in the training data set,
# - train_model(): builds a model for a specific combination of variables,
# - score_model(): scores a model (negated adjusted R-squared here).
# It returns one entry per subset size, read below through the keys
# 'n', 'variables', 'model' and 'score'.
search_results = exhaustive_search(allVariables, train_model, score_model)

# For each returned entry record the number of variables (n), the adjusted
# R-squared (score with the sign flipped back), the AIC on the training
# set, and a True/False flag per candidate variable telling whether it is
# part of that model. The loop variable is named `entry` so it does not
# clobber the global `result` table used by the prediction cells.
data = []
for entry in search_results:
    entry_model = entry['model']
    entry_variables = entry['variables']
    AIC = AIC_score(train_y, entry_model.predict(train_X[entry_variables]), entry_model)
    row = {'n': entry['n'], 'r2adj': -entry['score'], 'AIC': AIC}
    row.update({var: var in entry_variables for var in allVariables})
    data.append(row)

# Summary table of the Exhaustive Search results.
results = pd.DataFrame(data, columns=('n', 'r2adj', 'AIC') + tuple(sorted(allVariables)))

# Widen the console output only while printing this table; option_context
# restores the previous display width afterwards, even if printing fails.
with pd.option_context('display.width', 100):
    print(results)

     n     r2adj          AIC    AGE  CHAR_RIV_Y  CRIME  C_MVALUE_Yes  DISTANCE  INDUST  LOW_STAT  \
0    1  0.604171  1911.931006  False       False  False          True     False   False     False   
1    2  0.793030  1716.454818  False       False  False          True     False   False      True   
2    3  0.804621  1699.980252  False       False   True          True     False   False      True   
3    4  0.810106  1692.338134  False        True   True          True     False   False      True   
4    5  0.814675  1685.940421  False        True   True          True     False   False      True   
5    6  0.821125  1676.183684  False       False   True          True      True   False      True   
6    7  0.826096  1668.619824  False       False   True          True      True   False      True   
7    8  0.830529  1661.766431  False        True   True          True      True   False      True   
8    9  0.831819  1660.418877  False        True   True          True      True   False    

In [56]:
# filepath = Path("D:\BAN 620 Data MIning\Case studies\Case 1\Ex_results.csv")
# results.to_csv(filepath)

In [196]:
# Refit the multiple linear regression on the predictor subset picked
# from the Exhaustive Search table.
# NOTE(review): C_MVALUE_Yes looks like a flag derived from MVALUE; if so
# it leaks the outcome into the predictors — confirm before keeping it.
predictors_ex = [
    'CHAR_RIV_Y', 'CRIME', 'C_MVALUE_Yes',
    'DISTANCE', 'INDUST', 'LOW_STAT',
    'NIT_OXIDE', 'RADIAL', 'ROOMS', 'ST_RATIO', 'TAX',
]
outcome = 'MVALUE'

# Same 60% / 40% partition settings as the full model
# (test_size=0.4, random_state=1), applied to the reduced predictor set.
X, y = boston_df[predictors_ex], boston_df[outcome]
train_X_ex, valid_X_ex, train_y_ex, valid_y_ex = train_test_split(
    X, y, test_size=0.4, random_state=1)

# boston_df_ex holds the fitted regression model (not a data frame).
boston_df_ex = LinearRegression().fit(train_X_ex, train_y_ex)

# Intercept and coefficients, rounded to 2 decimals.
print('Regression Model for Training Set Using Exhaustive Search')
print()
print('Intercept ', np.round(boston_df_ex.intercept_, 2))
regex_coeff_df = pd.DataFrame({
    'Predictor': X.columns,
    'Coefficient': np.round(boston_df_ex.coef_, 2),
})
print(regex_coeff_df)


Regression Model for Training Set Using Exhaustive Search

Intercept  43.89
       Predictor  Coefficient
0     CHAR_RIV_Y         2.13
1          CRIME        -0.14
2   C_MVALUE_Yes        11.11
3       DISTANCE        -0.63
4         INDUST         0.11
5       LOW_STAT        -0.46
6      NIT_OXIDE       -16.89
7         RADIAL         0.19
8          ROOMS         0.86
9       ST_RATIO        -0.61
10           TAX        -0.01


In [199]:
# Mathematical equation for the linear regression model based on the
# Exhaustive Search: the intercept on the first line, then one
# "coefficient * predictor" term per line. The terms are joined with ' +'
# so the last term is not followed by a dangling '+'.
equation_terms_ex = [
    f'{coef} * {name}'
    for name, coef in zip(regex_coeff_df.Predictor, regex_coeff_df.Coefficient)
]
print((np.round(boston_df_ex.intercept_, 2)), '+')
print(' +\n'.join(equation_terms_ex))

43.89 +
2.13 * CHAR_RIV_Y +
-0.14 * CRIME +
11.11 * C_MVALUE_Yes +
-0.63 * DISTANCE +
0.11 * INDUST +
-0.46 * LOW_STAT +
-16.89 * NIT_OXIDE +
0.19 * RADIAL +
0.86 * ROOMS +
-0.61 * ST_RATIO +
-0.01 * TAX +


In [200]:
# Score the validation partition with the Exhaustive Search model.
valid_ex_pred_y = boston_df_ex.predict(valid_X_ex)

# Actual MVALUE, predicted MVALUE and residual (actual - predicted),
# rounded to 2 decimals.
result = pd.DataFrame({
    'Actual': valid_y_ex,
    'Predicted': valid_ex_pred_y,
    'Residual': valid_y_ex - valid_ex_pred_y,
}).round(2)
print()
print('Prediction for Validation Set Using Exhaustive Search')
print(result.head(10))

# dmba regression summary (ME, RMSE, MAE, MPE, MAPE) for the validation set.
print()
print('Accuracy Measures for Validation Set Using Exhaustive Search')
regressionSummary(valid_y_ex, valid_ex_pred_y)


Prediction for Validation Set Using Exhaustive Search
     Actual  Predicted  Residual
307    28.2      25.24      2.96
343    23.9      22.78      1.12
47     16.6      18.17     -1.57
67     22.0      21.86      0.14
362    20.8      18.93      1.87
132    23.0      19.58      3.42
292    27.9      25.25      2.65
31     14.5      18.06     -3.56
218    21.5      22.49     -0.99
90     22.6      23.28     -0.68

Accuracy Measures for Validation Set Using Exhaustive Search

Regression statistics

                      Mean Error (ME) : 0.4505
       Root Mean Squared Error (RMSE) : 3.8674
            Mean Absolute Error (MAE) : 2.7724
          Mean Percentage Error (MPE) : -2.1963
Mean Absolute Percentage Error (MAPE) : 13.3441


## Forward Selection Algorithm

In [202]:
def train_model(variables):
    """Fit a linear regression on the training partition for Forward Selection.

    This redefines the train_model() used by the exhaustive search, adding
    support for an empty variable list.

    variables: the train_X column labels to use as predictors.
    Returns the fitted LinearRegression, or None when `variables` is empty.
    """
    if len(variables) > 0:
        return LinearRegression().fit(train_X[variables], train_y)
    return None

# Scoring function handed to forward_selection(); it replaces the
# adjusted-R-squared scorer defined for the exhaustive search.
def score_model(model, variables):
    """Return the AIC of a model on the training partition.

    model: the object returned by train_model() for `variables`.
    variables: the train_X column labels the model uses.
    With an empty `variables` the AIC is computed for a constant
    prediction equal to the mean of train_y, passing df=1 to AIC_score.
    """
    if len(variables) > 0:
        return AIC_score(train_y, model.predict(train_X[variables]), model)
    mean_only_pred = [train_y.mean()] * len(train_y)
    return AIC_score(train_y, mean_only_pred, model, df=1)

# Run Forward Selection over all train_X columns. verbose=True makes
# forward_selection() log the score after each step.
selection_outcome = forward_selection(
    train_X.columns, train_model, score_model, verbose=True)
best_model_fs, best_variables_fs = selection_outcome

# Show the selected variables and the corresponding model.
print()
print('Best Variables from Forward Selection Algorithm')
print(best_variables_fs)
print(best_model_fs)

Variables: CRIME, ZONE, INDUST, NIT_OXIDE, ROOMS, AGE, DISTANCE, RADIAL, TAX, ST_RATIO, LOW_STAT, CHAR_RIV_Y, C_MVALUE_Yes
Start: score=2191.75, constant
Step: score=1911.93, add C_MVALUE_Yes
Step: score=1716.45, add LOW_STAT
Step: score=1699.98, add CRIME
Step: score=1692.34, add CHAR_RIV_Y
Step: score=1685.94, add ST_RATIO
Step: score=1682.90, add ROOMS
Step: score=1680.20, add DISTANCE
Step: score=1665.78, add NIT_OXIDE
Step: score=1660.42, add RADIAL
Step: score=1660.42, add None

Best Variables from Forward Selection Algorithm
['C_MVALUE_Yes', 'LOW_STAT', 'CRIME', 'CHAR_RIV_Y', 'ST_RATIO', 'ROOMS', 'DISTANCE', 'NIT_OXIDE', 'RADIAL']
LinearRegression()


In [206]:
# Refit the multiple linear regression on the variables returned by
# Forward Selection.
predictors_fs = best_variables_fs
outcome = 'MVALUE'

# Same 60% / 40% partition settings as the full model
# (test_size=0.4, random_state=1), applied to the selected predictors.
X, y = boston_df[predictors_fs], boston_df[outcome]
train_X_fs, valid_X_fs, train_y_fs, valid_y_fs = train_test_split(
    X, y, test_size=0.4, random_state=1)

# Fit the regression on the training partition.
boston_fs = LinearRegression().fit(train_X_fs, train_y_fs)

# Intercept and coefficients, rounded to 2 decimals.
print('Regression Model for Training Set Using Forward Selection')
print()
print('Intercept ', np.round(boston_fs.intercept_, 2))
reg_fs_coeff = pd.DataFrame({
    'Predictor': X.columns,
    'Coefficient': np.round(boston_fs.coef_, 2),
})
print(reg_fs_coeff)

Regression Model for Training Set Using Forward Selection

Intercept  42.76
      Predictor  Coefficient
0  C_MVALUE_Yes        10.97
1      LOW_STAT        -0.45
2         CRIME        -0.14
3    CHAR_RIV_Y         2.36
4      ST_RATIO        -0.60
5         ROOMS         0.87
6      DISTANCE        -0.71
7     NIT_OXIDE       -15.95
8        RADIAL         0.11


In [207]:
# Mathematical equation for the linear regression model based on Forward
# Selection: the intercept on the first line, then one
# "coefficient * predictor" term per line. The terms are joined with ' +'
# so the last term is not followed by a dangling '+'.
equation_terms_fs = [
    f'{coef} * {name}'
    for name, coef in zip(reg_fs_coeff.Predictor, reg_fs_coeff.Coefficient)
]
print((np.round(boston_fs.intercept_, 2)), '+')
print(' +\n'.join(equation_terms_fs))

42.76 +
10.97 * C_MVALUE_Yes +
-0.45 * LOW_STAT +
-0.14 * CRIME +
2.36 * CHAR_RIV_Y +
-0.6 * ST_RATIO +
0.87 * ROOMS +
-0.71 * DISTANCE +
-15.95 * NIT_OXIDE +
0.11 * RADIAL +


In [208]:
# Score the validation partition with the Forward Selection model.
valid_fs_pred_y = boston_fs.predict(valid_X_fs)

# Actual MVALUE, predicted MVALUE and residual (actual - predicted),
# rounded to 2 decimals.
result = pd.DataFrame({
    'Actual': valid_y_fs,
    'Predicted': valid_fs_pred_y,
    'Residual': valid_y_fs - valid_fs_pred_y,
}).round(2)
print()
print('Predictions for Validation Set Using Forward Selection')
print(result.head(10))

# dmba regression summary (ME, RMSE, MAE, MPE, MAPE) for the validation set.
print()
print('Accuracy Measures for Validation Set Using Forward Selection')
regressionSummary(valid_y_fs, valid_fs_pred_y)


Predictions for Validation Set Using Forward Selection
     Actual  Predicted  Residual
307    28.2      25.20      3.00
343    23.9      23.48      0.42
47     16.6      17.85     -1.25
67     22.0      22.11     -0.11
362    20.8      18.97      1.83
132    23.0      19.27      3.73
292    27.9      25.07      2.83
31     14.5      18.21     -3.71
218    21.5      22.04     -0.54
90     22.6      23.86     -1.26

Accuracy Measures for Validation Set Using Forward Selection

Regression statistics

                      Mean Error (ME) : 0.4321
       Root Mean Squared Error (RMSE) : 3.9314
            Mean Absolute Error (MAE) : 2.8585
          Mean Percentage Error (MPE) : -2.3792
Mean Absolute Percentage Error (MAPE) : 13.8040
