In [1]:
################################################################################################################################################
################################################################################################################################################
############################################# PROGRAM TO CREATE LINEAR REGRESSION MODEL ########################################################
################################################################################################################################################
################################################################################################################################################

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats


In [2]:
############################## Load the housing data from a whitespace-separated text file ##############################
# Labels assigned to the columns, in file order (the file is read with header=None,
# so its first line is treated as data, not as a header).
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

""" Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's
 """
# Fields are split on runs of whitespace, hence the regular-expression separator.
data_boston = pd.read_csv(
    "../data/bostonhousingdata.csv",
    sep=r"\s+",
    header=None,
    names=column_names,
)
data_boston.head(20)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311.0,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311.0,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311.0,15.2,386.71,17.1,18.9


In [3]:
########################## Independent attributes: every column except the target (MEDV) ##########################
# drop() returns a new frame, so data_boston itself keeps the MEDV column.
df = data_boston.drop("MEDV", axis=1)
df.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222.0,18.7,394.12,5.21
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311.0,15.2,395.6,12.43
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311.0,15.2,396.9,19.15
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311.0,15.2,386.63,29.93
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311.0,15.2,386.71,17.1


In [4]:
# Dependent attribute: MEDV, selected with a one-element list so the result is a
# single-column DataFrame rather than a Series.
target = data_boston.loc[:, ["MEDV"]]
target.head(10)

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
5,28.7
6,22.9
7,27.1
8,16.5
9,18.9


In [5]:
###################################################################################################################################
############################### Evaluate the model on separate training and testing partitions ###################################
###################################################################################################################################

from sklearn.model_selection import train_test_split

################### Hold out 15% of the rows as test data; the fixed seed keeps the split repeatable ##############################
x_train, x_test, y_train, y_test = train_test_split(
    df,
    target,
    random_state=10,
    test_size=0.15,
)

In [6]:
# Row counts of the two partitions produced by the split.
print("Number of test samples :", len(x_test))
print("Number of training samples:", len(x_train))

Number of test samples : 76
Number of training samples: 430


In [7]:
############################ Train a linear regression model on the training partition ########################################
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained.
lm_train = LinearRegression().fit(x_train, y_train)
lm_train


In [8]:
# Learned parameters of the fitted model: the intercept first, then the coefficients.
for heading, values in (
    ('Intercept: \n', lm_train.intercept_),
    ('Coefficients: \n', lm_train.coef_),
):
    print(heading, values)

Intercept: 
 [35.72051582]
Coefficients: 
 [[-1.11794576e-01  6.18749173e-02 -3.70418899e-03  2.15669708e+00
  -1.59421272e+01  3.26720866e+00  9.44918091e-03 -1.48988233e+00
   2.96772087e-01 -1.18666329e-02 -8.27560818e-01  1.09551959e-02
  -5.60250248e-01]]


In [9]:
# Tabulate the fitted parameters: the intercept is placed first, followed by one
# row per feature column of df. Both arrays are flattened before being joined.
coeff_df = pd.DataFrame(
    {
        'Variable': np.concatenate([['Intercept'], df.columns.to_numpy()]),
        'Coefficient': np.concatenate(
            [np.ravel(lm_train.intercept_), np.ravel(lm_train.coef_)]
        ),
    }
)
coeff_df

Unnamed: 0,Variable,Coefficient
0,Intercept,35.720516
1,CRIM,-0.111795
2,ZN,0.061875
3,INDUS,-0.003704
4,CHAS,2.156697
5,NOX,-15.942127
6,RM,3.267209
7,AGE,0.009449
8,DIS,-1.489882
9,RAD,0.296772


In [10]:
################################ Predictions for the rows the model was trained on ############################################
predictions_train = lm_train.predict(x_train)
# Show only the first five predictions.
print(predictions_train[:5])

[[38.50466036]
 [ 8.24724764]
 [18.28981594]
 [30.23698109]
 [22.67291665]]


In [11]:
################################ Predictions for the held-out test rows ########################################################
predictions_test = lm_train.predict(x_test)
# Show only the first five predictions.
print(predictions_test[:5])

[[30.88964438]
 [31.80456557]
 [30.77368612]
 [22.23594312]
 [18.66336652]]


In [12]:
################################ Training-data accuracy: mean squared error ###################################################
from sklearn.metrics import mean_squared_error

mse_train = mean_squared_error(y_true=y_train, y_pred=predictions_train)
print("Mean square error is", mse_train)

Mean square error is 19.29593358353971


In [13]:
# Root mean squared error: the square root of the training MSE.
rmse_train = np.sqrt(
    mean_squared_error(y_true=y_train, y_pred=predictions_train)
)
print(f"Root Mean Squared Error (RMSE): {rmse_train}")

Root Mean Squared Error (RMSE): 4.39271369241608


In [14]:
from sklearn.metrics import mean_absolute_error

# Mean absolute difference between actual and predicted values on the training rows.
mae_train = mean_absolute_error(y_true=y_train, y_pred=predictions_train)
print(f"Mean Absolute Error (MAE): {mae_train}")

Mean Absolute Error (MAE): 3.080819594596147


In [15]:
from sklearn.metrics import r2_score

# Coefficient of determination (R-squared) on the training rows.
r_squared_train = r2_score(y_true=y_train, y_pred=predictions_train)
print(f"R-squared: {r_squared_train}")

R-squared: 0.7469674747271953


In [16]:
# Note: lm_train.score(x_train, y_train) returned the same value as r2_score above,
# so the equivalent call below is left disabled.
# print("R-Square value using training data is", lm_train.score(x_train, y_train))

In [17]:
################################ Test-data accuracy: MSE, RMSE, MAE and R-squared ##############################################
mse_test = mean_squared_error(y_true=y_test, y_pred=predictions_test)
print("Mean square error is", mse_test)

# RMSE is the square root of the MSE computed just above.
rmse_test = np.sqrt(mse_test)
print(f"Root Mean Squared Error (RMSE): {rmse_test}")

mae_test = mean_absolute_error(y_true=y_test, y_pred=predictions_test)
print(f"Mean Absolute Error (MAE): {mae_test}")

print("R-Square value using test data is", r2_score(y_true=y_test, y_pred=predictions_test))

Mean square error is 38.401262752112544
Root Mean Squared Error (RMSE): 6.19687524096722
Mean Absolute Error (MAE): 4.278964810102529
R-Square value using test data is 0.6622047563270379


In [18]:
###############################################################################################################################
############################################### K-Fold Cross Validation #######################################################
###############################################################################################################################
from sklearn.model_selection import cross_val_score, KFold

# Fit on the full data set with the features first and the target second.
# (The arguments were previously swapped, which silently trained a model that
# predicted the 13 feature columns from MEDV.) cross_val_score fits its own
# clone of the estimator for every fold, so this fit does not affect the
# scores below; it only leaves `lm` trained on all rows.
lm = LinearRegression()
lm.fit(df, target)

# Shuffle the rows before folding, with a fixed seed so the fold assignment --
# and therefore the scores -- are the same on every run.
rcross = cross_val_score(
    lm,
    df,
    target,
    cv=KFold(n_splits=5, shuffle=True, random_state=10),
)

############ R-Square values for each of the folds and their mean #################
print(rcross)
print("The mean of the folds are", rcross.mean())

[0.71223343 0.75742729 0.7942689  0.63303661 0.66985438]
The mean of the folds are 0.7133641239546289
