In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.simplefilter("ignore")
import seaborn as sns
#sns.set(style="whitegrid")

from sklearn.model_selection import train_test_split                              # To split the data in training and testing part
from sklearn.preprocessing import StandardScaler                                  # Importing Standard Scaler library from preprocessing.
from sklearn.linear_model import LinearRegression                                 # Importing Linear Regression model
from sklearn.metrics import mean_squared_error                                    # To calculate the MSE of a regression model
from sklearn.metrics import mean_absolute_error                                   # To calculate the MAE of a regression model
from sklearn.metrics import r2_score

In [2]:
np.random.seed(72018)

In [4]:
# User-defined functions
def to_2d(array):
    return array.reshape(array.shape[0], -1)

def plot_exponential_data():
    data = np.exp(np.random.normal(size=1000))
    plt.hist(data)
    plt.show()
    return data

def plot_square_normal_data():
    data = np.square(np.random.normal(loc=5, size=1000))
    plt.hist(data)
    plt.show()
    return data

In [5]:
def boston_dataframe(description=False):

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]
    target = to_2d(target)
    data_all = np.concatenate([data, target], axis=1)

    columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT','MEDV']
    return pd.DataFrame(data=data_all, columns=columns)

In [6]:
boston_data = boston_dataframe()

In [7]:
boston_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


* CRIM - per capita crime rate by town
* ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
* INDUS - proportion of non-retail business acres per town.
* CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
* NOX - nitric oxides concentration (parts per 10 million)
* RM - average number of rooms per dwelling
* AGE - proportion of owner-occupied units built prior to 1940
* DIS - weighted distances to five Boston employment centres
* RAD - index of accessibility to radial highways
* TAX - full-value property-tax rate per $10,000
* PTRATIO - pupil-teacher ratio by town
* B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town

* LSTAT - % lower status of the population
* MEDV - Median value of owner-occupied homes in $1000's- The target variable

<b> The task is to fit a model which will determine the median price of homes in Boston based on appropriate input parameters.

In [8]:
boston_data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


**Lets fit our data to Multiple Linear regression model.**

First thing we will split our data into training and test set.


In [9]:
# data separated from main dataframe for multiple linear regression
X = boston_data.drop(labels=['MEDV'], axis=1)
y = boston_data['MEDV']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


In [10]:
mlr = LinearRegression()
mlr.fit(X_train, y_train)

y_pred = mlr.predict(X_test)
y_pred_train = mlr.predict(X_train)

In [11]:
train_r2 = r2_score(y_train, y_pred_train)
print("R2 Score : train data")
print( train_r2)

adj_r2_train = 1 - (1-train_r2)*(X_train.shape[0]-1)/(X_train.shape[0]-X_train.shape[1]-1)
print("\nAdjusted  R2 Score : train data")
print( adj_r2_train)

R2 Score : train data
0.7103879080674731

Adjusted  R2 Score : train data
0.6993145045524058


In [12]:
train_mae = mean_absolute_error(y_train, y_pred_train)
print("\nMAE : train data")
print( train_mae)

train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("\nRMSE : train data")
print( train_rmse)


MAE : train data
3.3444361206579796

RMSE : train data
4.849055005805464


**Lets fit our test data on the model**

In [13]:
mlr_test = LinearRegression()
mlr_test.fit(X_test, y_test)

y_pred = mlr_test.predict(X_test)

test_r2 = r2_score(y_test, y_pred)

print("R2 Score : test data  ")
print(test_r2)

adj_r2_test = 1 - (1-test_r2)*(X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1)
print("\nAdjusted  R2 Score : test data   ")
print(adj_r2_test)

test_mae = mean_absolute_error(y_test, y_pred)
print("\nMAE : test data   ")
print(test_mae)

test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("\nRMSE : test data  ")
print(test_rmse)

R2 Score : test data  
0.8446908413340615

Adjusted  R2 Score : test data   
0.8300602684162557

MAE : test data   
2.929060830001612

RMSE : test data  
3.7729009104055136


**Based on the values above we can say that our model is neither overfitting nor underfitting.**