## Linear Regression on Boston Housing Dataset

In [23]:
#Import libraries for analysis

import pandas as pd
import numpy as np

# for visualizing data
import matplotlib.pyplot as plt
import seaborn as sns

# For randomized data splitting
from sklearn.model_selection import train_test_split

# To build linear regression models with statsmodels
import statsmodels.api as sm

# To check model performance
from sklearn.metrics import mean_absolute_error, mean_squared_error

# For linear regression model
from sklearn import linear_model


In [24]:
# Read the Boston housing data from the CSV file (relative path) into a
# DataFrame, then show the frame as this cell's output.
boston = pd.read_csv(filepath_or_buffer="boston.csv")
boston

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,6.48,22.0


In [25]:
# Separate the target from the predictors.
# y: the MEDV column, i.e. the value the model is trained to predict.
y = boston["MEDV"]
# X: every other column of the frame (MEDV removed).
X = boston.drop(columns=["MEDV"])

In [26]:
# Display the target Series as the cell output to sanity-check its values and length.
y

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: MEDV, Length: 506, dtype: float64

In [27]:
# Create an unfitted scikit-learn linear regression estimator with default
# settings ("reg" is short for regression); it is trained in a later cell.
reg = linear_model.LinearRegression()

In [28]:
# Hold out 33% of the rows for testing and train on the remaining 67%.
# A fixed random_state keeps the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [29]:
# Fit the regression on the training split only; the test split stays unseen
# until evaluation.
reg.fit(X=X_train, y=y_train)

In [30]:
# Print the fitted coefficients (weights), one per feature column of X and in
# the same order as those columns.
print(reg.coef_)

[-1.41995876e-01  3.82566053e-02  4.56104214e-02  3.58692385e+00
 -1.68215690e+01  3.74518111e+00 -1.08468868e-02 -1.42707547e+00
  2.14335914e-01 -8.72764853e-03 -9.08702152e-01 -5.72774312e-01]


In [31]:
# Predict MEDV for every row of the held-out test features.
y_pred = reg.predict(X=X_test)
# Show the predictions so they can be compared with the actual values.
print(y_pred)

[28.45759328 36.94034798 15.01130441 25.50243962 17.6994546  23.19263435
 17.07927304 13.50682586 22.12763554 20.22919805 24.89432993 18.19463203
 -4.76112302 21.66399545 19.16792639 25.22641178 20.15908195  4.67445236
 40.21025105 16.26278197 27.4458162  30.13928585 10.94447697 23.81055962
 17.8167994  15.56475334 23.09054815 17.65324878 22.55345237 19.01778652
 22.07767352 24.97038281 23.77813661 17.8244919  16.26856914 18.38364618
 30.71918632 20.02809555 24.28457502 25.02219105 13.57872901 31.58962887
 42.38669624 17.60680932 26.97575697 16.1307151  13.95728454 26.1469104
 19.25175658 29.99763503 20.96854231 33.98618189 15.15059525 25.80755931
 38.94749052 22.09136736 17.69047137 33.12200938 24.69622419 12.69886112
 22.20887784 30.59532443 31.60056863 17.52413645 21.2083156  16.79308434
 19.09947202 26.14159757 31.19403168 15.29964473 20.31961475 26.61983513
 11.37894864 17.15755231 23.90447243  3.68746902 21.25338391 41.15990604
 16.75358389  9.65615963 21.20185266 12.78353272 21.

In [32]:
# Print the actual MEDV values of the test split for comparison with y_pred.
print(y_test)

173    23.6
274    32.4
491    13.6
72     22.8
452    16.1
       ... 
110    21.7
321    23.1
265    22.8
29     21.0
262    48.8
Name: MEDV, Length: 167, dtype: float64


In [33]:
# Mean squared error computed by hand: square each prediction error
# (predicted minus actual) and average the squares over the test set.
((y_pred - y_test) ** 2).mean()


20.46002262243072

In [34]:
# Cross-check the hand-computed mean squared error against scikit-learn's
# implementation.
print(mean_squared_error(y_true=y_test, y_pred=y_pred))

20.46002262243072


In [35]:
# The mean squared error should be as close to 0 as possible: the lower it is,
# the better the model's predictions.

In [36]:
# Mean absolute error: the average absolute gap between actual and predicted
# MEDV on the test set, in the same units as MEDV.
print(mean_absolute_error(y_true=y_test, y_pred=y_pred))

3.1737424602225723
