In [25]:
import pandas as pd
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [26]:
# NOTE: reading a local CSV is a stand-in; the same frame could instead be pulled from the database.

# Read the Boston housing CSV into a dataframe, report its shape and preview the first ten rows.
boston_df = pd.read_csv(filepath_or_buffer='boston_housing.csv')
print (f'Boston dataset at the beginning: {boston_df.shape}')
boston_df.head(n=10)

Boston dataset at the beginning: (511, 14)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


In [27]:
# Count the missing values in every column (isna performs the same check as isnull).
boston_df.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         5
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [28]:
# Discard every row that contains at least one missing value, then report the new shape.
boston_df = boston_df.dropna(axis='index', how='any')
print (f'Boston dataset after dropping null values: {boston_df.shape}')

Boston dataset after dropping null values: (506, 14)


In [29]:
# Inspect the dtype of every column. The saved output lists only float64 and int64 columns,
# so no encoding or type conversion is required before modelling.
boston_df.dtypes

CRIM       float64
ZN         float64
INDUS      float64
CHAS         int64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX          int64
PTRATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object

In [30]:
# MEDV is the regression target; every remaining column is used as a feature.
Y = boston_df['MEDV']
X = boston_df.drop('MEDV', axis='columns')
print (f'Separating features and target: {X.shape} | {Y.shape}')

Separating features and target: (506, 13) | (506,)


In [31]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=100,
)
print(f'After splitting data: X train:{X_train.shape}, Y train: {Y_train.shape}, \
X test: {X_test.shape}, Y test: {Y_test.shape}')

After splitting data: X train:(404, 13), Y train: (404,), X test: (102, 13), Y test: (102,)


In [32]:
# Create a linear regression estimator and train it on the training split.
# The fit call is left as the cell's last expression so the fitted estimator is displayed.
lin_model = LinearRegression()
lin_model.fit(X=X_train, y=Y_train)

LinearRegression()

In [33]:
# Predict the target for the held-out test features with the fitted linear model.
# NOTE: the name Y_pred is reassigned by the random-forest cell further down, so re-run this
# cell before re-evaluating the linear model's predictions.
Y_pred = lin_model.predict(X_test)

In [34]:
# Print the fitted intercept and the coefficient array. The array has one entry per feature
# column of X_train, in column order (the first value matches the CRIM row of the OLS summary).
print (f'Model intercept: {lin_model.intercept_}, Model coefficients: {lin_model.coef_}')

Model intercept: 19.769695040117817, Model coefficients: [-1.73718114e-01  5.00236245e-02 -1.71249414e-02  3.60120412e+00
 -1.73636858e+01  5.81396262e+00 -5.59968595e-02 -1.93145874e+00
  2.21743501e-01 -1.30579236e-02 -6.08422755e-01  9.02413267e-03
 -1.01329327e-01]


In [37]:
# Print the test-set MSE and the R^2 scores of the linear model.

# Mean squared error on the held-out test set (this is MSE, not RMSE: no square root is taken).
print (f'Mean squared error: {mean_squared_error(Y_test, Y_pred):.2f}')

# Goodness of fit: for a regressor, score() returns the coefficient of determination (R^2),
# which the labels below call "accuracy".
print (f'Training accuracy: {lin_model.score(X_train, Y_train):.2f}')
print (f'Testing accuracy: {lin_model.score(X_test, Y_test):.2f}')

Mean squared error: 16.05
Training accuracy: 0.60
Testing accuracy: 0.75


In [38]:
# The sklearn linear model fit doesn't give p-value information. To get this info and display
# it in a table, the same regression is refit with statsmodels OLS instead.
# Source: https://stackoverflow.com/questions/27928275/find-p-value-significance-in-scikit-learn-linearregression
# statsmodels is imported as `sm` in the notebook's import cell, so a fresh
# Restart & Run All surfaces a missing dependency immediately.

# OLS does not add an intercept on its own; add_constant supplies the 'const' column.
X2_train = sm.add_constant(X_train)
lin_mod = sm.OLS(Y_train,X2_train)

# Fit on the training split only and print the text summary (coefficients, std errors, p-values).
fii = lin_mod.fit()
summary = fii.summary2()
print (summary)

                 Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.585    
Dependent Variable: MEDV             AIC:                2649.9501
Date:               2021-09-05 01:12 BIC:                2705.9699
No. Observations:   404              Log-Likelihood:     -1311.0  
Df Model:           13               F-statistic:        44.79    
Df Residuals:       390              Prob (F-statistic): 4.00e-69 
R-squared:          0.599            Scale:              39.940   
-------------------------------------------------------------------
              Coef.    Std.Err.     t     P>|t|    [0.025    0.975]
-------------------------------------------------------------------
const         19.7697    7.3766   2.6800  0.0077    5.2668  34.2726
CRIM          -0.1737    0.0463  -3.7492  0.0002   -0.2648  -0.0826
ZN             0.0500    0.0207   2.4195  0.0160    0.0094   0.0907
INDUS         -0.0171    0.0930  -0.1841  0.8540   -0.2000   0.1658
CHAS  

In [39]:
# Now, instantiate the Random Forest model.
# NOTE: criterion='mae' is the spelling accepted by scikit-learn releases before 1.0. Newer
# releases call it 'absolute_error' and 1.2+ reject 'mae', so update the string when upgrading.
rf_model = RandomForestRegressor(criterion='mae', n_estimators=128, random_state=50)

# Fit the estimator created above. The cell previously called an undefined `brf_model`,
# which raises NameError on a fresh kernel.
rf_model.fit(X_train, Y_train)

# Y_pred now holds the random-forest predictions, replacing the linear model's.
Y_pred = rf_model.predict(X_test)

# The mean squared error
print (f'Mean squared error: {mean_squared_error(Y_test, Y_pred):.2f}')

# Goodness of fit: for a regressor, score() returns R^2, which the labels below call "accuracy".
print (f'Training accuracy: {rf_model.score(X_train, Y_train):.2f}')
print (f'Testing accuracy: {rf_model.score(X_test, Y_test):.2f}')

Mean squared error: 7.99
Training accuracy: 0.97
Testing accuracy: 0.87
