# Regression Metrics Exercise (Core)
- **Student:** Michael McCann
- **Date:** 28 FEB 2022

## Setup - Mount Drive, Import Libraries and Data

In [1]:
## Mount Drive
# Colab-only step: exposes Google Drive under /content/drive so the dataset
# CSV can be read by path in a later cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
## Import Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn import set_config
# Ask scikit-learn to render estimators/pipelines as diagrams in notebook output
set_config(display='diagram')

In [3]:
## Load and Inspect the Data
# Path to the Boston housing CSV on the mounted Google Drive
boston_filepath = '/content/drive/MyDrive/Data/Boston_Housing_from_Sklearn.csv'
bos_df = pd.read_csv(filepath_or_buffer=boston_filepath)


## Data Dictionary

- CRIM: per capita crime rate by town
- NOX: nitric oxides concentration (parts per 10 million)
- RM: average number of rooms per dwelling
- AGE: proportion of owner-occupied units built prior to 1940
- PTRATIO: pupil-teacher ratio by town
- LSTAT: % lower economic status of the population
- PRICE: Median value of owner-occupied homes in $1000's


In [4]:
# Preview the first five rows, then list each column's dtype and non-null count
display(bos_df.head(n=5))
bos_df.info()

Unnamed: 0,CRIM,NOX,RM,AGE,PTRATIO,LSTAT,PRICE
0,0.00632,0.538,6.575,65.2,15.3,4.98,24.0
1,0.02731,0.469,6.421,78.9,17.8,9.14,21.6
2,0.02729,0.469,7.185,61.1,17.8,4.03,34.7
3,0.03237,0.458,6.998,45.8,18.7,2.94,33.4
4,0.06905,0.458,7.147,54.2,18.7,5.33,36.2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   NOX      506 non-null    float64
 2   RM       506 non-null    float64
 3   AGE      506 non-null    float64
 4   PTRATIO  506 non-null    float64
 5   LSTAT    506 non-null    float64
 6   PRICE    506 non-null    float64
dtypes: float64(7)
memory usage: 27.8 KB


In [5]:
## Data quality check: total missing cells and fully duplicated rows
na_count = bos_df.isna().sum().sum()
duplicate_count = bos_df.duplicated().sum()

print(f"Number of NAs: {na_count}")
print(f"Number of Duplicates: {duplicate_count}")

Number of NAs: 0
Number of Duplicates: 0


In [6]:
## Pairwise correlations, computed once and reused below.
## NOTE: despite its name, corr_mask holds the full correlation matrix (the
## name is kept so any cell that refers to it keeps working).
corr_mask = bos_df.corr()
price_corr = corr_mask['PRICE']

## Features whose correlation with PRICE is at least .5 in magnitude:
## rooms (RM), pupil-teacher ratio (PTRATIO) and % lower economic status (LSTAT).
## PRICE itself also appears, with a correlation of 1.
price_corr[price_corr.abs() >= .5].round(2)

RM         0.70
PTRATIO   -0.51
LSTAT     -0.74
PRICE      1.00
Name: PRICE, dtype: float64

## Train Test Split

In [7]:
# Target is PRICE; predictors are the three columns selected for the model
target_col = 'PRICE'
feature_cols = ['RM', 'PTRATIO', 'LSTAT']

X = bos_df[feature_cols]
y = bos_df[target_col]

# No test_size is passed, so the library default split is used; the fixed
# seed makes the split repeatable across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## Run Scaler

In [8]:
# Instantiate and Fit StandardScaler to training set
# The scaler is fitted on X_train only; the test split is not used here.

scaler = StandardScaler()
scaler.fit(X_train)

In [9]:
# Scale both splits with the scaler fitted on the training set, then wrap the
# results in DataFrames again.
scaled_train = scaler.transform(X_train)
scaled_test = scaler.transform(X_test)

# Take the column labels from the unscaled frame instead of repeating them by
# hand, so this cell cannot drift from the feature selection above.
cols = tuple(X_train.columns)

# Reuse the original row index: without it the frames get a fresh 0..n-1
# index and no longer line up by label with y_train / y_test.
X_train_output = pd.DataFrame(scaled_train, columns=cols, index=X_train.index)
X_test_output = pd.DataFrame(scaled_test, columns=cols, index=X_test.index)

## Regression Model

In [10]:
# Instantiate Linear Regression as reg (no arguments passed, so library defaults apply)
reg = LinearRegression()

In [11]:
# Fit the linear regression model to the scaled training features and training target
reg.fit(X_train_output, y_train)

In [12]:
# Predict PRICE for the scaled training and test features with the fitted model
train_preds, test_preds = (
    reg.predict(features) for features in (X_train_output, X_test_output)
)

In [13]:
# R^2 for each split, rounded to two decimals
r2_train, r2_test = (
    round(r2_score(actual, predicted), 2)
    for actual, predicted in ((y_train, train_preds), (y_test, test_preds))
)

print(f'r2 score for the training set is: {r2_train}')
print(f'r2 score for the test set is: {r2_test}')

r2 score for the training set is: 0.69
r2 score for the test set is: 0.63


In [14]:
# Mean absolute error for each split, rounded to two decimals
mae_train, mae_test = (
    round(mean_absolute_error(actual, predicted), 2)
    for actual, predicted in ((y_train, train_preds), (y_test, test_preds))
)

print(f'MAE score for the training set is: {mae_train}')
print(f'MAE score for the test set is: {mae_test}')

MAE score for the training set is: 3.71
MAE score for the test set is: 3.39


In [15]:
# Mean squared error for each split, rounded to two decimals
mse_train, mse_test = (
    round(mean_squared_error(actual, predicted), 2)
    for actual, predicted in ((y_train, train_preds), (y_test, test_preds))
)

print(f'MSE score for the training set is: {mse_train}')
print(f'MSE score for the test set is: {mse_test}')

MSE score for the training set is: 27.69
MSE score for the test set is: 25.69


In [16]:
# RMSE: square root of the (unrounded) mean squared error, rounded to two decimals
rmse_train, rmse_test = (
    round(np.sqrt(mean_squared_error(actual, predicted)), 2)
    for actual, predicted in ((y_train, train_preds), (y_test, test_preds))
)

print(f'RMSE score for the training set is: {rmse_train}')
print(f'RMSE score for the test set is: {rmse_test}')

RMSE score for the training set is: 5.26
RMSE score for the test set is: 5.07


In [29]:
# Chain scaling and regression in one estimator, fitted on the unscaled
# training split (the pipeline applies the scaler itself)
pipe = make_pipeline(
    StandardScaler(),
    LinearRegression(),
)
pipe.fit(X_train, y_train)

In [31]:
def evaluate_model(y_true, y_pred):
  """Print MAE, MSE, RMSE and R2 for a set of predictions.

  Args:
    y_true: observed target values.
    y_pred: predicted target values, in the same order as y_true.

  Returns:
    None. The four metrics are written to stdout, one per line, so call this
    function directly rather than wrapping it in print().
  """
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  # Reuse the MSE computed above instead of scoring the predictions again
  rmse = np.sqrt(mse)
  r2 = r2_score(y_true, y_pred)

  # Adjacent string literals are joined at compile time; unlike a backslash
  # continuation inside the literal, this keeps the source indentation out
  # of the printed text.
  print(f'MAE: {mae:,.2f} \nMSE: {mse:,.2f} \nRMSE: {rmse:,.2f} '
        f'\nR2: {r2:.2f}')

# evaluate_model prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
print("Training Model Scores")
evaluate_model(y_train, pipe.predict(X_train))
print("\nTest Model Scores")
evaluate_model(y_test, pipe.predict(X_test))

Training Model Scores
MAE: 3.71 
MSE: 27.69 
RMSE: 5.26   
R2: 0.69
None

Test Model Scores
MAE: 3.39 
MSE: 25.69 
RMSE: 5.07   
R2: 0.63
