<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/week06/regression_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Imports
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Get the data
california = fetch_california_housing(as_frame = True)

# Make a DataFrame
df = california.frame

# Rescale median value since it's in 100,000s
df['MedHouseVal'] = df['MedHouseVal'] * 100000

df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,452600.0
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,358500.0
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,352100.0
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,341300.0
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,342200.0


In [5]:
# Check for duplicates and missing values
print('Duplicates:', df.duplicated().sum())
print('Missing Values:', df.isna().sum().sum())

Duplicates: 0
Missing Values: 0


In [6]:
# Check the datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1   HouseAge     20640 non-null  float64
 2   AveRooms     20640 non-null  float64
 3   AveBedrms    20640 non-null  float64
 4   Population   20640 non-null  float64
 5   AveOccup     20640 non-null  float64
 6   Latitude     20640 non-null  float64
 7   Longitude    20640 non-null  float64
 8   MedHouseVal  20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [7]:
# Check for outliers
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,206855.816909
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,115395.615874
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,14999.0
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,119600.0
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,179700.0
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,264725.0
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,500001.0


In [8]:
# Set the features and target
X = df.drop(columns = 'MedHouseVal')
y = df['MedHouseVal']

# Validation split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [9]:
# Instantiate the scaler and model
scaler = StandardScaler()
lin_reg = LinearRegression()

# Construct the pipeline
lin_reg_pipe = make_pipeline(scaler, lin_reg)

# Fit the pipeline
lin_reg_pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

In [10]:
# Create model predictions
train_pred = lin_reg_pipe.predict(X_train)
test_pred = lin_reg_pipe.predict(X_test)

In [11]:
# Calculate the MAE
train_MAE = np.mean(np.abs(train_pred - y_train))
test_MAE = np.mean(np.abs(test_pred - y_test))

print(train_MAE, test_MAE)

52951.54304301853 52969.64012919461


In [12]:
# Use scikit to compute the MAE
train_MAE = mean_absolute_error(y_train, train_pred)
test_MAE = mean_absolute_error(y_test, test_pred)

print(train_MAE, test_MAE)

52951.54304301853 52969.64012919461


In [13]:
# Calculate the MSE
train_MSE = np.mean((train_pred - y_train)**2)
test_MSE = np.mean((test_pred - y_test)**2)

print(train_MSE, test_MSE)

5205522163.645129 5411287478.470688


In [14]:
# Use scikitlearn to find the MSE
train_MSE = mean_squared_error(y_train, train_pred)
test_MSE = mean_squared_error(y_test, test_pred)

print(train_MSE, test_MSE)

5205522163.645129 5411287478.470688


In [15]:
# Calculate the RMSE
train_RMSE = np.sqrt(np.mean((train_pred - y_train)**2))
test_RMSE = np.sqrt(np.mean((test_pred - y_test)**2))

print(train_RMSE, test_RMSE)

72149.30466501482 73561.45375446769


In [16]:
# Use scikitlearn to find the RMSE
train_RMSE = np.sqrt(mean_squared_error(y_train, train_pred))
test_RMSE = np.sqrt(mean_squared_error(y_test, test_pred))

print(train_RMSE, test_RMSE)

72149.30466501482 73561.45375446769


In [17]:
# Get R2 manually
train_r2 = np.corrcoef(y_train, train_pred)[0][1]**2
test_r2 = np.corrcoef(y_test, test_pred)[0][1]**2

print(train_r2, test_r2)

0.6098730310529247 0.5914748282366336


In [18]:
# Use scikitlearn to get R2
train_r2 = r2_score(y_train, train_pred)
test_r2 = r2_score(y_test, test_pred)

print(train_r2, test_r2)

0.609873031052925 0.5910509795491351
