## **Performance for Regression-Based Models**

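We evaluate both models on the test set with these metrics:

* MSE (mean squared error)
* RMSE (root mean squared error)
* MAE (mean absolute error)
* MAPE (mean absolute percentage error)
* R2 (coefficient of determination)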

In [1]:
import pandas as pd
import numpy as np

## Read the used-car sales CSV from its GitHub URL and preview the first rows
DATA_URL = 'https://raw.githubusercontent.com/martinwg/ISA591/refs/heads/main/data/used_cars_sales_clean.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,condition,odometer,mmr,sellingprice,make_BMW,make_Mercedes-Benz,make_Rolls-Royce,make_Infiniti,make_Ram,body_Sedan,...,interior_red,interior_silver,interior_tan,interior_unknown,interior_white,interior_yellow,interior_—,carage_5 to 10 years,carage_less than 5 years,carage_older than 15 years
0,2.0,158959.0,4850,4500,0,0,0,0,0,1,...,False,False,False,False,False,False,True,False,False,False
1,2.1,124858.0,4825,2800,0,0,0,0,0,1,...,False,False,False,False,False,False,False,True,False,False
2,4.3,23771.0,27600,28000,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
3,4.0,24304.0,21100,21750,1,0,0,0,0,1,...,False,False,False,False,False,False,False,False,True,False
4,2.0,93837.0,6925,3100,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False


In [2]:
## Target vector y is the selling price; feature matrix X is every other column
y = df['sellingprice']
X = df.drop(columns='sellingprice')

### 1) Split the data. Use seed (random_state = 13). 90% for training and 10% test.

In [3]:
from sklearn.model_selection import train_test_split

## Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable
TEST_FRACTION = 0.1
SPLIT_SEED = 13
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [4]:
## sanity check: shapes of the training features and training target
X_train.shape, y_train.shape

((78740, 59), (78740,))

In [5]:
## sanity check: shapes of the test features and test target
X_test.shape, y_test.shape

((8749, 59), (8749,))

### **Fit two models**

* Linear Regression Model
* Random Forest (50 trees)

In [7]:
## scikit-learn groups its models into modules;
## linear models live in sklearn.linear_model
from sklearn.linear_model import LinearRegression

## create the estimator
## (used as a predictive model here, so goodness-of-fit metrics are ignored)
lr = LinearRegression()

## learn the coefficients from the training data set only
lr.fit(X=X_train, y=y_train)

In [8]:
## Random Forest is an ensemble of decision trees
from sklearn.ensemble import RandomForestRegressor

## instance
## random_state fixes the randomness used while building the forest, so a
## Restart & Run All reproduces the same model and the same test metrics
## n_jobs = -1 builds the trees in parallel on all available CPU cores
rf = RandomForestRegressor(n_estimators = 50, n_jobs = -1, random_state = 13)

## fit on training data set
rf.fit(X_train, y_train)

In [9]:
## Performance (Test): score the held-out rows with both fitted models
y_pred_lr, y_pred_rf = lr.predict(X_test), rf.predict(X_test)

In [10]:
## first 20 test-set predictions from the linear regression model
y_pred_lr[:20]

array([17165.93004568,  9144.29467806,  6032.5662268 ,  9502.56225397,
        7965.00059296,  6145.19641185, 18520.02253964, 22295.76440848,
       -1098.07315615, 34417.60863451, 14090.19672795, 15488.35269647,
        4631.50555078, 14092.09923863, 10912.66407116, 16665.05124332,
        1898.52744499, 14799.68491595, 10171.10893425, 12112.18888115])

In [11]:
## first 20 test-set predictions from the random forest model
y_pred_rf[:20]

array([17320.,  8860.,  5486.,  9252.,  8416.,  5892., 18771., 22820.,
         382., 33736., 14256., 15928.,  5000., 14700., 11158., 17961.,
        2453., 14228.,  9907., 12520.])

In [13]:
## the true selling prices for the same first 20 test rows
y_test.to_numpy()[:20]

array([17600,  9300,  6200,  9400,  9000,  6500, 18500, 24250,   225,
       35700, 13400, 15100,  3400, 13600, 11300, 18000,  2700, 14500,
       10400, 12600])

In [14]:
## metrics: MSE, RMSE, MAE, MAPE, R2
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

In [20]:
## Mean squared error on the test set, one line per model
mse_lr = mean_squared_error(y_test, y_pred_lr)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f' The MSE for LR is:  {mse_lr}' )
print(f' The MSE for RF is:  {mse_rf} ')

 The MSE for LR is:  2667182.824527917
 The MSE for RF is:  3286644.610866522 


In [21]:
## Root mean squared error on the test set, one line per model
rmse_lr = root_mean_squared_error(y_test, y_pred_lr)
rmse_rf = root_mean_squared_error(y_test, y_pred_rf)
print(f' The RMSE for LR is:  {rmse_lr}' )
print(f' The RMSE for RF is:  {rmse_rf} ')

 The RMSE for LR is:  1633.1511946319963
 The RMSE for RF is:  1812.910535814308 


In [22]:
## Mean absolute error on the test set, one line per model
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f' The MAE for LR is:  {mae_lr}' )
print(f' The MAE for RF is:  {mae_rf} ')

 The MAE for LR is:  1005.5460908754072
 The MAE for RF is:  1029.7004640530345 


In [23]:
## Mean absolute percentage error on the test set, one line per model
## (the values printed are fractions, e.g. 0.15 rather than 15)
mape_lr = mean_absolute_percentage_error(y_test, y_pred_lr)
mape_rf = mean_absolute_percentage_error(y_test, y_pred_rf)
print(f' The MAPE for LR is:  {mape_lr}' )
print(f' The MAPE for RF is:  {mape_rf} ')

 The MAPE for LR is:  0.15911276103130767
 The MAPE for RF is:  0.14577506165879614 


In [24]:
## R2 score on the test set, one line per model
r2_lr = r2_score(y_test, y_pred_lr)
r2_rf = r2_score(y_test, y_pred_rf)
print(f' The R2 for LR is:  {r2_lr}' )
print(f' The R2 for RF is:  {r2_rf} ')

 The R2 for LR is:  0.9713397484341648
 The R2 for RF is:  0.9646833128615383 


In [25]:
## Estimates of the slopes (coefficients) of the fitted linear regression
## presumably one value per column of X_train, in column order — compare with X_train.columns
lr.coef_

array([ 9.53211904e+02,  2.42765036e-03,  9.63954657e-01,  1.10467981e+02,
       -2.50101608e+01,  1.29462284e+03,  1.59364207e+02,  1.44618586e+02,
       -1.22189231e+01,  2.06493289e+02,  6.38194847e+01, -3.72419459e+02,
       -1.09002131e+02,  1.68234171e+02, -4.50466614e+02,  2.17217465e+02,
       -7.20786515e+01, -5.12931246e+02, -1.85528994e+02,  1.20109749e+02,
        1.18072066e+01, -5.99867726e+01,  7.17334712e+01, -2.66328586e+02,
        8.89021125e+01,  8.10914724e+01,  6.24395288e+01,  3.12113708e+01,
        3.43955227e+02,  1.55280087e+01,  1.10915537e+02,  1.55129465e+02,
        1.33016758e+02,  5.31345787e+01,  4.07312222e+02, -2.02297133e+02,
        2.12698767e+02,  1.54013653e+02,  8.14179501e+01, -2.71373982e+01,
       -7.10923489e+01,  8.54937846e+01,  3.40301912e+02, -9.37126508e+01,
       -6.23905278e+01,  2.53701419e+02,  1.55225375e+02,  7.61070129e+02,
        1.86714405e+02,  5.34621876e+02,  7.22448235e+01, -9.47191846e+00,
       -2.02297133e+02,  

In [26]:
## feature names of the training matrix, for matching against lr.coef_
X_train.columns

Index(['condition', 'odometer', 'mmr', 'make_BMW', 'make_Mercedes-Benz',
       'make_Rolls-Royce', 'make_Infiniti', 'make_Ram', 'body_Sedan',
       'body_Crew Cab', 'body_unknown', 'body_SuperCrew', 'body_Hatchback',
       'state_nc', 'state_md', 'state_tn', 'state_va', 'state_ma',
       'transmission_manual', 'color_black', 'color_blue', 'color_brown',
       'color_burgundy', 'color_charcoal', 'color_gold', 'color_gray',
       'color_green', 'color_lime', 'color_off-white', 'color_orange',
       'color_pink', 'color_purple', 'color_red', 'color_silver',
       'color_turquoise', 'color_unknown', 'color_white', 'color_yellow',
       'color_—', 'interior_black', 'interior_blue', 'interior_brown',
       'interior_burgundy', 'interior_gold', 'interior_gray', 'interior_green',
       'interior_off-white', 'interior_orange', 'interior_purple',
       'interior_red', 'interior_silver', 'interior_tan', 'interior_unknown',
       'interior_white', 'interior_yellow', 'interior_—',
    

In [28]:
## MSE by hand: the average of the squared residuals
## (squaring makes MSE very susceptible to outliers)
squared_errors = (y_test - y_pred_lr) ** 2
squared_errors.mean()

2667182.824527917

In [30]:
## MAE is less susceptible to outliers than MSE, but it is still a mean:
## e.g. treating these five numbers as absolute errors, the single 50000 pulls the average up to 11200
np.mean([1000, 2000, 500, 2500, 50000])

11200.0

In [32]:
## median absolute error: the middle value of the absolute residuals
abs_errors = (y_test - y_pred_lr).abs()
abs_errors.median()

694.0506879358945

## **PERFORMANCE METRICS FOR CLASSIFICATION**

### **Confusion Matrix**

A summary of the predictions compared to the actuals. Actuals are in the rows and the predictions are in the columns (generally).

In [33]:
## Toy binary example: actual labels (y_true) and model propensities (p_hat)
import numpy as np

## three actual 0s followed by five actual 1s
y_true = np.array([0] * 3 + [1] * 5)

## models output a propensity for each case
p_hat = np.array([0.45, 0.52, 0.21, 0.94, 0.10, 0.85, 0.15, 0.49])

In [34]:
## Turn propensities into class predictions with a cutoff:
## .predict() looks at p_hat and selects 1 if p_hat > 0.5, otherwise 0
## (0.5 is only the default and may not be the best threshold)
cutoff = 0.5
y_pred = (p_hat > cutoff).astype(int)
y_pred

array([0, 1, 0, 1, 0, 1, 0, 0])

In [35]:
## show actuals (first line) and predictions (second line) one above the other
print(y_true, y_pred, sep='\n')

[0 0 0 1 1 1 1 1]
[0 1 0 1 0 1 0 0]


In [37]:
## Confusion matrix is just a summary of predictions:
## rows are the actual class (0, 1), columns are the predicted class (0, 1)
tn, fp = 2, 1   ## actual 0: predicted 0, predicted 1
fn, tp = 3, 2   ## actual 1: predicted 0, predicted 1
conf_matrix = np.array([[tn, fp],
                        [fn, tp]])
conf_matrix

array([[2, 1],
       [3, 2]])