In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error

#For inline plotting 
%matplotlib inline                 
%config InlineBackend.figure_format = 'svg'

plt.style.use("seaborn-v0_8-dark")     

In [3]:
# Download the data for the stock prices 

!wget -nc https://lazyprogrammer.me/course_files/SPY.csv

--2024-04-03 17:51:10--  https://lazyprogrammer.me/course_files/SPY.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 104.21.23.210, 172.67.213.166
Connecting to lazyprogrammer.me (lazyprogrammer.me)|104.21.23.210|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 251935 (246K) [text/csv]
Saving to: 'SPY.csv'


2024-04-03 17:51:12 (2.93 MB/s) - 'SPY.csv' saved [251935/251935]



In [5]:
df = pd.read_csv('SPY.csv', index_col=0, parse_dates=True)

In [6]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,112.370003,113.389999,111.510002,113.330002,92.246048,118944600
2010-01-05,113.260002,113.68,112.849998,113.629997,92.490204,111579900
2010-01-06,113.519997,113.989998,113.43,113.709999,92.555328,116074400
2010-01-07,113.5,114.330002,113.18,114.190002,92.94606,131091100
2010-01-08,113.889999,114.620003,113.660004,114.57,93.255348,126402800


In [7]:
# Predictions according to the naive forecast

df['Close_Predict'] = df['Close'].shift(1)

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Close_Predict
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,112.370003,113.389999,111.510002,113.330002,92.246048,118944600,
2010-01-05,113.260002,113.68,112.849998,113.629997,92.490204,111579900,113.330002
2010-01-06,113.519997,113.989998,113.43,113.709999,92.555328,116074400,113.629997
2010-01-07,113.5,114.330002,113.18,114.190002,92.94606,131091100,113.709999
2010-01-08,113.889999,114.620003,113.660004,114.57,93.255348,126402800,114.190002


In [12]:
y_true, y_pred = df.Close[1:], df.Close_Predict[1:]

### Metrics
-----------

Main idea: to get a feeling for how the different evaluation metrics relate to one another. What is "good"? What is "bad"? If the $R^2$ is "good" will the MAE also be "good"?

In [13]:
# Sum squared error
(y_true-y_pred).dot(y_true - y_pred)

6330.3742894926045

In [14]:
# Mean squared error (MSE)
mean_squared_error(y_true, y_pred)

2.798573956451196

In [16]:
# MSE from scratch 
(y_true-y_pred).dot(y_true - y_pred)/len(y_true)

2.7985739564511958

In [17]:
# RMSE
mean_squared_error(y_true, y_pred, squared = False)

1.672893886787562

In [18]:
# RMSE from scratch
np.sqrt((y_true-y_pred).dot(y_true - y_pred)/len(y_true))

1.6728938867875618

In [19]:
# MAE
mean_absolute_error(y_true,y_pred)

1.1457559803120336

In [20]:
# R^2
# Seems too good to be true eh?  

r2_score(y_pred,y_true)

0.998960562204967

In [21]:
# MAPE 

mean_absolute_percentage_error(y_pred,y_true)

0.0064896321490491805

#### sMape from scratch: 

$$\textrm{sMAPE} = \frac{1}{N} \sum_{i=1}^{N} \frac{|y_i - \hat{y}_i|}{\left(|y_i| + |\hat{y}_i|\right)/2} $$

In [24]:
def smape(y_t, y_p):
    num = np.abs(y_t-y_p)
    denom = 0.5 * (np.abs(y_t) + np.abs(y_p))

    ratio = num/denom

    return ratio.mean()

In [23]:
smape(y_true,y_pred)

0.006491365814068417