### Import Packages & Data

In [15]:
import math
import warnings

import numpy as np
import pandas as pd
import sklearn as sk
import sklearn.metrics  # explicit: a bare `import sklearn` does not guarantee that `sk.metrics` is loaded
import sklearn.model_selection  # explicit: same reason, for `sk.model_selection`
warnings.filterwarnings("ignore")

In [2]:
# Read the stock price dataset; the path is relative to the notebook's working directory.
STOCK_DATA_CSV = 'Chapter2_StockPriceData.csv'
stockData = pd.read_csv(STOCK_DATA_CSV)

In [3]:
# Signed forecast error per row (forecast minus actual): a positive value means the forecast was too high.
forecast_minus_actual = stockData['Forecasted'].sub(stockData['StockPrice'])
stockData['Error'] = forecast_minus_actual
stockData

Unnamed: 0,Period,StockPrice,Forecasted,Error
0,January,35,30,-5
1,February,35,31,-4
2,March,10,30,20
3,April,5,10,5
4,May,8,12,4
5,June,10,17,7
6,July,15,18,3
7,August,20,27,7
8,September,23,29,6
9,October,21,24,3


### Explore Evaluation Metrics

In [39]:
# Mean Squared Error: the average of the squared differences between actual and forecasted prices.
MSE = sk.metrics.mean_squared_error(
    y_true=stockData['StockPrice'],
    y_pred=stockData['Forecasted'],
)
print('MSE: ', round(MSE, 1))

MSE:  53.7


In [40]:
# Root Mean Squared Error: the square root of the MSE.
# The MSE is recomputed here so this cell does not depend on the MSE cell having been run.
mse_for_rmse = sk.metrics.mean_squared_error(
    y_true=stockData['StockPrice'],
    y_pred=stockData['Forecasted'],
)
RMSE = math.sqrt(mse_for_rmse)
print('RMSE: ', round(RMSE, 1))

RMSE:  7.3


In [41]:
# Mean Absolute Error: the average absolute difference between actual and forecasted prices.
MAE = sk.metrics.mean_absolute_error(
    y_true=stockData['StockPrice'],
    y_pred=stockData['Forecasted'],
)
print('MAE: ', round(MAE, 1))

MAE:  5.7


In [36]:
#MAPE
# Mean Absolute Percentage Error: the absolute error of each row expressed as a fraction of the
# actual price, averaged over all rows.
# The average is taken over the ratio series itself, so the divisor is always the number of
# ratios being averaged. (Previously the divisor was the count of truthy 'Period' labels, which
# is unrelated to the error terms and would be wrong if a label were empty.)
# skipna=False keeps a missing price/forecast visible as NaN instead of silently dropping the row.
# NOTE(review): a StockPrice of 0 yields an infinite/undefined ratio for that row — confirm the data has no zeros.
absolute_pct_error = abs(stockData['Forecasted'] - stockData['StockPrice']) / stockData['StockPrice']
MAPE = absolute_pct_error.mean(skipna=False)
print('MAPE: ', round(MAPE, 2))
print('MAPE Performance: ', round((1 - MAPE)*100, 2))

MAPE:  0.46
MAPE Performance:  53.53


In [42]:
# R2 (coefficient of determination) of the forecasts against the actual prices.
R2 = sk.metrics.r2_score(
    y_true=stockData['StockPrice'],
    y_pred=stockData['Forecasted'],
)
print('R2: ', round(R2, 1))

R2:  0.4


### Train-Test Split

In [6]:
#Using MEAN as the model
y = stockData['StockPrice']

# Unshuffled 70/30 split: the test partition is the tail of the series.
train1, test1 = sk.model_selection.train_test_split(y, shuffle=False, test_size=0.3)

# The "model" is a single constant: the average of the training prices.
forecast = train1.mean() #should be 17.25

# Put the constant forecast next to the actual prices in each partition.
train1 = pd.DataFrame(train1).assign(forecast=forecast)
test1 = pd.DataFrame(test1).assign(forecast=forecast)

# Score both partitions with MSE so in-sample and out-of-sample error can be compared.
train1_error = sk.metrics.mean_squared_error(train1['StockPrice'], train1['forecast'])
test1_error = sk.metrics.mean_squared_error(test1['StockPrice'], test1['forecast'])

print('Train Error: ', train1_error, ' vs Test Error: ', test1_error)

Train Error:  122.9375  vs Test Error:  32.4375


### Train-Validation-Test Split

In [11]:
#Comparing MEAN and MEDIAN as models
#Splitting into 70% train, 15% validation and 15% test:
# Step 1 sets aside the trailing 30% as a holdout; step 2 halves the holdout into validation (first) and test (last).
# NOTE(review): random_state should have no effect while shuffle=False — confirm against the scikit-learn docs.
train2, holdout2 = sk.model_selection.train_test_split(y, test_size=0.3, shuffle=False, random_state=12345)
val2, test2 = sk.model_selection.train_test_split(holdout2, test_size=0.5, shuffle=False, random_state=12345)

# Two candidate constant models, both fitted on the training partition only.
forecast_mean = train2.mean() #17.25
forecast_median = train2.median() #12.5

#Compute MSE on validation data for both models:
val2 = pd.DataFrame(val2).assign(
    forecast_mean=forecast_mean,
    forecast_median=forecast_median,
)

val_actual = val2['StockPrice']
mean_val_mse = round(sk.metrics.mean_squared_error(val_actual, val2['forecast_mean']), 2)
median_val_mse = round(sk.metrics.mean_squared_error(val_actual, val2['forecast_median']), 2)

print('Val Error for Mean model: ', mean_val_mse, ' vs Val Error for Median model: ', median_val_mse)
print()
print('Since the Mean model has a lower validation error, we are picking it to compute the test error')

Val Error for Mean model:  23.56  vs Val Error for Median model:  91.25

Since the Mean model has a lower validation error, we are picking it to compute the test error


In [12]:
# Score the mean model on the test partition, which played no part in fitting or model selection.
test2 = pd.DataFrame(test2).assign(forecast_mean=forecast_mean)
mean_test_mse = sk.metrics.mean_squared_error(
    y_true=test2['StockPrice'],
    y_pred=test2['forecast_mean'],
)
print('Test Error for Mean model: ', mean_test_mse)
print()
print('Test error does not validate our validation split findings since it is double the validation error; such incongruence highlights possible bias in our evaluation.')
print('In this case, it is related to the fact that there are too few data points and the model itself is just the average of all values.')

Test Error for Mean model:  41.3125

Test error does not validate our validation split findings since it is double the validation error; such incongruence highlights possible bias in our evaluation.
In this case, it is related to the fact that there are too few data points and the model itself is just the average of all values.


### Cross-Validation: K-Fold Cross-Validation

In [18]:
# K-Fold cross-validation of the mean model: each of the 5 folds is held out once while the
# mean of the remaining rows serves as the forecast for the held-out rows.
kf = sk.model_selection.KFold(n_splits=5)

errors = []
for train_index1, test_index1 in kf.split(stockData):
    train3 = stockData.iloc[train_index1,:]

    pred3 = train3['StockPrice'].mean()
    # Build the held-out frame with .assign so the forecast column lands on a fresh DataFrame;
    # assigning a column onto an .iloc slice of stockData triggers SettingWithCopyWarning.
    test3 = stockData.iloc[test_index1,:].assign(forecast=pred3)
    error = sk.metrics.mean_squared_error(test3['StockPrice'], test3['forecast'])
    errors.append(error)

# Report the MSE averaged over the folds.
print('K-Fold Cross-Validation Error for Mean model: ', round(np.mean(errors), 1)) #This method should not be used for time-series.

K-Fold Cross-Validation Error for Mean model:  106.1


### Cross-Validation: Time Series Cross-Validation

In [31]:
# Time-series cross-validation of the mean model: TimeSeriesSplit only ever trains on rows
# that precede the test rows, so no future information is used for a forecast.
tscv = sk.model_selection.TimeSeriesSplit(n_splits=5)

errors_ts = []
for train_index2, test_index2 in tscv.split(stockData):
    train4 = stockData.iloc[train_index2,:]

    pred4 = train4['StockPrice'].mean()
    # Build the test frame with .assign so the forecast column lands on a fresh DataFrame;
    # assigning a column onto an .iloc slice of stockData triggers SettingWithCopyWarning.
    test4 = stockData.iloc[test_index2,:].assign(forecast=pred4)
    error = sk.metrics.mean_squared_error(test4['StockPrice'], test4['forecast'])
    errors_ts.append(error)

# Report the MSE averaged over the splits.
print('Time-Series Cross-Validation Error for Mean model: ', round(np.mean(errors_ts), 2))

Time-Series Cross-Validation Error for Mean model:  194.77


### Cross-Validation: Rolling Time Series Cross-Validation

In [35]:
# Rolling time-series cross-validation of the mean model: same forward-only splitting as
# TimeSeriesSplit, but max_train_size=2 caps every training window at 2 rows.
tscv_rl = sk.model_selection.TimeSeriesSplit(n_splits=5, max_train_size=2)

errors_rl = []
for train_index3, test_index3 in tscv_rl.split(stockData):
    train5 = stockData.iloc[train_index3,:]

    pred5 = train5['StockPrice'].mean()
    # Build the test frame with .assign so the forecast column lands on a fresh DataFrame;
    # assigning a column onto an .iloc slice of stockData triggers SettingWithCopyWarning.
    test5 = stockData.iloc[test_index3,:].assign(forecast=pred5)
    error = sk.metrics.mean_squared_error(test5['StockPrice'], test5['forecast'])
    errors_rl.append(error)

# Report the MSE averaged over the splits.
print('Rolling Cross-Validation Error for Mean model: ', round(np.mean(errors_rl), 1))

Rolling Cross-Validation Error for Mean model:  174.0
