# Benchmarking

In this notebook we will test some naive methods and use them as benchmarks for the more advanced machine learning algorithms we will use to predict stock prices.

In [None]:
import matplotlib

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from IPython.display import display

from data.get_50_highest_weights import get_sp_50_highest_weights_symbols
from data_preparation.ochlva_data import OCHLVAData
from utils.column_modifiers import target_generator
from utils.column_modifiers import keep_columns
from scorers.scorers import normalized_root_mean_square_error
from estimators.predictions import calculate_rolling_prediction
from estimators import latest_day

In [None]:
# Select the 'nbAgg' backend here, before matplotlib.pyplot is imported in the next cell
matplotlib.use('nbAgg')

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Load the SP500 data
# NOTE(review): no explicit load call is made here, so the loading presumably happens
# inside the OCHLVAData constructor -- confirm against data_preparation.ochlva_data
ochlva_data = OCHLVAData()

In [None]:
# Fetch the candidate symbols
# NOTE(review): judging by the helper's name these are the 50 highest-weighted S&P symbols -- confirm
symbols = get_sp_50_highest_weights_symbols()

# Pick the first, the middle and the last symbol
# (i.e. high, medium and low weight, assuming the symbols are ordered by descending weight -- TODO confirm)
selected_symbols = tuple(symbols.iloc[position] for position in (0, len(symbols)//2, -1))

# Load the data of each selected symbol into ochlva_data
for s in selected_symbols:
    ochlva_data.load_data(s)

In [None]:
# Plot the 'Adj. Close' column of the loaded data
# NOTE(review): presumably one curve per loaded symbol -- confirm against OCHLVAData.plot
ax = ochlva_data.plot(['Adj. Close'])
plt.show()

In [None]:
# Keep only 'Adj. Close' column
# NOTE(review): copy=False presumably makes the transform operate on the stored data
# rather than on a copy -- confirm against OCHLVAData.transform
ochlva_data.transform(keep_columns, ['Adj. Close'], copy=False)

In [None]:
# Create target values for the data
# The targets are columns shifted 7, 14 and 28 days with respect to 'Adj. Close'
# NOTE: `days` is reused in the benchmark loop further down, where len(days) determines
# how many trailing columns are treated as targets
days = [7, 14, 28]
ochlva_data.transform(target_generator, 'Adj. Close', days, copy=False)

In [None]:
# Make the latest day regressor
# Only one instance is needed: the same `reg` is passed to
# calculate_rolling_prediction for every symbol in the benchmark loop
reg = latest_day.LatestDay()

In [None]:
# Benchmark the latest-day regressor on every symbol held in ochlva_data

for key in ochlva_data.transformed_data.keys():
    print(f'Processing {key}')

    # Split the columns into features and targets
    # NOTE: There are multiple targets, one per entry in `days`; they are taken to be the trailing columns
    symbol_frame = ochlva_data.transformed_data[key]
    n_targets = len(days)
    x = symbol_frame.loc[:, symbol_frame.columns[:-n_targets]]
    y = symbol_frame.loc[:, symbol_frame.columns[-n_targets:]]

    for caption, frame in (('Head of features', x), ('Head of targets', y)):
        print(caption)
        display(frame.head())

    # NOTE: sklearn.model_selection.TimeSeriesSplit could also be used for splitting the data
    # However, as no cross-validation is done here, an unshuffled train_test_split is more convenient
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, shuffle=False)

    print(f'Train shape: {x_train.shape}')
    print(f'Test shape: {x_test.shape}')

    # Plot the train and test features in the same figure
    ax = x_train.plot()
    x_test.plot(ax=ax)
    ax.legend([f'{key} Train', f'{key} Test'])
    ax.grid()
    ax.set_ylabel('USD')
    plt.show()

    # Make predictions
    y_pred = calculate_rolling_prediction(reg, x_train, x_test, y_train, y_test)

    # Plot the first and the last target column in separate figures in order not to clutter the plots
    for column_position in (0, -1):
        ax = y_test.loc[:, [y_test.columns[column_position]]].plot()
        y_pred.loc[:, [y_pred.columns[column_position]]].plot(ax=ax)
        ax.grid()
        ax.set_ylabel('USD')
        plt.show()

    # Calculate the normalized root mean squared error of the predictions
    # NOTE(review): y_test and y_pred are compared as-is, without any index offset -- this assumes
    # calculate_rolling_prediction returns predictions aligned with y_test (TODO confirm)
    nrmse = normalized_root_mean_square_error(y_test, y_pred)

    print(f'Normalized root mean squared error (averaged for the three predictions): {nrmse}')

    print('-'*80)
    print('\n'*5)