# Ensemble model

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
# NOTE(review): the path inserted below is an absolute, machine-specific location.
# It is presumably what lets the `src` package imports that follow resolve, so this
# cell is likely to fail on any other machine or checkout location — consider
# deriving it from the notebook's own location or installing `src` as a package.
import sys  
sys.path.insert(1, '/Users/simon/Documents/II/Dissertation/')
# Project-local helpers: experiment loading, metric computation and plotting.
from src.evaluate import get_prediction_dfs_from_experiment, get_all_metrics, visualise
from src.misc import load_processed_dataset


%load_ext autoreload
%autoreload 2

Original (single-model) results, aggregated by stock and by model type

In [61]:
# Load the per-experiment results table and average its test-set metrics,
# once per stock and once per model type.
path = "./results.csv"
if not os.path.exists(path):
    # Fail loudly. Without this guard a missing file would surface as a NameError
    # on a fresh kernel, or worse, silently reuse whatever object another cell
    # last bound to `dfs`.
    raise FileNotFoundError(f"Results file not found: {path}")

# Two header rows are read as MultiIndex columns; the first CSV column becomes the index.
dfs = pd.read_csv(path, header=[0, 1], index_col=0)

# The index labels are split on "_": the first piece is taken as the model type
# and the second as the stock ticker (matches the "<model>_<stock>" experiment names).
name_parts = dfs.index.str.split("_")
dfs["Model Type"] = name_parts.str[0]
dfs["Stock"] = name_parts.str[1]
orig = dfs.copy()

# Mean of every remaining column per stock, restricted to the "Test set" columns
# and ordered explicitly so the table rows always appear in the same order.
orig_by_stock = (
    orig.drop(columns=["Hyperparameters", "Model Type"])
    .groupby("Stock")
    .mean()
    .loc[["NVDA", "JPM", "HD", "UNH"]]["Test set"]
)

# Same aggregation, grouped by model type instead of stock.
orig_by_model = (
    orig.drop(columns=["Hyperparameters", "Stock"])
    .groupby("Model Type")
    .mean()
    .loc[["Linear", "ARIMA", "RandomForest", "CNN", "LSTM", "ConvLSTM"]]["Test set"]
)
orig_by_model

  orig_by_stock = orig.drop(columns=["Hyperparameters", "Model Type"]).groupby("Stock").mean().loc[["NVDA", "JPM", "HD", "UNH"]]["Test set"]
  orig_by_model = orig.drop(columns=["Hyperparameters", "Stock"]).groupby("Model Type").mean().loc[["Linear", "ARIMA", "RandomForest", "CNN", "LSTM", "ConvLSTM"]]["Test set"]


Unnamed: 0_level_0,R2,MSE,RMSE,MAE,p,Accuracy,Avg. daily return,Std. daily return,Risk adj. return
Model Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Linear,-0.01909141,0.00035809,0.01752181,0.01252915,0.02385229,49.1,0.00090831,0.01171694,0.05495246
ARIMA,-1.01053333,0.00071815,0.02469201,0.01912823,-0.00055633,48.8,0.00058564,0.01129133,0.04494016
RandomForest,-0.10110938,0.00042627,0.01860393,0.01317114,-0.00732916,50.3,0.00086942,0.0108223,0.07098626
CNN,-0.16178257,0.00038139,0.01834945,0.01319566,-0.04282749,50.2,0.00119086,0.0150357,0.03647862
LSTM,-0.00462845,0.00035063,0.01737026,0.01233547,0.02478478,52.7,0.00125459,0.01608904,0.04510046
ConvLSTM,-0.00106495,0.00034991,0.01734557,0.01233434,0.00839879,55.0,0.00175389,0.01711801,0.08239443


Ensemble of learners with above-chance accuracy (mean test-set accuracy above 50%)

In [60]:
# Ensemble: average the test-set frames of the selected models per (Date, Stock),
# then score the averaged predictions against the actuals for each stock.
models = ["RandomForest", "CNN", "LSTM", "ConvLSTM"]
stocks = ["NVDA", "JPM", "HD", "UNH"]

# One test-set frame per "<model>_<stock>" experiment, tagged with its stock so
# that frames from different models can be grouped on (Date, Stock) below.
test_frames = []
for m in models:
    for s in stocks:
        # Only the test-set frame is needed; the validation frame and the
        # hyperparameters returned alongside it are discarded.
        _, test_df, _ = get_prediction_dfs_from_experiment(experiment_name=f"{m}_{s}")
        # `assign` returns a new frame, leaving the helper's returned object untouched.
        test_frames.append(test_df.assign(Stock=s))

# Rows sharing a (Date, Stock) key come from different models, so the mean is the
# ensemble value. NOTE(review): assumes every model covers the same test dates
# for a stock; otherwise some dates average fewer models — confirm.
dfs = pd.concat(test_frames).groupby(["Date", "Stock"]).mean()

# Score each stock's ensemble predictions separately.
per_stock_metrics = []
for s in stocks:
    stock_df = dfs.xs(s, level="Stock")
    per_stock_metrics.append(get_all_metrics(stock_df["Predictions"], stock_df["Actuals"]))
metrics = pd.DataFrame(per_stock_metrics, index=stocks)

# Headline figure: each metric averaged across the stocks.
metrics.mean()

Loading RandomForest_NVDA.
Rank 1: trial no. 1, value: 50.59760956175299. Run completed at 2024-04-29 17:44:32.475530
Loading RandomForest_JPM.
Rank 1: trial no. 8, value: 53.38645418326693. Run completed at 2024-04-29 17:51:43.110122
Loading RandomForest_HD.
Rank 1: trial no. 6, value: 54.18326693227091. Run completed at 2024-04-29 17:56:56.054294
Loading RandomForest_UNH.
Rank 1: trial no. 3, value: 55.77689243027888. Run completed at 2024-04-29 17:59:37.088842
Loading CNN_NVDA.
Rank 1: trial no. 11, value: 0.5737051963806152. Run completed at 2024-04-29 20:22:15.814904
Loading CNN_JPM.
Rank 1: trial no. 9, value: 0.5737051963806152. Run completed at 2024-04-29 18:34:47.643948
Loading CNN_HD.
Rank 1: trial no. 0, value: 0.5498008131980896. Run completed at 2024-04-29 18:35:17.552823
Loading CNN_UNH.
Rank 1: trial no. 8, value: 0.5816733241081238. Run completed at 2024-04-29 18:44:16.965952
Loading LSTM_NVDA.
Rank 1: trial no. 0, value: 0.5537848472595215. Run completed at 2024-04-29 

R2                  -0.02602749
MSE                  0.00036070
RMSE                 0.01758301
MAE                  0.01253625
p                   -0.03972312
Accuracy            51.50000000
Avg. daily return    0.00091445
Std. daily return    0.01339927
Risk adj. return     0.04897786
dtype: float64

Ensemble of all learners

In [62]:
# Ensemble of every model type: average the test-set frames per (Date, Stock),
# then score the averaged predictions against the actuals for each stock.
models = ["Linear", "ARIMA", "RandomForest", "CNN", "LSTM", "ConvLSTM"]
stocks = ["NVDA", "JPM", "HD", "UNH"]

# One test-set frame per "<model>_<stock>" experiment, tagged with its stock so
# that frames from different models can be grouped on (Date, Stock) below.
test_frames = []
for m in models:
    for s in stocks:
        # Only the test-set frame is needed; the validation frame and the
        # hyperparameters returned alongside it are discarded.
        _, test_df, _ = get_prediction_dfs_from_experiment(experiment_name=f"{m}_{s}")
        # `assign` returns a new frame, leaving the helper's returned object untouched.
        test_frames.append(test_df.assign(Stock=s))

# Rows sharing a (Date, Stock) key come from different models, so the mean is the
# ensemble value. NOTE(review): assumes every model covers the same test dates
# for a stock; otherwise some dates average fewer models — confirm.
dfs = pd.concat(test_frames).groupby(["Date", "Stock"]).mean()

# Score each stock's ensemble predictions separately.
per_stock_metrics = []
for s in stocks:
    stock_df = dfs.xs(s, level="Stock")
    per_stock_metrics.append(get_all_metrics(stock_df["Predictions"], stock_df["Actuals"]))
metrics = pd.DataFrame(per_stock_metrics, index=stocks)

# Headline figure: each metric averaged across the stocks.
metrics.mean()

Loading Linear_NVDA.
Rank 1: trial no. 0, value: 45.0199203187251. Run completed at 2024-04-29 16:52:25.570862
Loading Linear_JPM.
Rank 1: trial no. 0, value: 46.613545816733065. Run completed at 2024-04-29 16:52:27.015066
Loading Linear_HD.
Rank 1: trial no. 0, value: 52.589641434262944. Run completed at 2024-04-29 16:52:28.379346
Loading Linear_UNH.
Rank 1: trial no. 0, value: 47.808764940239044. Run completed at 2024-04-29 16:52:29.689755
Loading ARIMA_NVDA.
Rank 1: trial no. 18, value: 56.97211155378486. Run completed at 2024-04-29 17:11:04.256046
Loading ARIMA_JPM.
Rank 1: trial no. 1, value: 52.98804780876494. Run completed at 2024-04-29 17:15:08.552541
Loading ARIMA_HD.
Rank 1: trial no. 4, value: 52.98804780876494. Run completed at 2024-04-29 17:27:20.232495
Loading ARIMA_UNH.
Rank 1: trial no. 1, value: 47.01195219123506. Run completed at 2024-04-29 17:35:18.839560
Loading RandomForest_NVDA.
Rank 1: trial no. 1, value: 50.59760956175299. Run completed at 2024-04-29 17:44:32.47

R2                  -0.03926330
MSE                  0.00036640
RMSE                 0.01770864
MAE                  0.01272426
p                   -0.01848382
Accuracy            48.70000000
Avg. daily return    0.00066838
Std. daily return    0.01147748
Risk adj. return     0.04684723
dtype: float64

Ensemble of deep learning models

In [63]:
# Ensemble of the three neural models: average the test-set frames per
# (Date, Stock), then score the averaged predictions for each stock.
models = ["CNN", "LSTM", "ConvLSTM"]
stocks = ["NVDA", "JPM", "HD", "UNH"]

# One test-set frame per "<model>_<stock>" experiment, tagged with its stock so
# that frames from different models can be grouped on (Date, Stock) below.
test_frames = []
for m in models:
    for s in stocks:
        # Only the test-set frame is needed; the validation frame and the
        # hyperparameters returned alongside it are discarded.
        _, test_df, _ = get_prediction_dfs_from_experiment(experiment_name=f"{m}_{s}")
        # `assign` returns a new frame, leaving the helper's returned object untouched.
        test_frames.append(test_df.assign(Stock=s))

# Rows sharing a (Date, Stock) key come from different models, so the mean is the
# ensemble value. NOTE(review): assumes every model covers the same test dates
# for a stock; otherwise some dates average fewer models — confirm.
dfs = pd.concat(test_frames).groupby(["Date", "Stock"]).mean()

# Score each stock's ensemble predictions separately.
per_stock_metrics = []
for s in stocks:
    stock_df = dfs.xs(s, level="Stock")
    per_stock_metrics.append(get_all_metrics(stock_df["Predictions"], stock_df["Actuals"]))
metrics = pd.DataFrame(per_stock_metrics, index=stocks)

# Headline figure: each metric averaged across the stocks.
metrics.mean()

Loading CNN_NVDA.
Rank 1: trial no. 11, value: 0.5737051963806152. Run completed at 2024-04-29 20:22:15.814904
Loading CNN_JPM.
Rank 1: trial no. 9, value: 0.5737051963806152. Run completed at 2024-04-29 18:34:47.643948
Loading CNN_HD.
Rank 1: trial no. 0, value: 0.5498008131980896. Run completed at 2024-04-29 18:35:17.552823
Loading CNN_UNH.
Rank 1: trial no. 8, value: 0.5816733241081238. Run completed at 2024-04-29 18:44:16.965952
Loading LSTM_NVDA.
Rank 1: trial no. 0, value: 0.5537848472595215. Run completed at 2024-04-29 18:01:30.993116
Loading LSTM_JPM.
Rank 1: trial no. 3, value: 0.6175298690795898. Run completed at 2024-04-29 18:13:38.976207
Loading LSTM_HD.
Rank 1: trial no. 4, value: 0.6055777072906494. Run completed at 2024-04-29 18:19:41.955427
Loading LSTM_UNH.
Rank 1: trial no. 11, value: 0.5896414518356323. Run completed at 2024-04-29 20:23:47.878337
Loading ConvLSTM_NVDA.
Rank 1: trial no. 2, value: 0.518652081489563. Run completed at 2024-04-29 18:47:06.218729
Loading 

R2                  -0.02423907
MSE                  0.00035377
RMSE                 0.01749355
MAE                  0.01245824
p                   -0.01024205
Accuracy            52.00000000
Avg. daily return    0.00138459
Std. daily return    0.01606900
Risk adj. return     0.05566150
dtype: float64

In [64]:
# Ensemble of CNN and LSTM only (ConvLSTM left out): average the test-set frames
# per (Date, Stock), then score the averaged predictions for each stock.
models = ["CNN", "LSTM"]
stocks = ["NVDA", "JPM", "HD", "UNH"]

# One test-set frame per "<model>_<stock>" experiment, tagged with its stock so
# that frames from different models can be grouped on (Date, Stock) below.
test_frames = []
for m in models:
    for s in stocks:
        # Only the test-set frame is needed; the validation frame and the
        # hyperparameters returned alongside it are discarded.
        _, test_df, _ = get_prediction_dfs_from_experiment(experiment_name=f"{m}_{s}")
        # `assign` returns a new frame, leaving the helper's returned object untouched.
        test_frames.append(test_df.assign(Stock=s))

# Rows sharing a (Date, Stock) key come from different models, so the mean is the
# ensemble value. NOTE(review): assumes every model covers the same test dates
# for a stock; otherwise some dates average fewer models — confirm.
dfs = pd.concat(test_frames).groupby(["Date", "Stock"]).mean()

# Score each stock's ensemble predictions separately.
per_stock_metrics = []
for s in stocks:
    stock_df = dfs.xs(s, level="Stock")
    per_stock_metrics.append(get_all_metrics(stock_df["Predictions"], stock_df["Actuals"]))
metrics = pd.DataFrame(per_stock_metrics, index=stocks)

# Headline figure: each metric averaged across the stocks.
metrics.mean()

Loading CNN_NVDA.
Rank 1: trial no. 11, value: 0.5737051963806152. Run completed at 2024-04-29 20:22:15.814904
Loading CNN_JPM.
Rank 1: trial no. 9, value: 0.5737051963806152. Run completed at 2024-04-29 18:34:47.643948
Loading CNN_HD.
Rank 1: trial no. 0, value: 0.5498008131980896. Run completed at 2024-04-29 18:35:17.552823
Loading CNN_UNH.
Rank 1: trial no. 8, value: 0.5816733241081238. Run completed at 2024-04-29 18:44:16.965952
Loading LSTM_NVDA.
Rank 1: trial no. 0, value: 0.5537848472595215. Run completed at 2024-04-29 18:01:30.993116
Loading LSTM_JPM.
Rank 1: trial no. 3, value: 0.6175298690795898. Run completed at 2024-04-29 18:13:38.976207
Loading LSTM_HD.
Rank 1: trial no. 4, value: 0.6055777072906494. Run completed at 2024-04-29 18:19:41.955427
Loading LSTM_UNH.
Rank 1: trial no. 11, value: 0.5896414518356323. Run completed at 2024-04-29 20:23:47.878337


R2                  -0.04951123
MSE                  0.00035996
RMSE                 0.01767374
MAE                  0.01262452
p                   -0.01063027
Accuracy            50.60000000
Avg. daily return    0.00120291
Std. daily return    0.01558573
Risk adj. return     0.04666328
dtype: float64