In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
def get_exp_path(expid, logs_folder=None):
    """
    Locate the unique experiment directory that is named after an experiment id.

        Args:
            expid (int | str):          experiment id; converted to str and used
                                        as the directory name to search for
            logs_folder (path-like):    root directory that is searched
                                        recursively; defaults to the project's
                                        "logs" directory
        Returns:
            (Path):                     path of the single matching directory
        Raises:
            ValueError:                 if no directory, or more than one
                                        directory, carries that name
    """
    expid = str(expid)
    if logs_folder is None:
        logs_folder = Path(r"C:\Users\Mathiass\Documents\Projects\master-thesis")/"logs"
    logs_folder = Path(logs_folder)
    # Only directories count as experiment folders: a stray file that happens
    # to carry the same name must not make the lookup ambiguous.
    matches_list = [match for match in logs_folder.rglob(expid) if match.is_dir()]
    if len(matches_list) != 1:
        raise ValueError(f"There exists none or more than 1 folder with "
                            f"experiment id {expid} in the {logs_folder.name} "
                            "directory!")
    return matches_list[0]

In [3]:
def multi_categorize(y: float, classes: int) -> int:
    """
    Creates categorical labels from continuous values.

        Args:
            y (float):      continuous target variable (option return)
            classes (int):  number of classes to create (only 5 is implemented)
        Returns:
            (int):          class assignment in {0, 1, 2, 3, 4}
        Raises:
            ValueError:     if classes is anything other than 5
        CAREFUL: classes have to be between [0, C) for F.crossentropyloss.
        NOTE: a NaN return fails every comparison below and therefore lands in
        the middle class 2.
    """
    if classes != 5:
        raise ValueError("Only multi for 5 classes implemented right now.")
    # thresholds: +/- 2.5% and +/- 5%
    if y > 0.05:
        return 4
    if 0.025 < y <= 0.05:
        return 3
    if -0.05 <= y < -0.025:
        return 1
    if y < -0.05:
        return 0
    return 2 # all returns \in [-0.025, 0.025]

### y_true

In [4]:
# Load the option data set; its "option_ret" column is the continuous target
# that is bucketed into classes in the next cell.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact location exists.
data = pd.read_parquet(r"C:\Users\Mathiass\Documents\Projects\master-thesis\data\final_df_call_cao_small.parquet")

In [5]:
# Ground-truth labels: bucket every option return into one of five classes.
y_true = data["option_ret"].apply(lambda ret: multi_categorize(ret, classes=5))

### y_pred

In [6]:
def load_pred(exp_id):
    """
    Load the predictions of a single experiment.

        Args:
            exp_id (int | str): experiment id, resolved via get_exp_path
        Returns:
            (pd.Series):        the "pred" column of the experiment's
                                "all_pred.csv", renamed to the name of the
                                directory that contains the experiment folder
    """
    exp_dir = get_exp_path(exp_id)
    pred_table = pd.read_csv(exp_dir/"all_pred.csv")
    return pred_table["pred"].rename(exp_dir.parent.name)

In [7]:
def concat_preds(exp_id_ls):
    """
    Collect the predictions of several experiments side by side.

        Args:
            exp_id_ls (iterable):   experiment ids, each loaded with load_pred
        Returns:
            (pd.DataFrame):         one column per experiment, in the order of
                                    exp_id_ls
    """
    return pd.concat([load_pred(exp_id) for exp_id in exp_id_ls], axis=1)

In [8]:
# Experiment ids of the models that are combined into the ensemble ("medium" set).
# NOTE(review): this list is labelled "medium", yet the parquet file loaded earlier
# and the CSV path written at the end both contain "small" — confirm the label.
exp_id_ls = [20220924104656, 20220920100736, 20220923074741, 20220923123700]
# Alternative "small" set — swap it in for the list above to use it:
# exp_id_ls = [20220919200811, 20220908133630, 20220915074003, 20220919213409]

In [9]:
# Build the prediction table: one column of predicted classes per experiment
# listed in exp_id_ls.
conc = concat_preds(exp_id_ls)

In [10]:
# Display the combined prediction table.
conc

Unnamed: 0,xgb,nn,transformer,rf
0,4,4,4,0
1,3,3,3,4
2,2,3,3,2
3,3,3,3,2
4,2,2,2,2
...,...,...,...,...
531788,2,2,2,2
531789,3,3,3,3
531790,2,2,2,2
531791,3,3,1,1


In [11]:
from scipy.stats import mode
# Majority vote: the most frequent predicted class in each row, i.e. across the
# models. Only the modal values are kept; the counts are discarded.
# NOTE(review): with four models a 2-2 tie is possible — confirm that SciPy's
# tie handling is the behavior wanted here.
m = mode(conc, axis=-1).mode

In [12]:
# The row-wise majority vote is used as the ensemble prediction.
y_pred = m

In [13]:
# The predictions are taken to line up with the last len(y_pred) rows of the
# data, so keep only that tail of the labels. Guard the lengths first: a count
# of 0 would turn the slice into "-0:", which selects everything instead of
# nothing, and more predictions than labels cannot be aligned at all.
assert 0 < len(y_pred) <= len(y_true), (
    f"cannot align {len(y_pred)} predictions with {len(y_true)} labels"
)
# .iloc makes the positional (not label-based) slicing explicit.
y_true = y_true.iloc[-len(y_pred):]

In [14]:
# Drop any length-1 axes so the predictions form a flat array.
y_pred = np.squeeze(y_pred)

In [15]:
# Display the labels that remain after trimming to the prediction length.
y_true

234731    4
234732    3
234733    2
234734    3
234735    2
         ..
766519    2
766520    2
766521    1
766522    2
766523    1
Name: option_ret, Length: 531793, dtype: int64

In [16]:
from sklearn.metrics import balanced_accuracy_score

In [17]:
# Balanced accuracy of the ensemble's majority vote against the true classes.
balanced_accuracy_score(y_true, y_pred)

0.3734701706817729

In [18]:
# Show the ensemble predictions as a one-column table.
pd.DataFrame(y_pred)

Unnamed: 0,0
0,4
1,3
2,2
3,3
4,2
...,...
531788,2
531789,3
531790,2
531791,1


In [19]:
# Get all_preds design
p = get_exp_path(20220924104656)
design = pd.read_csv(p/"all_pred.csv", index_col=0)

In [20]:
# Replace the template's "pred" column with the ensemble vote (mutates design
# in place; the values are assigned by position).
design["pred"] = y_pred

In [21]:
# Write the ensemble predictions to disk in the same layout as all_pred.csv.
# NOTE(review): the target folder sits under "small" although the active
# experiment list is labelled "medium" — confirm the destination. Presumably
# the folder has to exist already; verify before running.
design.to_csv(r"C:\Users\Mathiass\Documents\Projects\master-thesis\logs\production\small\ensemble\11111_rf\custom_preds.csv")