In [1]:
import pandas as pd
import numpy as np

# Let pandas display up to 500 columns before it starts eliding them, so the
# wide feature frames used below can be inspected in full.
pd.set_option('display.max_columns', 500)

# Load Datasets

In [2]:
def train_test_split(X, y, train_idx=None, test_idx=None):
    """Partition features and labels into train and test pieces by index label.

    Args:
        X: Feature frame; rows are selected with ``X.loc[...]``.
        y: Labels sharing X's index; rows are selected with ``y.loc[...]``.
        train_idx: Index labels that make up the training partition.
        test_idx: Index labels that make up the test partition.

    Returns:
        A 4-tuple ``(X_train, y_train, X_test, y_test)``.
    """
    train_part = (X.loc[train_idx], y.loc[train_idx])
    test_part = (X.loc[test_idx], y.loc[test_idx])
    return train_part + test_part

def load_split_data(suffix=None, split=False, window=14):
    """Load a cached (X, y) dataset from ``data/``, rebuilding it if loading fails.

    Args:
        suffix: Dataset tag used in the pickle file names
            (``data/X_{suffix}.pkl`` / ``data/y_{suffix}.pkl``).
            ``None`` means ``'DEFAULT'``.
        split: When True, return a train/test split: rows labelled
            '2018' through '2020' for training and '2021' onwards for testing.
        window: Forwarded to ``build_Xy`` when the cache has to be rebuilt.

    Returns:
        ``(X, y)`` when ``split`` is False, otherwise
        ``(X_train, y_train, X_test, y_test)``.
    """
    if suffix is None:
        suffix = 'DEFAULT'

    x_path = f'data/X_{suffix}.pkl'
    y_path = f'data/y_{suffix}.pkl'

    try:
        X = pd.read_pickle(x_path)
        y = pd.read_pickle(y_path)
    except Exception:
        # Any load failure (missing or unreadable pickle) triggers a rebuild.
        # `Exception` rather than a bare `except:` so that Ctrl-C / SystemExit
        # are not mistaken for a cache miss.
        # NOTE(review): `build_Xy` and `df` are not defined in the cells shown
        # here; they must already exist in the kernel for this path to work.
        X, y, _ = build_Xy(df, window=window, use_atr=True, atr_ratio=(20,5), reverse=False, debug=True)
        X.to_pickle(x_path)
        y.to_pickle(y_path)

    if not split:
        return X, y

    # Label-based slicing on year strings; presumably X has a datetime-like
    # index -- confirm against build_Xy.
    return train_test_split(X, y, X.loc['2018':'2020'].index, X.loc['2021':].index)
    
# Tags of the cached datasets to evaluate; each one maps to a pair of pickles
# data/X_<tag>.pkl / data/y_<tag>.pkl and to one row of the history file.
dataset_suffixes = [
    '20210806a',
    '20210806b',
    '20210806c',
    '20210806d',
    '20210806e',
    '20210806f',
    '20210806g',
    '20210806h',
    '20210806i',
    '20210806j',
    '20210806k',
    '20210806l',
]

# Per-dataset build parameters, indexed by dataset tag. Read once up front
# instead of re-reading the same pickle on every loop iteration.
data_file_hist = pd.read_pickle('data/data_file_hist_all_cols.pkl')

# tag -> dict holding the train/test split plus the parameters the dataset
# was built with (needed later by the trading simulator).
datasets = {}

for d in dataset_suffixes:
    X_train, y_train, X_test, y_test = load_split_data(suffix=d, split=True)
    dataset_info = data_file_hist.loc[d]
    data = dict(X_train = X_train,
                y_train = y_train,
                X_test = X_test,
                y_test = y_test,
                use_atr = dataset_info['use_atr'],
                ratio = dataset_info['ratio'],
                reverse = dataset_info['reverse'],
                window = dataset_info['window'],
               )
    datasets[d] = data

# Simulator

In [3]:
def get_target_stoploss(df, threshold_ratio=(0.04,0.02), use_atr=True, atr_ratio=(2,1), reverse=False):
    """Compute per-row take-profit targets and stop-loss levels.

    Args:
        df: Frame exposing ``close`` and, depending on the mode, ``low``,
            ``high`` and ``atr`` as attributes/columns.
        threshold_ratio: ``(target, stop)`` fractions of the close price,
            used when ``use_atr`` is False.
        use_atr: When True the offsets are multiples of ``df.atr``; otherwise
            they are fractions of ``df.close``.
        atr_ratio: ``(target, stop)`` ATR multipliers, used when ``use_atr``
            is True.
        reverse: False puts the target above the close and the stop below;
            True mirrors both.

    Returns:
        ``(targets, stop_losses)``.
    """
    if use_atr:
        # ATR mode: the stop is anchored on the bar's low (or high when
        # reversed), the target on its close.
        target_offset = df.atr*atr_ratio[0]
        stop_offset = df.atr*atr_ratio[1]
        if reverse:
            return df.close-target_offset, df.high+stop_offset
        return df.close+target_offset, df.low-stop_offset

    # Percentage mode: both levels are anchored on the close.
    target_offset = df.close*threshold_ratio[0]
    stop_offset = df.close*threshold_ratio[1]
    if reverse:
        return df.close-target_offset, df.close+stop_offset
    return df.close+target_offset, df.close-stop_offset

def get_decisions_and_prices(x_data, pred, info_dict):
    """Turn model predictions into an alternating sequence of entries and exits.

    Walks the rows in order. While flat, the next row whose prediction equals
    1 opens a position at that row's close; the target and stop-loss for the
    position come from ``get_target_stoploss`` evaluated on the entry row.
    While in a position, the first later row whose high/low range reaches the
    target or the stop closes it.

    Args:
        x_data: Frame with ``low``, ``high`` and ``close`` columns (plus
            whatever ``get_target_stoploss`` needs, e.g. ``atr``). Its index
            is ignored: rows are addressed by position.
        pred: Per-row predictions (DataFrame, Series, ndarray or list); a
            value of 1 is an entry signal.
        info_dict: Must contain ``'model_use_atr'``, ``'model_ratio'`` and
            ``'model_reverse'``. ``'model_ratio'`` is passed as both the ATR
            ratio and the threshold ratio.

    Returns:
        ``(decision, execution_price)``: two Series on a 0..n-1 index.
        ``decision`` is 1 on entry rows, -1 on exit rows and 0 elsewhere;
        ``execution_price`` is the fill price on entry/exit rows and 0.0
        elsewhere.
    """
    # Everything below is positional, so always normalise to a 0..n-1 index.
    # Resetting only when the index was not a RangeIndex left frames with a
    # RangeIndex that does not start at 0 addressed by the wrong labels.
    x_data = x_data.reset_index(drop=True)
    n_rows = len(x_data)

    # Accept any array-like; truncate so a too-long `pred` cannot point past
    # the last row.
    pred = np.asarray(pred).ravel()
    is_entry_signal = pred[:n_rows] == 1

    use_atr = info_dict['model_use_atr']
    atr_ratio = info_dict['model_ratio']
    threshold_ratio = info_dict['model_ratio']
    reverse = info_dict['model_reverse']

    targets, stop_losses = get_target_stoploss(x_data,
                                               use_atr=use_atr,
                                               atr_ratio=atr_ratio,
                                               threshold_ratio=threshold_ratio,
                                               reverse=reverse)
    targets = targets.to_numpy()
    stop_losses = stop_losses.to_numpy()
    low_prices = x_data['low'].to_numpy()
    high_prices = x_data['high'].to_numpy()
    close_prices = x_data['close'].to_numpy()

    # Decisions:
    # 1 = buy
    # 0 = hold (default)
    # -1 = sell
    decision = np.zeros(n_rows, dtype='int64')
    execution_price = np.zeros(n_rows, dtype='float64')

    in_position = False
    target = -1
    stoploss = -1
    i = 0
    while i < n_rows:
        if not in_position:
            # Find the next entry signal at or after row i.
            hits = np.flatnonzero(is_entry_signal[i:])
            if hits.size == 0:
                # No more buy opportunities
                break
            entry_idx = hits[0] + i
            target = targets[entry_idx]
            stoploss = stop_losses[entry_idx]
            decision[entry_idx] = 1
            execution_price[entry_idx] = close_prices[entry_idx]
            # The exit search starts on the row after the entry.
            i = entry_idx + 1
            in_position = True
        else:
            # Find the first row whose range touches the target or the stop.
            if not reverse:
                exit_mask = (high_prices[i:] >= target) | (low_prices[i:] <= stoploss)
            else:
                exit_mask = (low_prices[i:] <= target) | (high_prices[i:] >= stoploss)
            hits = np.flatnonzero(exit_mask)
            if hits.size == 0:
                # No more sell opportunities; the position stays open.
                break
            exit_idx = hits[0] + i
            # If the target lies inside the bar's range the exit is filled at
            # the target -- even when the stop was breached on the same bar;
            # otherwise it is filled at the stop.
            if low_prices[exit_idx] <= target <= high_prices[exit_idx]:
                execution_price[exit_idx] = target
            else:
                execution_price[exit_idx] = stoploss
            decision[exit_idx] = -1
            i = exit_idx + 1
            in_position = False

    return (pd.Series(decision, index=x_data.index),
            pd.Series(execution_price, index=x_data.index))

def simulate(in_df, starting_value, trading_fees_percent, trading_fees_buy, trading_fees_sell):
    """Replay a sequence of trades and track the resulting account value.

    Args:
        in_df: Frame with a ``decision`` column (1 = buy, -1 = sell) and a
            ``price`` column, one row per executed trade, in order. It is not
            modified.
        starting_value: Account value before the first trade.
        trading_fees_percent: Proportional fee, in percent, applied to the
            value after every trade.
        trading_fees_buy: Flat amount subtracted from the value before a buy.
        trading_fees_sell: Flat amount subtracted from the value before a sell.

    Returns:
        A float Series named ``'value'`` on ``in_df``'s index holding the
        value after each trade. Rows from the point where the simulation
        stops (value no longer positive, value driven below zero, or a
        decision that is neither 1 nor -1) are left at 0.0.
    """
    fee_multiplier = 1.0 - trading_fees_percent / 100

    decisions = in_df['decision'].to_numpy()
    prices = in_df['price'].to_numpy()

    # Results are written by position, not by index label: a label-based
    # write would overwrite every row sharing that label when the index
    # contains duplicates.
    values = np.zeros(len(in_df), dtype='float64')
    value = starting_value

    for pos, (decision, price) in enumerate(zip(decisions, prices)):
        if not value > 0:
            break # nothing left to trade with
        # NOTE(review): a buy multiplies by price and a sell divides by it, so
        # a round trip gains when the exit price is below the entry price.
        # Confirm this is also the intended convention for non-reverse models.
        if decision == 1:
            value = ((value-trading_fees_buy) * price) * fee_multiplier
        elif decision == -1:
            value = ((value-trading_fees_sell) / price) * fee_multiplier
        else:
            break # not a buy or a sell
        if value < 0:
            break # flat fees exceeded the value; this row stays at 0.0
        values[pos] = value

    return pd.Series(values, index=in_df.index, name='value')

def run_simulator(X, y, model_use_atr, model_ratio, model_reverse,
                  starting_value=1, trading_fees_percent=0.1,
                  trading_fees_buy=0, trading_fees_sell=0):
    """Trade on a set of predictions and return the final account value.

    Args:
        X: Price/feature frame handed to ``get_decisions_and_prices``.
        y: Per-row predictions (1 = entry signal).
        model_use_atr, model_ratio, model_reverse: Target/stop-loss settings
            forwarded to ``get_decisions_and_prices``.
        starting_value: Account value before the first trade.
        trading_fees_percent: Proportional fee in percent, per trade.
        trading_fees_buy: Flat fee charged on each buy.
        trading_fees_sell: Flat fee charged on each sell.

    Returns:
        The account value recorded at the last sell. ``starting_value`` is
        returned unchanged when no trade happened, and also when a position
        was opened but never closed (previously that case raised IndexError).
    """
    d = dict(model_use_atr=model_use_atr, model_ratio=model_ratio, model_reverse=model_reverse)

    decision, execution_price = get_decisions_and_prices(X, y, d)

    # Only the two trade columns are needed, so build a small frame instead of
    # copying the whole feature matrix.
    trades = pd.DataFrame({'decision': decision.to_numpy(),
                           'price': execution_price.to_numpy()})
    sim_df = trades[trades['decision'] != 0]
    if len(sim_df) == 0:
        return starting_value

    values = simulate(sim_df, starting_value, trading_fees_percent,
                      trading_fees_buy, trading_fees_sell).to_numpy()
    closed_values = values[sim_df['decision'].to_numpy() == -1]
    if closed_values.size == 0:
        # A position was opened but never closed: no realised result.
        return starting_value
    return closed_values[-1]

# Train & Score Classifier with All Columns

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

import warnings
# Suppress every UserWarning (and subclasses) raised from here on, for the
# whole kernel session. NOTE(review): this is broad -- it also hides warnings
# unrelated to model fitting.
warnings.filterwarnings(action='ignore', category=UserWarning)


def score_clf(clf, X, y, model_use_atr=True, model_ratio=(2,1), model_reverse=True):
    """Score a fitted classifier on classification metrics and simulated trading.

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: Feature frame; also passed to the simulator.
        y: True labels.
        model_use_atr, model_ratio, model_reverse: Target/stop-loss settings
            forwarded to ``run_simulator``.

    Returns:
        Dict with keys ``'f1'``, ``'pr'`` (precision), ``'rc'`` (recall) and
        ``'pf'`` (value returned by ``run_simulator``), each rounded to four
        decimals.
    """
    pred = clf.predict(X)

    scores = {}
    metric_table = (('f1', f1_score), ('pr', precision_score), ('rc', recall_score))
    for key, metric in metric_table:
        scores[key] = round(metric(y,pred),4)

    final_value = run_simulator(X,pred,model_use_atr,model_ratio,model_reverse)
    scores['pf'] = round(final_value, 4)
    return scores

In [5]:
def add_scores(dname, clf_name, train_scores, test_scores,
               hist_path='data/data_file_hist_all_cols.pkl'):
    """Record a classifier's train/test scores in the dataset history pickle.

    The scores are stored as strings in the columns
    ``'{clf_name}_train_scores'`` and ``'{clf_name}_test_scores'`` of the row
    labelled ``dname``, and the file is written back in place. Nothing is
    written when ``dname`` is not in the file's index.

    Args:
        dname: Dataset tag (row label in the history frame).
        clf_name: Short classifier name used as the column prefix.
        train_scores: Scores on the training split (stored as ``str``).
        test_scores: Scores on the test split (stored as ``str``).
        hist_path: Pickle file to update. Defaults to the all-columns history
            file, so existing four-argument calls behave as before.

    Any exception is reported with ``print`` instead of being raised, so a
    bookkeeping failure does not abort a long training run.
    """
    try:
        data_file_df = pd.read_pickle(hist_path)
        if dname not in data_file_df.index:
            return
        data_file_df.at[dname, f'{clf_name}_train_scores'] = str(train_scores)
        data_file_df.at[dname, f'{clf_name}_test_scores'] = str(test_scores)
        data_file_df.to_pickle(hist_path)
    except Exception as e:
        #raise e
        print(f'Exception ({dname},{clf_name}): ', e)

# Classifiers to compare, keyed by a short name. add_scores uses that name as
# the prefix of the score columns it writes. Estimators that accept a
# random_state are seeded with 42.
clfs = {
    'gnb' : GaussianNB(),
    'lrc' : LogisticRegression(random_state=42, max_iter=10000),
    'rfc' : RandomForestClassifier(random_state=42, n_jobs=-1),
    'abc' : AdaBoostClassifier(random_state=42),
    'gbc': GradientBoostingClassifier(random_state=42),
    'xgb' : xgb.XGBClassifier(n_jobs=-1, random_state=42, use_label_encoder=False),
    'mlp' : make_pipeline(StandardScaler(),MLPClassifier(random_state=42)),
}

# Fit every classifier on every dataset using all feature columns, score it on
# both splits, persist the scores and print the test scores.
# Note: the same estimator objects are re-fitted for each dataset, so after the
# loop each entry of `clfs` holds the model for the last dataset only.
for dname, data in datasets.items():
    print(f'{dname}:')
    for clf_name,clf in clfs.items():
        print(f'  {clf_name}:  ',end='')
        # %time is an IPython line magic: it prints the wall time of the fit.
        %time clf.fit(data['X_train'], data['y_train'])
        train_scores = score_clf(clf, data['X_train'], data['y_train'], data['use_atr'], data['ratio'], data['reverse'])
        test_scores = score_clf(clf, data['X_test'], data['y_test'], data['use_atr'], data['ratio'], data['reverse'])
        add_scores(dname, clf_name, train_scores, test_scores)
        print('    Test Scores:', test_scores)
    print(' ')

20210806g:
  gnb:  Wall time: 757 ms
    Test Scores: {'f1': 0.3686, 'pr': 0.2769, 'rc': 0.5513, 'pf': 0.5891}
  lrc:  Wall time: 2min 5s
    Test Scores: {'f1': 0.0013, 'pr': 0.3333, 'rc': 0.0006, 'pf': 0.8811}
  rfc:  Wall time: 15.4 s
    Test Scores: {'f1': 0.1167, 'pr': 0.5278, 'rc': 0.0656, 'pf': 1.2348}
  abc:  Wall time: 2min 42s
    Test Scores: {'f1': 0.0063, 'pr': 0.8824, 'rc': 0.0031, 'pf': 1.0243}
  gbc:  Wall time: 13min
    Test Scores: {'f1': 0.0004, 'pr': 0.3333, 'rc': 0.0002, 'pf': 0.9963}
Wall time: 55.2 s
    Test Scores: {'f1': 0.1308, 'pr': 0.4057, 'rc': 0.078, 'pf': 1.1658}
  mlp:  Wall time: 5min 48s
    Test Scores: {'f1': 0.2053, 'pr': 0.2783, 'rc': 0.1626, 'pf': 0.3146}
 
20210806h:
  gnb:  Wall time: 790 ms
    Test Scores: {'f1': 0.3285, 'pr': 0.2351, 'rc': 0.545, 'pf': 0.4365}
  lrc:  Wall time: 1min 42s
    Test Scores: {'f1': 0.002, 'pr': 0.4, 'rc': 0.001, 'pf': 1.0008}
  rfc:  Wall time: 14.3 s
    Test Scores: {'f1': 0.0572, 'pr': 0.4106, 'rc': 0.0307,

# Limit to 3 lookbacks

In [7]:
import re

def get_columns(X_train,lookbacks):
    """Return the column names to keep for a given maximum lookback.

    A column whose name ends in ``_<digits>`` is dropped when that number is
    strictly greater than ``lookbacks``; every other column is kept, in the
    original order.

    NOTE(review): any trailing ``_<number>`` is treated as a lookback index,
    so a column such as an indicator named with its period would be dropped
    as well -- confirm against the naming used when the features are built.

    Args:
        X_train: Frame whose (string) column names are inspected.
        lookbacks: Largest numeric suffix to keep.

    Returns:
        List of retained column names.
    """
    suffix_pattern = re.compile(r'^.*_([0-9]+)$')

    def _keep(name):
        # Keep names without a numeric suffix, or with one within the limit.
        m = suffix_pattern.match(name)
        return m is None or int(m[1]) <= lookbacks

    # Single pass instead of repeated list.remove() calls.
    return [c for c in X_train.columns if _keep(c)]


def add_scores(dname, clf_name, train_scores, test_scores,
               hist_path='data/data_file_hist_some_cols.pkl'):
    """Record a classifier's train/test scores in the dataset history pickle.

    The scores are stored as strings in the columns
    ``'{clf_name}_train_scores'`` and ``'{clf_name}_test_scores'`` of the row
    labelled ``dname``, and the file is written back in place. Nothing is
    written when ``dname`` is not in the file's index.

    Args:
        dname: Dataset tag (row label in the history frame).
        clf_name: Short classifier name used as the column prefix.
        train_scores: Scores on the training split (stored as ``str``).
        test_scores: Scores on the test split (stored as ``str``).
        hist_path: Pickle file to update. Defaults to the reduced-columns
            history file, so existing four-argument calls behave as before.

    Any exception is reported with ``print`` instead of being raised, so a
    bookkeeping failure does not abort a long training run.
    """
    try:
        data_file_df = pd.read_pickle(hist_path)
        if dname not in data_file_df.index:
            return
        data_file_df.at[dname, f'{clf_name}_train_scores'] = str(train_scores)
        data_file_df.at[dname, f'{clf_name}_test_scores'] = str(test_scores)
        data_file_df.to_pickle(hist_path)
    except Exception as e:
        #raise e
        print(f'Exception ({dname},{clf_name}): ', e)

clfs = {
    'gnb' : GaussianNB(),
    'lrc' : LogisticRegression(random_state=42, max_iter=10000),
    'rfc' : RandomForestClassifier(random_state=42, n_jobs=-1),
    'abc' : AdaBoostClassifier(random_state=42),
    'gbc': GradientBoostingClassifier(random_state=42),
    'xgb' : xgb.XGBClassifier(n_jobs=-1, random_state=42, use_label_encoder=False),
    'mlp' : make_pipeline(StandardScaler(),MLPClassifier(random_state=42)),
}

for dname, data in datasets.items():
    print(f'{dname}:')
    
    columns = get_columns(data['X_train'],lookbacks=3)
    data['X_train'] = data['X_train'][columns]
    data['X_test'] = data['X_test'][columns]
    
    for clf_name,clf in clfs.items():
        print(f'  {clf_name}:  ',end='')
        %time clf.fit(data['X_train'], data['y_train'])
        train_scores = score_clf(clf, data['X_train'], data['y_train'], data['use_atr'], data['ratio'], data['reverse'])
        test_scores = score_clf(clf, data['X_test'], data['y_test'], data['use_atr'], data['ratio'], data['reverse'])
        add_scores(dname, clf_name, train_scores, test_scores)
        print('    Test Scores:', test_scores)
    print(' ')

20210806a:
  gnb:  Wall time: 219 ms
    Test Scores: {'f1': 0.3709, 'pr': 0.3565, 'rc': 0.3866, 'pf': 0.28}
  lrc:  Wall time: 37.8 s
    Test Scores: {'f1': 0.0068, 'pr': 0.2973, 'rc': 0.0034, 'pf': 0.9901}
  rfc:  Wall time: 6.33 s
    Test Scores: {'f1': 0.1973, 'pr': 0.3709, 'rc': 0.1344, 'pf': 0.4862}
  abc:  Wall time: 40.4 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}
  gbc:  Wall time: 3min 29s
    Test Scores: {'f1': 0.0003, 'pr': 0.5, 'rc': 0.0002, 'pf': 0.993}
Wall time: 16.7 s
    Test Scores: {'f1': 0.1308, 'pr': 0.3145, 'rc': 0.0826, 'pf': 0.4693}
  mlp:  Wall time: 2min 27s
    Test Scores: {'f1': 0.1447, 'pr': 0.3541, 'rc': 0.0909, 'pf': 0.2577}
 
20210806b:
  gnb:  Wall time: 233 ms
    Test Scores: {'f1': 0.2737, 'pr': 0.2234, 'rc': 0.3534, 'pf': 0.1115}
  lrc:  Wall time: 52.3 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 0.9955}
  rfc:  Wall time: 7.33 s
    Test Scores: {'f1': 0.0243, 'pr': 0.2825, 'rc': 0.0127, 'pf': 0.9578}
  abc:  W

Wall time: 14.9 s
    Test Scores: {'f1': 0.3195, 'pr': 0.4459, 'rc': 0.2489, 'pf': 0.2391}
  mlp:  Wall time: 2min 27s
    Test Scores: {'f1': 0.2728, 'pr': 0.4186, 'rc': 0.2023, 'pf': 0.0998}
 
20210806j:
  gnb:  Wall time: 207 ms
    Test Scores: {'f1': 0.3402, 'pr': 0.2908, 'rc': 0.4097, 'pf': 0.0388}
  lrc:  Wall time: 1min 7s
    Test Scores: {'f1': 0.007, 'pr': 0.125, 'rc': 0.0036, 'pf': 0.9787}
  rfc:  Wall time: 6.95 s
    Test Scores: {'f1': 0.0637, 'pr': 0.2591, 'rc': 0.0363, 'pf': 0.7818}
  abc:  Wall time: 40.7 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}
  gbc:  Wall time: 3min 27s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}
Wall time: 14.4 s
    Test Scores: {'f1': 0.0561, 'pr': 0.3071, 'rc': 0.0309, 'pf': 0.7762}
  mlp:  Wall time: 2min 30s
    Test Scores: {'f1': 0.1041, 'pr': 0.2862, 'rc': 0.0636, 'pf': 0.3291}
 
20210806k:
  gnb:  Wall time: 218 ms
    Test Scores: {'f1': 0.1594, 'pr': 0.2136, 'rc': 0.1271, 'pf': 0.2785}
  lrc:  Wall 

# No lookbacks

In [8]:
def add_scores(dname, clf_name, train_scores, test_scores,
               hist_path='data/data_file_hist_no_lkbk.pkl'):
    """Record a classifier's train/test scores in the dataset history pickle.

    The scores are stored as strings in the columns
    ``'{clf_name}_train_scores'`` and ``'{clf_name}_test_scores'`` of the row
    labelled ``dname``, and the file is written back in place. Nothing is
    written when ``dname`` is not in the file's index.

    Args:
        dname: Dataset tag (row label in the history frame).
        clf_name: Short classifier name used as the column prefix.
        train_scores: Scores on the training split (stored as ``str``).
        test_scores: Scores on the test split (stored as ``str``).
        hist_path: Pickle file to update. Defaults to the no-lookback history
            file, so existing four-argument calls behave as before.

    Any exception is reported with ``print`` instead of being raised, so a
    bookkeeping failure does not abort a long training run.
    """
    try:
        data_file_df = pd.read_pickle(hist_path)
        if dname not in data_file_df.index:
            return
        data_file_df.at[dname, f'{clf_name}_train_scores'] = str(train_scores)
        data_file_df.at[dname, f'{clf_name}_test_scores'] = str(test_scores)
        data_file_df.to_pickle(hist_path)
    except Exception as e:
        #raise e
        print(f'Exception ({dname},{clf_name}): ', e)

clfs = {
    'gnb' : GaussianNB(),
    'lrc' : LogisticRegression(random_state=42, max_iter=10000),
    'rfc' : RandomForestClassifier(random_state=42, n_jobs=-1),
    'abc' : AdaBoostClassifier(random_state=42),
    'gbc': GradientBoostingClassifier(random_state=42),
    'xgb' : xgb.XGBClassifier(n_jobs=-1, random_state=42, use_label_encoder=False),
    'mlp' : make_pipeline(StandardScaler(),MLPClassifier(random_state=42)),
}

for dname, data in datasets.items():
    print(f'{dname}:')
    
    columns = get_columns(data['X_train'],lookbacks=0)
    data['X_train'] = data['X_train'][columns]
    data['X_test'] = data['X_test'][columns]
    
    for clf_name,clf in clfs.items():
        print(f'  {clf_name}:  ',end='')
        %time clf.fit(data['X_train'], data['y_train'])
        train_scores = score_clf(clf, data['X_train'], data['y_train'], data['use_atr'], data['ratio'], data['reverse'])
        test_scores = score_clf(clf, data['X_test'], data['y_test'], data['use_atr'], data['ratio'], data['reverse'])
        add_scores(dname, clf_name, train_scores, test_scores)
        print('    Test Scores:', test_scores)
    print(' ')

20210806a:
  gnb:  Wall time: 62.8 ms
    Test Scores: {'f1': 0.0217, 'pr': 0.3077, 'rc': 0.0113, 'pf': 0.7722}
  lrc:  Wall time: 833 ms
    Test Scores: {'f1': 0.0016, 'pr': 0.2083, 'rc': 0.0008, 'pf': 1.0036}
  rfc:  Wall time: 3.26 s
    Test Scores: {'f1': 0.211, 'pr': 0.3804, 'rc': 0.146, 'pf': 0.4542}
  abc:  Wall time: 10.8 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}
  gbc:  Wall time: 53.3 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}
Wall time: 5.6 s
    Test Scores: {'f1': 0.1685, 'pr': 0.3605, 'rc': 0.11, 'pf': 0.5238}
  mlp:  Wall time: 54.7 s
    Test Scores: {'f1': 0.1116, 'pr': 0.4145, 'rc': 0.0645, 'pf': 0.5697}
 
20210806b:
  gnb:  Wall time: 60.9 ms
    Test Scores: {'f1': 0.0148, 'pr': 0.2586, 'rc': 0.0076, 'pf': 0.899}
  lrc:  Wall time: 843 ms
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}
  rfc:  Wall time: 3.24 s
    Test Scores: {'f1': 0.038, 'pr': 0.3008, 'rc': 0.0203, 'pf': 0.8503}
  abc:  Wall time: 10.7 s
    T

Wall time: 6.46 s
    Test Scores: {'f1': 0.2855, 'pr': 0.4452, 'rc': 0.2101, 'pf': 0.2822}
  mlp:  Wall time: 1min 34s
    Test Scores: {'f1': 0.2606, 'pr': 0.4477, 'rc': 0.1838, 'pf': 0.2079}
 
20210806j:
  gnb:  Wall time: 61.8 ms
    Test Scores: {'f1': 0.1299, 'pr': 0.2583, 'rc': 0.0868, 'pf': 0.1997}
  lrc:  Wall time: 3.16 s
    Test Scores: {'f1': 0.0037, 'pr': 0.0885, 'rc': 0.0019, 'pf': 0.9381}
  rfc:  Wall time: 3.9 s
    Test Scores: {'f1': 0.0935, 'pr': 0.2588, 'rc': 0.0571, 'pf': 0.5651}
  abc:  Wall time: 10.7 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}
  gbc:  Wall time: 53.3 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}
Wall time: 6.33 s
    Test Scores: {'f1': 0.0632, 'pr': 0.252, 'rc': 0.0362, 'pf': 0.7515}
  mlp:  Wall time: 1min 32s
    Test Scores: {'f1': 0.0425, 'pr': 0.354, 'rc': 0.0226, 'pf': 0.8883}
 
20210806k:
  gnb:  Wall time: 59.9 ms
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 0.991}
  lrc:  Wall time: 2.86 s