In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 500)

# Load Datasets

In [2]:
def train_test_split(X, y, train_idx=None, test_idx=None):
    """Select the train and test portions of ``X`` and ``y`` by index label.

    Both ``X`` and ``y`` are indexed with ``.loc`` using the same labels, so
    they are expected to share an index.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    def _take(labels):
        # Features first, then targets, for one set of labels.
        return X.loc[labels], y.loc[labels]

    return (*_take(train_idx), *_take(test_idx))

def load_split_data(suffix=None, split=False, window=14):
    """Load the cached feature/target pickles for one dataset variant.

    The data is read from ``data/X_{suffix}.pkl`` and ``data/y_{suffix}.pkl``.
    Only when one of those files is missing is the dataset rebuilt with
    ``build_Xy`` and written back to the same two paths. Any other read
    failure (for example a truncated or corrupt pickle) is raised to the
    caller instead of silently triggering a rebuild that would overwrite
    the cache.

    Parameters
    ----------
    suffix : str, optional
        Tag used in the cache file names. ``None`` selects ``'DEFAULT'``.
    split : bool, default False
        When true, split the data by index label: ``'2018':'2020'`` becomes
        the training set and ``'2021':`` the test set.
        (assumes X has a date-like sorted index -- TODO confirm)
    window : int, default 14
        Forwarded to ``build_Xy``; only used when the cache is rebuilt.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` when ``split`` is true,
        otherwise ``(X, y)``.
    """
    if suffix is None:
        suffix = 'DEFAULT'

    x_path = f'data/X_{suffix}.pkl'
    y_path = f'data/y_{suffix}.pkl'

    try:
        X = pd.read_pickle(x_path)
        y = pd.read_pickle(y_path)
    except FileNotFoundError:
        # Cache miss: rebuild and persist.
        # NOTE(review): `build_Xy` and `df` are not defined in the visible part
        # of this notebook; they must already exist in the kernel for this
        # branch to work. The rebuild also uses the same fixed settings for
        # every suffix, so distinct suffixes rebuilt here get identical data.
        X, y, _ = build_Xy(df, window=window, use_atr=True, atr_ratio=(20,5), reverse=False, debug=True)
        X.to_pickle(x_path)
        y.to_pickle(y_path)

    if split:
        return train_test_split(X, y, X.loc['2018':'2020'].index, X.loc['2021':].index)
    return X, y
    
# Twelve dataset variants, tagged 'a' through 'l' after a common prefix.
dataset_suffixes = [f'20210806{tag}' for tag in 'abcdefghijkl']

# suffix -> {'X_train', 'y_train', 'X_test', 'y_test'} for every variant.
datasets = {}

for d in dataset_suffixes:
    X_train, y_train, X_test, y_test = load_split_data(suffix=d, split=True)
    data = {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
    }
    datasets[d] = data

# Train & Score Classifier with All Columns

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)


def score_clf(clf, X, y):
    """Score a fitted classifier on ``X`` against the true labels ``y``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X : features passed to ``clf.predict``.
    y : true labels, compared against the predictions.

    Returns
    -------
    dict
        ``{'f1': ..., 'pr': ..., 'rc': ...}`` holding the F1, precision and
        recall scores, each rounded to 4 decimals.
    """
    # Predict once and reuse: the three metrics share the same predictions,
    # and inference on the full frame is the expensive part.
    y_pred = clf.predict(X)
    return {
        'f1': round(f1_score(y, y_pred), 4),
        'pr': round(precision_score(y, y_pred), 4),
        'rc': round(recall_score(y, y_pred), 4),
    }

In [9]:
def add_scores(dname, clf_name, train_scores, test_scores,
               hist_path='data/data_file_hist_all_cols.pkl'):
    """Record one classifier's train/test scores in a pickled history table.

    The table at ``hist_path`` is loaded, the cells
    ``[dname, '{clf_name}_train_scores']`` and
    ``[dname, '{clf_name}_test_scores']`` are set to the string form of the
    score dicts, and the table is written back to the same path.

    If ``dname`` is not in the table's index nothing is written. Any exception
    (including a missing history file) is printed rather than raised, so a
    failure to record scores does not abort a long training loop.

    Parameters
    ----------
    dname : label of the dataset row to update.
    clf_name : str
        Classifier key, used as the column-name prefix.
    train_scores, test_scores : objects stored via their ``str()`` form.
    hist_path : str, optional
        Pickle file holding the history table. Defaults to the
        "all columns" history file used by this experiment.
    """
    try:
        data_file_df = pd.read_pickle(hist_path)
        if dname not in data_file_df.index:
            # Only datasets already registered in the table are recorded.
            return
        data_file_df.at[dname, f'{clf_name}_train_scores'] = f'{train_scores}'
        data_file_df.at[dname, f'{clf_name}_test_scores'] = f'{test_scores}'
        data_file_df.to_pickle(hist_path)
    except Exception as e:
        # Best effort by design: report and carry on.
        print(f'Exception ({dname},{clf_name}): ', e)

clfs = {
    'gnb' : GaussianNB(),
    'lrc' : LogisticRegression(random_state=42, max_iter=10000),
    'rfc' : RandomForestClassifier(random_state=42, n_jobs=-1),
    'abc' : AdaBoostClassifier(random_state=42),
    'gbc': GradientBoostingClassifier(random_state=42),
    'xgb' : xgb.XGBClassifier(n_jobs=-1, random_state=42, use_label_encoder=False),
    'mlp' : make_pipeline(StandardScaler(),MLPClassifier(random_state=42)),
}

for dname, data in datasets.items():
    print(f'{dname}:')
    for clf_name,clf in clfs.items():
        print(f'  {clf_name}:  ',end='')
        %time clf.fit(data['X_train'], data['y_train'])
        train_scores = score_clf(clf, data['X_train'], data['y_train'])
        test_scores = score_clf(clf, data['X_test'], data['y_test'])
        add_scores(dname, clf_name, train_scores, test_scores)
        print('    Test Scores:', test_scores)
    print(' ')

20210806a:
  gnb:  Wall time: 793 ms
    Test Scores: {'f1': 0.397, 'pr': 0.3591, 'rc': 0.444}
 
  lrc:  Wall time: 2min 30s
    Test Scores: {'f1': 0.0226, 'pr': 0.3024, 'rc': 0.0117}
 
  rfc:  Wall time: 11.2 s
    Test Scores: {'f1': 0.1671, 'pr': 0.3644, 'rc': 0.1084}
 
  abc:  Wall time: 2min 27s
    Test Scores: {'f1': 0.0031, 'pr': 0.5263, 'rc': 0.0016}
 
  gbc:  Wall time: 12min 52s
    Test Scores: {'f1': 0.0006, 'pr': 0.1333, 'rc': 0.0003}
 
Wall time: 41.8 s
    Test Scores: {'f1': 0.1025, 'pr': 0.3536, 'rc': 0.0599}
 
  mlp:  Wall time: 4min 38s
    Test Scores: {'f1': 0.2724, 'pr': 0.3403, 'rc': 0.2272}
 
20210806b:
  gnb:  Wall time: 813 ms
    Test Scores: {'f1': 0.284, 'pr': 0.224, 'rc': 0.3879}
 
  lrc:  Wall time: 2min 31s
    Test Scores: {'f1': 0.0015, 'pr': 0.2143, 'rc': 0.0008}
 
  rfc:  Wall time: 13.1 s
    Test Scores: {'f1': 0.0109, 'pr': 0.275, 'rc': 0.0056}
 
  abc:  Wall time: 2min 30s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
 
  gbc:  Wall time: 

  gbc:  Wall time: 12min 49s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
 
Wall time: 39.7 s
    Test Scores: {'f1': 0.0412, 'pr': 0.2576, 'rc': 0.0224}
 
  mlp:  Wall time: 4min 34s
    Test Scores: {'f1': 0.1971, 'pr': 0.2663, 'rc': 0.1565}
 
20210806k:
  gnb:  Wall time: 800 ms
    Test Scores: {'f1': 0.2653, 'pr': 0.1894, 'rc': 0.4424}
 
  lrc:  Wall time: 4min 30s
    Test Scores: {'f1': 0.0023, 'pr': 0.129, 'rc': 0.0011}
 
  rfc:  Wall time: 12.8 s
    Test Scores: {'f1': 0.0028, 'pr': 0.0794, 'rc': 0.0014}
 
  abc:  Wall time: 2min 27s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
 
  gbc:  Wall time: 12min 40s
    Test Scores: {'f1': 0.0011, 'pr': 0.25, 'rc': 0.0006}
 
Wall time: 39.7 s
    Test Scores: {'f1': 0.0117, 'pr': 0.2121, 'rc': 0.006}
 
  mlp:  Wall time: 4min 35s
    Test Scores: {'f1': 0.1417, 'pr': 0.1858, 'rc': 0.1145}
 
20210806l:
  gnb:  Wall time: 802 ms
    Test Scores: {'f1': 0.3112, 'pr': 0.2573, 'rc': 0.3937}
 
  lrc:  Wall time: 2min 46s
    Te

# Limit to 3 lookbacks

In [11]:
import re

def get_columns(X_train,lookbacks):
    """Return the column names of ``X_train`` limited to ``lookbacks`` lags.

    A column whose name ends in ``_<digits>`` is treated as a lookback column
    and is dropped when that number is strictly greater than ``lookbacks``;
    a suffix equal to ``lookbacks`` is kept. Columns without such a suffix
    (and non-string column labels) are always kept.

    Parameters
    ----------
    X_train : object with a ``columns`` attribute (e.g. a DataFrame).
        Not modified.
    lookbacks : int
        Largest numeric suffix to keep.

    Returns
    -------
    list
        Kept column labels, in their original order.
    """
    suffix_re = re.compile(r'^.*_([0-9]+)$')

    def _keep(col):
        # Only string labels can carry a "_<n>" lookback suffix.
        m = suffix_re.match(col) if isinstance(col, str) else None
        return m is None or int(m[1]) <= lookbacks

    return [c for c in X_train.columns if _keep(c)]


def add_scores(dname, clf_name, train_scores, test_scores,
               hist_path='data/data_file_hist_some_cols.pkl'):
    """Record one classifier's train/test scores in a pickled history table.

    The table at ``hist_path`` is loaded, the cells
    ``[dname, '{clf_name}_train_scores']`` and
    ``[dname, '{clf_name}_test_scores']`` are set to the string form of the
    score dicts, and the table is written back to the same path.

    If ``dname`` is not in the table's index nothing is written. Any exception
    (including a missing history file) is printed rather than raised, so a
    failure to record scores does not abort a long training loop.

    Parameters
    ----------
    dname : label of the dataset row to update.
    clf_name : str
        Classifier key, used as the column-name prefix.
    train_scores, test_scores : objects stored via their ``str()`` form.
    hist_path : str, optional
        Pickle file holding the history table. Defaults to the
        "some columns" (limited lookbacks) history file.
    """
    try:
        data_file_df = pd.read_pickle(hist_path)
        if dname not in data_file_df.index:
            # Only datasets already registered in the table are recorded.
            return
        data_file_df.at[dname, f'{clf_name}_train_scores'] = f'{train_scores}'
        data_file_df.at[dname, f'{clf_name}_test_scores'] = f'{test_scores}'
        data_file_df.to_pickle(hist_path)
    except Exception as e:
        # Best effort by design: report and carry on.
        print(f'Exception ({dname},{clf_name}): ', e)

clfs = {
    'gnb' : GaussianNB(),
    'lrc' : LogisticRegression(random_state=42, max_iter=10000),
    'rfc' : RandomForestClassifier(random_state=42, n_jobs=-1),
    'abc' : AdaBoostClassifier(random_state=42),
    'gbc': GradientBoostingClassifier(random_state=42),
    'xgb' : xgb.XGBClassifier(n_jobs=-1, random_state=42, use_label_encoder=False),
    'mlp' : make_pipeline(StandardScaler(),MLPClassifier(random_state=42)),
}

for dname, data in datasets.items():
    print(f'{dname}:')
    
    columns = get_columns(data['X_train'],lookbacks=3)
    data['X_train'] = data['X_train'][columns]
    data['X_test'] = data['X_test'][columns]
    
    for clf_name,clf in clfs.items():
        print(f'  {clf_name}:  ',end='')
        %time clf.fit(data['X_train'], data['y_train'])
        train_scores = score_clf(clf, data['X_train'], data['y_train'])
        test_scores = score_clf(clf, data['X_test'], data['y_test'])
        add_scores(dname, clf_name, train_scores, test_scores)
        print('    Test Scores:', test_scores)
    print(' ')

20210806a:
  gnb:  Wall time: 219 ms
    Test Scores: {'f1': 0.3709, 'pr': 0.3565, 'rc': 0.3866}
  lrc:  Wall time: 33.6 s
    Test Scores: {'f1': 0.0068, 'pr': 0.2973, 'rc': 0.0034}
  rfc:  Wall time: 5.31 s
    Test Scores: {'f1': 0.1973, 'pr': 0.3709, 'rc': 0.1344}
  abc:  Wall time: 39.9 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  gbc:  Wall time: 3min 24s
    Test Scores: {'f1': 0.0003, 'pr': 0.5, 'rc': 0.0002}
Wall time: 11.1 s
    Test Scores: {'f1': 0.1308, 'pr': 0.3145, 'rc': 0.0826}
  mlp:  Wall time: 1min 47s
    Test Scores: {'f1': 0.1447, 'pr': 0.3541, 'rc': 0.0909}
 
20210806b:
  gnb:  Wall time: 213 ms
    Test Scores: {'f1': 0.2737, 'pr': 0.2234, 'rc': 0.3534}
  lrc:  Wall time: 43.9 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  rfc:  Wall time: 5.7 s
    Test Scores: {'f1': 0.0243, 'pr': 0.2825, 'rc': 0.0127}
  abc:  Wall time: 40 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  gbc:  Wall time: 3min 24s
    Test Scores: {'f1': 0.0, 'pr': 0.0,

Wall time: 11 s
    Test Scores: {'f1': 0.0561, 'pr': 0.3071, 'rc': 0.0309}
  mlp:  Wall time: 1min 46s
    Test Scores: {'f1': 0.1041, 'pr': 0.2862, 'rc': 0.0636}
 
20210806k:
  gnb:  Wall time: 243 ms
    Test Scores: {'f1': 0.1594, 'pr': 0.2136, 'rc': 0.1271}
  lrc:  Wall time: 46.8 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  rfc:  Wall time: 5.79 s
    Test Scores: {'f1': 0.0131, 'pr': 0.1481, 'rc': 0.0069}
  abc:  Wall time: 39.6 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  gbc:  Wall time: 3min 24s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
Wall time: 11 s
    Test Scores: {'f1': 0.0187, 'pr': 0.2394, 'rc': 0.0097}
  mlp:  Wall time: 1min 49s
    Test Scores: {'f1': 0.0551, 'pr': 0.1871, 'rc': 0.0323}
 
20210806l:
  gnb:  Wall time: 204 ms
    Test Scores: {'f1': 0.2909, 'pr': 0.2572, 'rc': 0.3348}
  lrc:  Wall time: 41.1 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  rfc:  Wall time: 5.85 s
    Test Scores: {'f1': 0.0187, 'pr': 0.2115, 'rc': 

# No lookbacks

In [13]:
def add_scores(dname, clf_name, train_scores, test_scores,
               hist_path='data/data_file_hist_no_lkbk.pkl'):
    """Record one classifier's train/test scores in a pickled history table.

    The table at ``hist_path`` is loaded, the cells
    ``[dname, '{clf_name}_train_scores']`` and
    ``[dname, '{clf_name}_test_scores']`` are set to the string form of the
    score dicts, and the table is written back to the same path.

    If ``dname`` is not in the table's index nothing is written. Any exception
    (including a missing history file) is printed rather than raised, so a
    failure to record scores does not abort a long training loop.

    Parameters
    ----------
    dname : label of the dataset row to update.
    clf_name : str
        Classifier key, used as the column-name prefix.
    train_scores, test_scores : objects stored via their ``str()`` form.
    hist_path : str, optional
        Pickle file holding the history table. Defaults to the
        "no lookbacks" history file.
    """
    try:
        data_file_df = pd.read_pickle(hist_path)
        if dname not in data_file_df.index:
            # Only datasets already registered in the table are recorded.
            return
        data_file_df.at[dname, f'{clf_name}_train_scores'] = f'{train_scores}'
        data_file_df.at[dname, f'{clf_name}_test_scores'] = f'{test_scores}'
        data_file_df.to_pickle(hist_path)
    except Exception as e:
        # Best effort by design: report and carry on.
        print(f'Exception ({dname},{clf_name}): ', e)

clfs = {
    'gnb' : GaussianNB(),
    'lrc' : LogisticRegression(random_state=42, max_iter=10000),
    'rfc' : RandomForestClassifier(random_state=42, n_jobs=-1),
    'abc' : AdaBoostClassifier(random_state=42),
    'gbc': GradientBoostingClassifier(random_state=42),
    'xgb' : xgb.XGBClassifier(n_jobs=-1, random_state=42, use_label_encoder=False),
    'mlp' : make_pipeline(StandardScaler(),MLPClassifier(random_state=42)),
}

for dname, data in datasets.items():
    print(f'{dname}:')
    
    columns = get_columns(data['X_train'],lookbacks=0)
    data['X_train'] = data['X_train'][columns]
    data['X_test'] = data['X_test'][columns]
    
    for clf_name,clf in clfs.items():
        print(f'  {clf_name}:  ',end='')
        %time clf.fit(data['X_train'], data['y_train'])
        train_scores = score_clf(clf, data['X_train'], data['y_train'])
        test_scores = score_clf(clf, data['X_test'], data['y_test'])
        add_scores(dname, clf_name, train_scores, test_scores)
        print('    Test Scores:', test_scores)
    print(' ')

20210806a:
  gnb:  Wall time: 58.9 ms
    Test Scores: {'f1': 0.0217, 'pr': 0.3077, 'rc': 0.0113}
  lrc:  Wall time: 655 ms
    Test Scores: {'f1': 0.0016, 'pr': 0.2083, 'rc': 0.0008}
  rfc:  Wall time: 2.46 s
    Test Scores: {'f1': 0.211, 'pr': 0.3804, 'rc': 0.146}
  abc:  Wall time: 10.8 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  gbc:  Wall time: 52.8 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
Wall time: 4.01 s
    Test Scores: {'f1': 0.1685, 'pr': 0.3605, 'rc': 0.11}
  mlp:  Wall time: 39 s
    Test Scores: {'f1': 0.1116, 'pr': 0.4145, 'rc': 0.0645}
 
20210806b:
  gnb:  Wall time: 56.8 ms
    Test Scores: {'f1': 0.0148, 'pr': 0.2586, 'rc': 0.0076}
  lrc:  Wall time: 655 ms
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  rfc:  Wall time: 2.67 s
    Test Scores: {'f1': 0.038, 'pr': 0.3008, 'rc': 0.0203}
  abc:  Wall time: 10.7 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  gbc:  Wall time: 53 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
Wall

Wall time: 3.95 s
    Test Scores: {'f1': 0.0632, 'pr': 0.252, 'rc': 0.0362}
  mlp:  Wall time: 1min 4s
    Test Scores: {'f1': 0.0425, 'pr': 0.354, 'rc': 0.0226}
 
20210806k:
  gnb:  Wall time: 53.9 ms
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  lrc:  Wall time: 2.17 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  rfc:  Wall time: 2.66 s
    Test Scores: {'f1': 0.0302, 'pr': 0.2127, 'rc': 0.0163}
  abc:  Wall time: 10.6 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  gbc:  Wall time: 52.4 s
    Test Scores: {'f1': 0.0006, 'pr': 1.0, 'rc': 0.0003}
Wall time: 3.89 s
    Test Scores: {'f1': 0.0236, 'pr': 0.1442, 'rc': 0.0129}
  mlp:  Wall time: 38.5 s
    Test Scores: {'f1': 0.0011, 'pr': 0.1818, 'rc': 0.0006}
 
20210806l:
  gnb:  Wall time: 54.9 ms
    Test Scores: {'f1': 0.0152, 'pr': 0.2917, 'rc': 0.0078}
  lrc:  Wall time: 1.74 s
    Test Scores: {'f1': 0.0, 'pr': 0.0, 'rc': 0.0}
  rfc:  Wall time: 2.76 s
    Test Scores: {'f1': 0.0485, 'pr': 0.2874, 'rc': 0.02