In [1]:
import pandas as pd
import numpy as np
import json

# Let displayed frames show up to 500 columns / rows before pandas truncates them.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Configuration code -> base name of the pickled history frame under data/.
data_hist_files = {'A': 'data_file_hist_all_cols',
                   'B': 'data_file_hist_some_cols',
                   'C': 'data_file_hist_no_lkbk'}

# One record per (configuration, dataset, classifier), filled by the loop below.
data = []
for c, fname in data_hist_files.items():
    df_hist = pd.read_pickle(f'data/{fname}.pkl')
    test_cols = [col for col in df_hist.columns if col.endswith('test_scores')]
    other_cols = ['pos_labels', 'sim_num_trades', 'sim_max_profit']
    df = df_hist[other_cols + test_cols].copy()

    # calculate true positives and false positives
    for i, r in df.iterrows():
        # Read the fields by label. Integer keys on a label-indexed Series
        # (r[0], r[1], r[2]) are deprecated positional access in pandas.
        relevant_elements = r['pos_labels']
        trades = r['sim_num_trades']
        profit = r['sim_max_profit']
        for m in test_cols:
            if pd.isnull(r[m]):
                continue
            # Scores are stored as the text of a Python dict; swapping the
            # quote style makes that text parseable as JSON.
            d = json.loads(r[m].replace("'", '"'))
            pr = d['pr']
            rc = d['rc']
            # tp = recall * number of positives.
            # NOTE(review): pos_labels looks like the positive count of the whole
            # dataset while rc comes from the test-split scores - confirm that tp
            # is meant as an extrapolation rather than a test-set count.
            tp = int(rc * relevant_elements)
            # From precision = tp / (tp + fp); undefined when precision is 0.
            fp = int(tp / pr - tp) if pr != 0 else 0
            ratio = round(tp / fp, 2) if fp > 0 else 0
            # Cannot act on more trades than there are true positives.
            act_trades = min(trades, tp)
            row = dict(c=c,
                       dataset=i,
                       classifer=m,
                       tp=tp,
                       fp=fp,
                       diff=tp - fp,
                       ratio=ratio,
                       pr=pr,
                       rc=rc,
                       pf=d.get('pf', 0),
                       # Average simulated profit per trade, scaled to the trades actually taken.
                       est_max_profit=(profit / trades) * act_trades if trades > 0 else 0
                       )
            data.append(row)
df = pd.DataFrame(data).set_index(['c','dataset','classifer'])

In [18]:
# Rank every row on each criterion (rank 1 = largest value):
#   pr    - highest precision
#   diff  - highest tp-fp difference
#   ratio - highest tp/fp ratio (similar to precision)
# tp (not a good indicator) and est_max_profit were considered as extra
# criteria but are deliberately left out of the score.
rank1, rank2, rank3 = (df[col].rank(ascending=False) for col in ('pr', 'diff', 'ratio'))
ranks = [rank1, rank2, rank3]

# Score = average of the individual ranks; lower is better.
df['score'] = sum(ranks) / len(ranks)
df.sort_values(by='score').head(11)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tp,fp,diff,ratio,pr,rc,pf,est_max_profit,score
c,dataset,classifer,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
C,20210806i,lrc_test_scores,453,277,176,1.64,0.6204,0.0082,1.321,4316.470906,3.666667
A,20210806g,abc_test_scores,98,13,85,7.54,0.8824,0.0031,1.0243,288.939732,4.0
B,20210806i,lrc_test_scores,840,631,209,1.33,0.5708,0.0152,1.6651,8004.052011,5.0
A,20210806g,rfc_test_scores,2075,1856,219,1.12,0.5278,0.0656,1.2348,1084.998178,6.0
A,20210806i,lrc_test_scores,1731,1535,196,1.13,0.5299,0.0313,1.0503,9843.078247,6.0
C,20210806d,lrc_test_scores,40,26,14,1.54,0.6,0.0009,1.0045,614.999094,6.166667
C,20210806d,gbc_test_scores,58,38,20,1.53,0.6,0.0013,1.0126,891.748687,6.166667
B,20210806i,gbc_test_scores,2710,2460,250,1.1,0.5241,0.049,0.6239,9843.078247,7.0
A,20210806i,gbc_test_scores,1919,1770,149,1.08,0.5201,0.0347,0.84,9843.078247,9.333333
A,20210806a,abc_test_scores,60,54,6,1.11,0.5263,0.0016,0.9951,673.231226,9.833333


In [22]:
# Same ranking as a flat table: index levels become ordinary columns and only
# the listed columns are shown.
summary_cols = ['classifer', 'dataset', 'c', 'tp', 'fp', 'diff', 'ratio', 'pr', 'rc', 'pf']
ranked = df.sort_values(by='score').reset_index()
ranked[summary_cols].head(11)

Unnamed: 0,classifer,dataset,c,tp,fp,diff,ratio,pr,rc,pf
0,lrc_test_scores,20210806i,C,453,277,176,1.64,0.6204,0.0082,1.321
1,abc_test_scores,20210806g,A,98,13,85,7.54,0.8824,0.0031,1.0243
2,lrc_test_scores,20210806i,B,840,631,209,1.33,0.5708,0.0152,1.6651
3,rfc_test_scores,20210806g,A,2075,1856,219,1.12,0.5278,0.0656,1.2348
4,lrc_test_scores,20210806i,A,1731,1535,196,1.13,0.5299,0.0313,1.0503
5,lrc_test_scores,20210806d,C,40,26,14,1.54,0.6,0.0009,1.0045
6,gbc_test_scores,20210806d,C,58,38,20,1.53,0.6,0.0013,1.0126
7,gbc_test_scores,20210806i,B,2710,2460,250,1.1,0.5241,0.049,0.6239
8,gbc_test_scores,20210806i,A,1919,1770,149,1.08,0.5201,0.0347,0.84
9,abc_test_scores,20210806a,A,60,54,6,1.11,0.5263,0.0016,0.9951


In [21]:
# Order all rows by the pf column, largest first.
by_pf = df.sort_values(by='pf', ascending=False)
by_pf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tp,fp,diff,ratio,pr,rc,pf,est_max_profit,score
c,dataset,classifer,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
B,20210806i,lrc_test_scores,840,631,209,1.33,0.5708,0.0152,1.6651,8004.052011,5.0
C,20210806e,mlp_test_scores,156,400,-244,0.39,0.2805,0.0092,1.3758,193.980465,112.0
C,20210806i,lrc_test_scores,453,277,176,1.64,0.6204,0.0082,1.321,4316.470906,3.666667
A,20210806d,lrc_test_scores,308,393,-85,0.78,0.4393,0.0069,1.2961,4735.493026,54.666667
A,20210806h,rfc_test_scores,813,1167,-354,0.7,0.4106,0.0307,1.2911,1295.216356,72.333333
B,20210806g,rfc_test_scores,3287,3631,-344,0.91,0.4751,0.1039,1.2846,1084.998178,55.333333
B,20210806d,lrc_test_scores,98,124,-26,0.79,0.4412,0.0022,1.2377,1506.747781,49.833333
A,20210806g,rfc_test_scores,2075,1856,219,1.12,0.5278,0.0656,1.2348,1084.998178,6.0
C,20210806g,mlp_test_scores,1452,2194,-742,0.66,0.3982,0.0459,1.2196,1084.998178,85.333333
C,20210806e,gnb_test_scores,88,385,-297,0.23,0.1857,0.0052,1.1821,109.424878,151.333333


In [15]:
# df[df.pf>1].sort_values(by='pf', ascending=False)

# Rank column -> metric it ranks (rank 1 = largest value of that metric).
rank_sources = {
    'rank1': 'diff',            # Highest tp-fp difference
    'rank2': 'pr',              # Highest precision
    'rank3': 'ratio',           # Highest tp/fp ratio
    'rank4': 'tp',              # Highest true positives
    'rank5': 'est_max_profit',  # Highest estimated max profit
}
for rank_col, metric in rank_sources.items():
    df[rank_col] = df[metric].rank(ascending=False)

# Summary statistics for rows with pf above 1, then for the remaining rows.
above_one = df[df.pf > 1]
at_most_one = df[df.pf <= 1]
display(above_one.describe())
display(at_most_one.describe())

Unnamed: 0,tp,fp,diff,ratio,pr,rc,pf,est_max_profit,score,rank1,rank2,rank3,rank4,rank5
count,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0,48.0
mean,557.854167,823.979167,-266.125,0.676458,0.356054,0.019,1.096977,1016.07208,94.652778,95.822917,90.46875,97.666667,137.770833,140.427083
std,861.602268,1155.39494,439.462177,1.092955,0.223421,0.027627,0.131353,1936.675398,51.086607,54.536781,70.961063,73.1483,44.980842,44.380127
min,0.0,0.0,-1509.0,0.0,0.0,0.0,1.0007,0.0,3.666667,2.0,1.5,1.0,56.0,42.0
25%,24.75,38.75,-345.0,0.23,0.206225,0.000875,1.012375,75.059917,59.458333,46.5,29.75,36.125,94.75,111.5
50%,103.0,279.0,-90.5,0.495,0.335,0.0044,1.0426,276.475821,90.0,104.5,79.5,81.0,148.25,150.5
75%,819.75,1259.0,0.0,0.7275,0.441425,0.03085,1.123975,1084.998178,139.208333,132.5,157.625,162.0,163.875,168.875
max,3309.0,4687.0,219.0,7.54,1.0,0.1046,1.6651,9843.078247,169.333333,180.0,220.5,219.5,220.5,220.5


Unnamed: 0,tp,fp,diff,ratio,pr,rc,pf,est_max_profit,score,rank1,rank2,rank3,rank4,rank5
count,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0
mean,3170.313725,6957.441176,-3787.127451,0.332451,0.215034,0.091053,0.718459,4097.526939,133.993464,133.718137,134.977941,133.284314,123.848039,123.223039
std,5623.007219,12199.495035,7774.428332,0.28932,0.163196,0.15282,0.330087,5688.15873,39.718288,74.079376,70.117904,70.487169,77.174848,77.092312
min,0.0,0.0,-42257.0,0.0,0.0,0.0,0.0183,0.0,7.0,1.0,10.0,8.0,1.0,6.5
25%,0.0,0.0,-2910.75,0.0,0.0,0.0,0.468525,0.0,107.416667,46.5,73.75,69.5,51.75,52.75
50%,312.5,819.0,-415.0,0.33,0.25,0.01205,0.89745,798.99396,150.25,140.5,134.5,133.0,119.5,120.5
75%,3700.5,7481.75,0.0,0.55,0.3533,0.110825,0.9952,8443.79499,162.166667,201.25,220.5,219.5,220.5,220.5
max,30871.0,50531.0,250.0,1.11,0.5263,0.5615,1.0,17342.974458,204.666667,252.0,220.5,219.5,220.5,220.5


In [4]:
# Display the raw history frame left in df_hist by the loading loop
# (the last file it read, assuming that cell was the most recent to set it - confirm).
df_hist

Unnamed: 0_level_0,use_atr,ratio,reverse,window,length,pos_labels,imbalance,train_imbal,test_imbal,sim_init_val,sim_fee_buy,sim_fee_sell,sim_fee_per,sim_num_trades,sim_max_profit,sim_bad_trades,gnb_train_scores,gnb_test_scores,lrc_train_scores,lrc_test_scores,rfc_train_scores,rfc_test_scores,abc_train_scores,abc_test_scores,gbc_train_scores,gbc_test_scores,xgb_train_scores,xgb_test_scores,mlp_train_scores,mlp_test_scores
suffix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
20210806a,False,"(0.01, 0.005)",True,30,141174,37979,0.269023,0.262033,0.319201,1.0,0.0,0.0,0.1,1179.0,13228.9936,0.0,"{'f1': 0.0377, 'pr': 0.3764, 'rc': 0.0198, 'pf...","{'f1': 0.0217, 'pr': 0.3077, 'rc': 0.0113, 'pf...","{'f1': 0.0001, 'pr': 0.1818, 'rc': 0.0001, 'pf...","{'f1': 0.0016, 'pr': 0.2083, 'rc': 0.0008, 'pf...","{'f1': 1.0, 'pr': 1.0, 'rc': 1.0, 'pf': 109462...","{'f1': 0.211, 'pr': 0.3804, 'rc': 0.146, 'pf':...","{'f1': 0.0096, 'pr': 0.6683, 'rc': 0.0048, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.0153, 'pr': 0.9636, 'rc': 0.0077, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.588, 'pr': 0.9309, 'rc': 0.4298, 'pf'...","{'f1': 0.1685, 'pr': 0.3605, 'rc': 0.11, 'pf':...","{'f1': 0.1603, 'pr': 0.5985, 'rc': 0.0926, 'pf...","{'f1': 0.1116, 'pr': 0.4145, 'rc': 0.0645, 'pf..."
20210806b,False,"(0.01, 0.0025)",True,30,141174,23782,0.168459,0.169634,0.196954,1.0,0.0,0.0,0.1,1093.0,6620.481356,0.0,"{'f1': 0.0203, 'pr': 0.2266, 'rc': 0.0106, 'pf...","{'f1': 0.0148, 'pr': 0.2586, 'rc': 0.0076, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 0.9955}","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 1.0, 'pr': 1.0, 'rc': 1.0, 'pf': 248910...","{'f1': 0.038, 'pr': 0.3008, 'rc': 0.0203, 'pf'...","{'f1': 0.0035, 'pr': 1.0, 'rc': 0.0017, 'pf': ...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.0044, 'pr': 0.975, 'rc': 0.0022, 'pf'...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.3172, 'pr': 0.9575, 'rc': 0.1901, 'pf...","{'f1': 0.0153, 'pr': 0.265, 'rc': 0.0079, 'pf'...","{'f1': 0.0469, 'pr': 0.5709, 'rc': 0.0245, 'pf...","{'f1': 0.0255, 'pr': 0.4031, 'rc': 0.0132, 'pf..."
20210806c,False,"(0.0075, 0.0025)",True,30,141174,29824,0.211257,0.220094,0.233808,1.0,0.0,0.0,0.1,1456.0,3126.592203,0.0,"{'f1': 0.2332, 'pr': 0.2506, 'rc': 0.218, 'pf'...","{'f1': 0.182, 'pr': 0.2573, 'rc': 0.1408, 'pf'...","{'f1': 0.0001, 'pr': 0.5, 'rc': 0.0, 'pf': 1.001}","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 1.0, 'pr': 1.0, 'rc': 1.0, 'pf': 251641...","{'f1': 0.042, 'pr': 0.3302, 'rc': 0.0224, 'pf'...","{'f1': 0.005, 'pr': 0.8169, 'rc': 0.0025, 'pf'...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.0063, 'pr': 0.9481, 'rc': 0.0032, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.3233, 'pr': 0.9507, 'rc': 0.1948, 'pf...","{'f1': 0.0222, 'pr': 0.2857, 'rc': 0.0115, 'pf...","{'f1': 0.032, 'pr': 0.654, 'rc': 0.0164, 'pf':...","{'f1': 0.0117, 'pr': 0.28, 'rc': 0.006, 'pf': ..."
20210806d,True,"(2, 1)",True,15,141174,44769,0.317119,0.317522,0.341973,1.0,0.0,0.0,0.1,1128.0,17342.974458,1.0,"{'f1': 0.0816, 'pr': 0.3498, 'rc': 0.0462, 'pf...","{'f1': 0.0753, 'pr': 0.3689, 'rc': 0.0419, 'pf...","{'f1': 0.0002, 'pr': 0.2308, 'rc': 0.0001, 'pf...","{'f1': 0.0017, 'pr': 0.6, 'rc': 0.0009, 'pf': ...","{'f1': 1.0, 'pr': 1.0, 'rc': 1.0, 'pf': 185098...","{'f1': 0.1975, 'pr': 0.3904, 'rc': 0.1322, 'pf...","{'f1': 0.0081, 'pr': 0.6476, 'rc': 0.0041, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.0229, 'pr': 0.8178, 'rc': 0.0116, 'pf...","{'f1': 0.0026, 'pr': 0.6, 'rc': 0.0013, 'pf': ...","{'f1': 0.534, 'pr': 0.8965, 'rc': 0.3802, 'pf'...","{'f1': 0.1707, 'pr': 0.3983, 'rc': 0.1086, 'pf...","{'f1': 0.219, 'pr': 0.5916, 'rc': 0.1344, 'pf'...","{'f1': 0.1145, 'pr': 0.4266, 'rc': 0.0662, 'pf..."
20210806e,True,"(4, 2)",True,15,141174,17024,0.120589,0.12343,0.124245,1.0,0.0,0.0,0.1,335.0,416.560613,0.0,"{'f1': 0.0139, 'pr': 0.1601, 'rc': 0.0073, 'pf...","{'f1': 0.0102, 'pr': 0.1857, 'rc': 0.0052, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 0.8928}","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 1.0, 'pr': 1.0, 'rc': 1.0, 'pf': 331591...","{'f1': 0.0125, 'pr': 0.2222, 'rc': 0.0064, 'pf...","{'f1': 0.0017, 'pr': 0.7857, 'rc': 0.0009, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.0111, 'pr': 0.9863, 'rc': 0.0056, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.523, 'pr': 0.9794, 'rc': 0.3568, 'pf'...","{'f1': 0.0627, 'pr': 0.4421, 'rc': 0.0338, 'pf...","{'f1': 0.0607, 'pr': 0.7057, 'rc': 0.0317, 'pf...","{'f1': 0.0179, 'pr': 0.2805, 'rc': 0.0092, 'pf..."
20210806f,True,"(4, 1)",True,15,141174,15304,0.108405,0.11071,0.111111,1.0,0.0,0.0,0.1,340.0,467.52109,1.0,"{'f1': 0.0323, 'pr': 0.1545, 'rc': 0.018, 'pf'...","{'f1': 0.0314, 'pr': 0.1054, 'rc': 0.0184, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 0.9223}","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 1.0, 'pr': 1.0, 'rc': 1.0, 'pf': 337728...","{'f1': 0.0071, 'pr': 0.2667, 'rc': 0.0036, 'pf...","{'f1': 0.0074, 'pr': 0.8958, 'rc': 0.0037, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.011, 'pr': 1.0, 'rc': 0.0055, 'pf': 1...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.5257, 'pr': 0.9723, 'rc': 0.3602, 'pf...","{'f1': 0.0731, 'pr': 0.3434, 'rc': 0.0409, 'pf...","{'f1': 0.0281, 'pr': 0.7345, 'rc': 0.0143, 'pf...","{'f1': 0.007, 'pr': 0.1633, 'rc': 0.0036, 'pf'..."
20210806g,True,"(4, 2)",True,30,141174,31640,0.224121,0.227442,0.238302,1.0,0.0,0.0,0.1,368.0,1084.998178,0.0,"{'f1': 0.0987, 'pr': 0.3374, 'rc': 0.0578, 'pf...","{'f1': 0.0609, 'pr': 0.3306, 'rc': 0.0335, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 0.943}","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 1.0, 'pr': 1.0, 'rc': 0.9999, 'pf': 798...","{'f1': 0.1669, 'pr': 0.4138, 'rc': 0.1046, 'pf...","{'f1': 0.0065, 'pr': 0.8298, 'rc': 0.0033, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.0245, 'pr': 0.9338, 'rc': 0.0124, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 0.9763}","{'f1': 0.6495, 'pr': 0.9486, 'rc': 0.4939, 'pf...","{'f1': 0.1447, 'pr': 0.4017, 'rc': 0.0882, 'pf...","{'f1': 0.145, 'pr': 0.6702, 'rc': 0.0813, 'pf'...","{'f1': 0.0823, 'pr': 0.3982, 'rc': 0.0459, 'pf..."
20210806h,True,"(4, 1)",True,30,141174,26488,0.187627,0.189435,0.201498,1.0,0.0,0.0,0.1,376.0,1295.216356,2.0,"{'f1': 0.0657, 'pr': 0.2465, 'rc': 0.0379, 'pf...","{'f1': 0.0455, 'pr': 0.2282, 'rc': 0.0253, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 0.9473}","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 0.992}","{'f1': 0.9999, 'pr': 1.0, 'rc': 0.9999, 'pf': ...","{'f1': 0.1002, 'pr': 0.401, 'rc': 0.0572, 'pf'...","{'f1': 0.0044, 'pr': 0.6111, 'rc': 0.0022, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.0151, 'pr': 0.9497, 'rc': 0.0076, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 0.9937}","{'f1': 0.5398, 'pr': 0.9534, 'rc': 0.3765, 'pf...","{'f1': 0.0986, 'pr': 0.3367, 'rc': 0.0577, 'pf...","{'f1': 0.081, 'pr': 0.6743, 'rc': 0.0431, 'pf'...","{'f1': 0.0588, 'pr': 0.3686, 'rc': 0.032, 'pf'..."
20210806i,True,"(2, 1)",True,30,141174,55315,0.391821,0.39199,0.409738,1.0,0.0,0.0,0.1,1033.0,9843.078247,2.0,"{'f1': 0.3127, 'pr': 0.4508, 'rc': 0.2393, 'pf...","{'f1': 0.2811, 'pr': 0.4655, 'rc': 0.2013, 'pf...","{'f1': 0.0047, 'pr': 0.5079, 'rc': 0.0023, 'pf...","{'f1': 0.0161, 'pr': 0.6204, 'rc': 0.0082, 'pf...","{'f1': 1.0, 'pr': 1.0, 'rc': 1.0, 'pf': 471466...","{'f1': 0.33, 'pr': 0.4342, 'rc': 0.2662, 'pf':...","{'f1': 0.1194, 'pr': 0.5591, 'rc': 0.0668, 'pf...","{'f1': 0.1327, 'pr': 0.4781, 'rc': 0.077, 'pf'...","{'f1': 0.1417, 'pr': 0.6978, 'rc': 0.0788, 'pf...","{'f1': 0.1138, 'pr': 0.4925, 'rc': 0.0644, 'pf...","{'f1': 0.69, 'pr': 0.8645, 'rc': 0.5741, 'pf':...","{'f1': 0.2855, 'pr': 0.4452, 'rc': 0.2101, 'pf...","{'f1': 0.4311, 'pr': 0.5688, 'rc': 0.3471, 'pf...","{'f1': 0.2606, 'pr': 0.4477, 'rc': 0.1838, 'pf..."
20210806j,False,"(0.01, 0.005)",True,15,141174,29035,0.205668,0.187259,0.265218,1.0,0.0,0.0,0.1,1199.0,15539.693313,0.0,"{'f1': 0.0933, 'pr': 0.2354, 'rc': 0.0582, 'pf...","{'f1': 0.1299, 'pr': 0.2583, 'rc': 0.0868, 'pf...","{'f1': 0.0004, 'pr': 0.0597, 'rc': 0.0002, 'pf...","{'f1': 0.0037, 'pr': 0.0885, 'rc': 0.0019, 'pf...","{'f1': 1.0, 'pr': 1.0, 'rc': 1.0, 'pf': 708751...","{'f1': 0.0935, 'pr': 0.2588, 'rc': 0.0571, 'pf...","{'f1': 0.0037, 'pr': 0.9231, 'rc': 0.0018, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.0121, 'pr': 0.9675, 'rc': 0.0061, 'pf...","{'f1': 0.0, 'pr': 0.0, 'rc': 0.0, 'pf': 1}","{'f1': 0.4201, 'pr': 0.9469, 'rc': 0.2699, 'pf...","{'f1': 0.0632, 'pr': 0.252, 'rc': 0.0362, 'pf'...","{'f1': 0.1104, 'pr': 0.6179, 'rc': 0.0606, 'pf...","{'f1': 0.0425, 'pr': 0.354, 'rc': 0.0226, 'pf'..."


# Build Best Performers

In [5]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

import pickle
import datetime
import re

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

def train_test_split(X, y, train_idx=None, test_idx=None):
    """Select the train and test rows of X and y by index label.

    Both X and y are indexed with ``.loc`` using the same labels.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.
    """
    train_part = (X.loc[train_idx], y.loc[train_idx])
    test_part = (X.loc[test_idx], y.loc[test_idx])
    return (*train_part, *test_part)

def load_split_data(suffix, split=False):
    """Load the pickled features and labels for one dataset suffix.

    Reads ``data/X_{suffix}.pkl`` and ``data/y_{suffix}.pkl``.

    Returns:
        ``(X, y)`` when ``split`` is false; otherwise
        ``(X_train, y_train, X_test, y_test)``, where the train rows are the
        index slice '2018' through '2020' and the test rows are '2021' onward.
    """
    X = pd.read_pickle(f'data/X_{suffix}.pkl')
    y = pd.read_pickle(f'data/y_{suffix}.pkl')
    if not split:
        return X, y

    train_index = X.loc['2018':'2020'].index
    test_index = X.loc['2021':].index
    X_train, y_train, X_test, y_test = train_test_split(X, y, train_index, test_index)
    return X_train, y_train, X_test, y_test

def get_columns(X,lookbacks):
    """Return X's column names, minus those whose lookback suffix exceeds ``lookbacks``.

    A column is treated as a lookback column when its name ends in
    ``_<digits>``. It is dropped only when that number is strictly greater
    than ``lookbacks``; a suffix equal to ``lookbacks`` is kept. Columns
    without a numeric suffix are always kept. Original column order is preserved.

    Args:
        X: DataFrame whose column names are inspected.
        lookbacks: Largest numeric suffix to keep.

    Returns:
        List of the retained column names.
    """
    suffix_pattern = re.compile(r'^.*_([0-9]+)$')

    def within_limit(name):
        found = suffix_pattern.match(name)
        return found is None or int(found[1]) <= lookbacks

    # Single pass instead of repeated list.remove(), which rescans the list each time.
    return [name for name in X.columns if within_limit(name)]

# Seed shared by every estimator below that accepts a random_state.
SEED = 42

# Short classifier key -> unfitted estimator.
clfs = dict(
    gnb=GaussianNB(),
    lrc=LogisticRegression(random_state=SEED, max_iter=10000),
    rfc=RandomForestClassifier(random_state=SEED, n_jobs=-1),
    abc=AdaBoostClassifier(random_state=SEED),
    gbc=GradientBoostingClassifier(random_state=SEED),
    xgb=xgb.XGBClassifier(n_jobs=-1, random_state=SEED, use_label_encoder=False),
    # The MLP is the only entry wrapped in a pipeline that standardizes features first.
    mlp=make_pipeline(StandardScaler(), MLPClassifier(random_state=SEED)),
)

# Column-set code -> lookback limit passed to get_columns.
numlkb_dict = dict(A=15, B=3, C=0)

In [6]:
# Which dataset, column set and classifier this cell builds.
dataset = '20210806i'
ncols = 'A'
cname = 'abc'

print('Fitting model...')
X_train, y_train, X_test, y_test = load_split_data(dataset, split=True)
# Keep only the feature columns allowed by this column set's lookback limit.
columns = get_columns(X_train, numlkb_dict[ncols])
clfs[cname].fit(X_train[columns],y_train)

# The file name records classifier, column set, dataset and build time (yymmddHHMM).
timestamp = datetime.datetime.now().strftime('%y%m%d%H%M')
filename = f"models/nm_{cname}_{ncols}_{dataset}_{timestamp}.pkl"
print(f'Saving model to {filename}...')
# The with-block closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(clfs[cname], model_file)

Fitting model...


KeyboardInterrupt: 

In [None]:
# Which dataset, column set and classifier this cell builds.
dataset = '20210806i'
ncols = 'B'
cname = 'gbc'

print('Fitting model...')
X_train, y_train, X_test, y_test = load_split_data(dataset, split=True)
# Keep only the feature columns allowed by this column set's lookback limit.
columns = get_columns(X_train, numlkb_dict[ncols])
clfs[cname].fit(X_train[columns],y_train)

# The file name records classifier, column set, dataset and build time (yymmddHHMM).
timestamp = datetime.datetime.now().strftime('%y%m%d%H%M')
filename = f"models/nm_{cname}_{ncols}_{dataset}_{timestamp}.pkl"
print(f'Saving model to {filename}...')
# The with-block closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(clfs[cname], model_file)

In [None]:
# Which dataset, column set and classifier this cell builds.
dataset = '20210806i'
ncols = 'A'
cname = 'lrc'

print('Fitting model...')
X_train, y_train, X_test, y_test = load_split_data(dataset, split=True)
# Keep only the feature columns allowed by this column set's lookback limit.
columns = get_columns(X_train, numlkb_dict[ncols])
clfs[cname].fit(X_train[columns],y_train)

# The file name records classifier, column set, dataset and build time (yymmddHHMM).
timestamp = datetime.datetime.now().strftime('%y%m%d%H%M')
filename = f"models/nm_{cname}_{ncols}_{dataset}_{timestamp}.pkl"
print(f'Saving model to {filename}...')
# The with-block closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(clfs[cname], model_file)

In [None]:
# Which dataset, column set and classifier this cell builds.
dataset = '20210806i'
ncols = 'A'
cname = 'gbc'

print('Fitting model...')
X_train, y_train, X_test, y_test = load_split_data(dataset, split=True)
# Keep only the feature columns allowed by this column set's lookback limit.
columns = get_columns(X_train, numlkb_dict[ncols])
clfs[cname].fit(X_train[columns],y_train)

# The file name records classifier, column set, dataset and build time (yymmddHHMM).
timestamp = datetime.datetime.now().strftime('%y%m%d%H%M')
filename = f"models/nm_{cname}_{ncols}_{dataset}_{timestamp}.pkl"
print(f'Saving model to {filename}...')
# The with-block closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(clfs[cname], model_file)

In [None]:
# Which dataset, column set and classifier this cell builds.
dataset = '20210806i'
ncols = 'B'
cname = 'lrc'

print('Fitting model...')
X_train, y_train, X_test, y_test = load_split_data(dataset, split=True)
# Keep only the feature columns allowed by this column set's lookback limit.
columns = get_columns(X_train, numlkb_dict[ncols])
clfs[cname].fit(X_train[columns],y_train)

# The file name records classifier, column set, dataset and build time (yymmddHHMM).
timestamp = datetime.datetime.now().strftime('%y%m%d%H%M')
filename = f"models/nm_{cname}_{ncols}_{dataset}_{timestamp}.pkl"
print(f'Saving model to {filename}...')
# The with-block closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(clfs[cname], model_file)

In [None]:
# Which dataset, column set and classifier this cell builds.
dataset = '20210806i'
ncols = 'C'
cname = 'lrc'

print('Fitting model...')
X_train, y_train, X_test, y_test = load_split_data(dataset, split=True)
# Keep only the feature columns allowed by this column set's lookback limit.
columns = get_columns(X_train, numlkb_dict[ncols])
clfs[cname].fit(X_train[columns],y_train)

# The file name records classifier, column set, dataset and build time (yymmddHHMM).
timestamp = datetime.datetime.now().strftime('%y%m%d%H%M')
filename = f"models/nm_{cname}_{ncols}_{dataset}_{timestamp}.pkl"
print(f'Saving model to {filename}...')
# The with-block closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(clfs[cname], model_file)

In [None]:
# Which dataset, column set and classifier this cell builds.
dataset = '20210806g'
ncols = 'A'
cname = 'rfc'

print('Fitting model...')
X_train, y_train, X_test, y_test = load_split_data(dataset, split=True)
# Keep only the feature columns allowed by this column set's lookback limit.
columns = get_columns(X_train, numlkb_dict[ncols])
clfs[cname].fit(X_train[columns],y_train)

# The file name records classifier, column set, dataset and build time (yymmddHHMM).
timestamp = datetime.datetime.now().strftime('%y%m%d%H%M')
filename = f"models/nm_{cname}_{ncols}_{dataset}_{timestamp}.pkl"
print(f'Saving model to {filename}...')
# The with-block closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(clfs[cname], model_file)

# Validate Built Models

In [None]:
# Pickled models to validate. The paths follow the pattern written by the
# training cells: models/nm_<classifier>_<column set>_<dataset>_<timestamp>.pkl
models = [
    'models/nm_abc_A_20210806i_2108100754.pkl',
    'models/nm_gbc_B_20210806i_2108100759.pkl',
    'models/nm_lrc_A_20210806i_2108100800.pkl',
    'models/nm_gbc_A_20210806i_2108100813.pkl',
    'models/nm_lrc_B_20210806i_2108100814.pkl',
    'models/nm_lrc_C_20210806i_2108100814.pkl',
    'models/nm_rfc_A_20210806g_2108100814.pkl',
]

# Column-set code -> lookback limit passed to get_columns.
# NOTE(review): this repeats the mapping defined alongside the classifiers
# earlier in the notebook; the two must stay identical.
numlkb_dict = {'A': 15, 
               'B': 3,
               'C': 0}

def score_clf(clf, X, y):
    """Score a fitted classifier's predictions on X against the labels y.

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: Feature rows to predict on.
        y: True labels for those rows.

    Returns:
        Dict with keys 'f1', 'pr' (precision) and 'rc' (recall), each rounded
        to 4 decimals.
    """
    # Predict once and reuse: the original ran predict() separately for each metric.
    y_pred = clf.predict(X)
    return {
        'f1': round(f1_score(y, y_pred), 4),
        'pr': round(precision_score(y, y_pred), 4),
        'rc': round(recall_score(y, y_pred), 4),
    }

for m in models:
    # File names look like nm_<classifier>_<column set>_<dataset>_<timestamp>.pkl.
    # Splitting on '_' avoids fixed character offsets, which break as soon as
    # the directory or the classifier key changes length. A name with a
    # different number of fields fails loudly here instead of being mis-sliced.
    _, _, ncols, dataset, _ = m.rsplit('/', 1)[-1].split('_')

    X_train, y_train, X_test, y_test = load_split_data(dataset, split=True)
    # Use the same feature columns the model was trained on.
    columns = get_columns(X_train, numlkb_dict[ncols])
    # pickle can execute arbitrary code on load - only open model files you built yourself.
    # The with-block closes the file handle once the model is loaded.
    with open(m, 'rb') as model_file:
        clf = pickle.load(model_file)

    scores = score_clf(clf, X_test[columns], y_test)
    print(f"{m}:  {scores}")