In [1]:
import sys
!{sys.executable} -m pip install holidays  pandas_market_calendars BorutaShap scikit-optimize mlxtend

!{sys.executable} -m pip install install --extra-index http://pypi.dcai2.local --upgrade --trusted-host pypi.dcai2.local dcaitrading==0.0.6-22-g9594cce

Collecting mlxtend
  Downloading mlxtend-0.21.0-py2.py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: mlxtend
Successfully installed mlxtend-0.21.0
Looking in indexes: https://pypi.org/simple, http://pypi.dcai2.local


In [1]:
from datetime import date, timedelta

from featureeng.times import get_trading_times_for_london_ny
from featureeng.features import *
import featureeng as feng
import data.readers as rd

import multiprocessing
import time
import threading
import json
import os

import pyarrow as pa
import pyarrow.parquet as pq

# Importing core libraries
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib
from functools import partial

# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Classifier/Regressor
from xgboost import XGBClassifier

# Feature selection
from BorutaShap import BorutaShap

# Data processing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.mixture import GaussianMixture

import pickle
# Validation
from sklearn.model_selection import StratifiedKFold

# Where the engineered feature files live, and which instrument / feature-set
# variant this notebook works on.
root_dir = '/home/dcai/data/features'
instrument = 'EURCHF'
feature_set = "4"

# Per-instrument directory used below for pickled models and saved frames.
temp_save_dir = f"{root_dir}/{instrument}"

def save_df(save_dir, df, name):
    """Write ``df`` to ``save_dir/name`` as a Parquet file.

    Parameters
    ----------
    save_dir : str
        Target directory; created (including missing parents) when absent.
    df : pandas.DataFrame
        Frame to persist; converted with ``pa.Table.from_pandas``.
    name : str
        File name inside ``save_dir``.
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it also
    # creates missing parent directories and does not fail when the directory
    # appears between the check and the creation.
    os.makedirs(save_dir, exist_ok=True)
    pq.write_table(pa.Table.from_pandas(df), os.path.join(save_dir, name))



def read_parquet(save_dir, name):
    """Load the Parquet file ``save_dir/name`` and return it as a pandas DataFrame."""
    path = os.path.join(save_dir, name)
    table = pq.ParquetFile(path).read()
    return table.to_pandas()


# Reload the cached stratification labels when an earlier run has written
# them. The file is produced further down in this notebook (the KMeans cell
# and the final save cell), so on a fresh data directory it does not exist
# yet; in that case keep going and let the KMeans cell compute the labels.
# NOTE: pickle.load can execute code embedded in the file - only load pickles
# this notebook wrote itself.
y_stratified_path = os.path.join(temp_save_dir, 'y_stratified.pickle')
if os.path.exists(y_stratified_path):
    with open(y_stratified_path, 'rb') as f:
        y_stratified = pickle.load(f)
else:
    y_stratified = None
    print("no cached {}; run the KMeans cell to create it".format(y_stratified_path))

#X_train = read_parquet(temp_save_dir, 'X_train-2.parquet')
#Y_train = read_parquet(temp_save_dir, 'Y_train.parquet')

In [2]:

def list_of_files(root_dir):
    """Return the paths of all ``.parquet`` files found under ``root_dir``.

    The directory tree is walked recursively. Paths are returned sorted so the
    result does not depend on the order in which the file system lists
    entries. An empty list is returned when nothing matches.
    """
    filenames = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(root_dir)
        for file in files
        # Keep Parquet files only (an earlier comment here said "CSV").
        if file.endswith('.parquet')
    ]
    filenames.sort()
    return filenames

def read_in_files(file_list):
    """Read every Parquet file in ``file_list`` and return them as one
    DataFrame, concatenated row-wise and sorted by index."""
    frames = [pq.ParquetFile(path).read().to_pandas() for path in file_list]
    combined = pd.concat(frames)
    return combined.sort_index()

def train_valid_test(instrument, feature_set, root_dir='/home/dcai/data/features',  train_size=0.75):
    """Split one instrument's feature files into train / valid / test frames.

    The Parquet files under ``root_dir/instrument/feature_set`` are sorted by
    path. The first ``int(len(files) * train_size)`` files form the training
    set; the remaining files are dealt alternately to validation (positions
    0, 2, 4, ...) and test (positions 1, 3, 5, ...).

    Parameters
    ----------
    instrument : str
        Sub-directory of ``root_dir`` holding the instrument's features.
    feature_set : str
        Sub-directory of the instrument directory to read.
    root_dir : str
        Base directory of all feature files.
    train_size : float
        Fraction of the files assigned to the training split.

    Returns
    -------
    dict
        ``{'train': ..., 'valid': ..., 'test': ...}`` where each value is the
        result of ``read_in_files`` for that split.

    Raises
    ------
    ValueError
        When any split would contain no files. Reading an empty split already
        ended in a ValueError from ``pd.concat``, but only after the earlier
        splits had been loaded; the check now runs before any file is read
        and names the directory and the empty split(s).
    """
    feature_dir = "{}/{}/{}".format(root_dir, instrument, feature_set)
    files = list_of_files(feature_dir)
    files.sort()

    train_index = int(len(files)*train_size)
    valid_test = files[train_index:]
    splits = {'train': files[0:train_index],
              'valid': valid_test[0::2],
              'test': valid_test[1::2]}

    # Fail before the expensive reads rather than after them.
    empty = [split for split, paths in splits.items() if not paths]
    if empty:
        raise ValueError(
            "no parquet files for split(s) {} under {} ({} file(s) found, train_size={})".format(
                empty, feature_dir, len(files), train_size))

    return {split: read_in_files(paths) for split, paths in splits.items()}



# Load all three splits and report how long the read took.
start_time = time()
result = train_valid_test(instrument=instrument, feature_set=feature_set)
end_time = time()
elapsed = end_time - start_time
print(f'Execution time: {elapsed:.3f} seconds')

Execution time: 9.467 seconds


In [5]:
# NOTE(review): `train` is only assigned in the next cell
# (`train = result['train']`); on a fresh kernel that cell has to run first.
train.columns.values

array(['MeanPrice', 'close_MIN_1D_5', 'close_MIN_1D_20',
       'close_MIN_1D_60', 'close_MAX_1D_5', 'close_MAX_1D_20',
       'close_MAX_1D_60', 'close_AVG_1D_5', 'close_AVG_1D_20',
       'close_AVG_1D_60', 'close_STDEV_1D_5', 'close_STDEV_1D_20',
       'close_STDEV_1D_60', 'close_MEDIAN_1D_5', 'close_MEDIAN_1D_20',
       'close_MEDIAN_1D_60', 'close_MIN_1H_5', 'close_MIN_1H_20',
       'close_MIN_1H_60', 'close_MAX_1H_5', 'close_MAX_1H_20',
       'close_MAX_1H_60', 'close_AVG_1H_5', 'close_AVG_1H_20',
       'close_AVG_1H_60', 'close_STDEV_1H_5', 'close_STDEV_1H_20',
       'close_STDEV_1H_60', 'close_MEDIAN_1H_5', 'close_MEDIAN_1H_20',
       'close_MEDIAN_1H_60', 'close_MIN_15Min_5', 'close_MIN_15Min_20',
       'close_MIN_15Min_60', 'close_MAX_15Min_5', 'close_MAX_15Min_20',
       'close_MAX_15Min_60', 'close_AVG_15Min_5', 'close_AVG_15Min_20',
       'close_AVG_15Min_60', 'close_STDEV_15Min_5',
       'close_STDEV_15Min_20', 'close_STDEV_15Min_60',
       'close_MEDIAN_15Min

In [3]:
# Keep only the training split for the rest of the notebook.
train = result['train']
#valid = result['valid']
# Drop the reference to the dict of splits (it also holds 'valid' and 'test'),
# presumably so those frames can be released.
result = None

In [6]:


# The column names follow fixed naming schemes, so they are generated here
# instead of being spelled out twice.

# close_<STAT>_<TIMEFRAME>_<WINDOW>, ordered timeframe -> statistic -> window.
_close_stat_cols = [f'close_{stat}_{timeframe}_{window}'
                    for timeframe in ('1D', '1H', '15Min')
                    for stat in ('MIN', 'MAX', 'AVG', 'STDEV', 'MEDIAN')
                    for window in (5, 20, 60)]

# (name prefix, number of numbered columns) for the rolling / ewm families.
_smoothed_families = [('rolling_1D_60_3', 3), ('ewm_1D_60_3', 3),
                      ('rolling_1D_20_4', 4), ('ewm_1D_20_4', 4),
                      ('rolling_4H_20_4', 4), ('ewm_4H_20_4', 4),
                      ('rolling_1D_60_20_4', 4), ('rolling_4H_60_20_4', 4)]
_smoothed_cols = [f'{prefix}_{k}'
                  for prefix, count in _smoothed_families
                  for k in range(1, count + 1)]

# selected_cols additionally carries the 'London' and 'NY' columns, placed
# between the close statistics and the rolling / ewm columns.
selected_cols = ['MeanPrice'] + _close_stat_cols + ['London', 'NY'] + _smoothed_cols

# numerics is the same list without 'London' and 'NY' (76 names).
numerics = ['MeanPrice'] + _close_stat_cols + _smoothed_cols

labels = ['label_120_5_buy', 'label_120_5_sell', 'label_120_5_stay']

In [7]:
# Feature matrix: the numeric columns only. Later cells add further columns
# (fa_*, pca_*, *_gmm_dev_*) to this frame.
X_train = train[numerics]

# Targets: the three label columns named in `labels`.
Y_train = train[labels]

#X_valid = valid[numerics]
#Y_valid = valid[labels]

X_train.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6540376 entries, 2018-04-16 07:00:00.489000 to 2021-10-19 20:59:43.152000
Data columns (total 76 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   MeanPrice              float64
 1   close_MIN_1D_5         float64
 2   close_MIN_1D_20        float64
 3   close_MIN_1D_60        float64
 4   close_MAX_1D_5         float64
 5   close_MAX_1D_20        float64
 6   close_MAX_1D_60        float64
 7   close_AVG_1D_5         float64
 8   close_AVG_1D_20        float64
 9   close_AVG_1D_60        float64
 10  close_STDEV_1D_5       float64
 11  close_STDEV_1D_20      float64
 12  close_STDEV_1D_60      float64
 13  close_MEDIAN_1D_5      float64
 14  close_MEDIAN_1D_20     float64
 15  close_MEDIAN_1D_60     float64
 16  close_MIN_1H_5         float64
 17  close_MIN_1H_20        float64
 18  close_MIN_1H_60        float64
 19  close_MAX_1H_5         float64
 20  close_MAX_1H_20        float64
 21  close_M

In [11]:
# Stratifying the target for cross-validation
km = KMeans(random_state=0)
pca = PCA(n_components=len(numerics), random_state=0)

pca.fit(X_train[numerics])

PCA(n_components=76, random_state=0)

In [9]:
def _load_pickle(file_name):
    """Unpickle ``file_name`` from ``temp_save_dir`` and return the object."""
    with open(os.path.join(temp_save_dir, file_name), 'rb') as handle:
        return pickle.load(handle)


# Restore the previously saved PCA, KMeans and FactorAnalysis objects.
pca = _load_pickle('pca.pickle')
km = _load_pickle('km.pickle')
fa = _load_pickle('fa.pickle')

In [13]:
# Cluster the PCA-transformed numeric features; the cluster id of each row is
# used as its stratification label for cross-validation.
# The PCA was fitted on X_train[numerics], so the same columns are selected
# here explicitly: later cells add columns to X_train, and passing the whole
# frame would then no longer match what the PCA was fitted on.
km.fit(pca.transform(X_train[numerics]))
y_stratified = km.labels_

# Cache the labels so a later session can reload them instead of re-clustering.
with open(os.path.join(temp_save_dir, 'y_stratified.pickle'), 'wb') as f:
    pickle.dump(y_stratified, f)

In [14]:
# Persist the PCA and KMeans objects under temp_save_dir.
for file_name, model in (('pca.pickle', pca), ('km.pickle', km)):
    with open(os.path.join(temp_save_dir, file_name), 'wb') as f:
        pickle.dump(model, f)
#km = None

In [15]:
# A bare `len(y_stratified)` in the middle of a cell is evaluated and thrown
# away (only the last expression of a cell is displayed), so print it.
print(len(y_stratified))
print(len(numerics))
# The labels are sliced in step with X_train further down, so there must be
# exactly one label per row.
assert len(y_stratified) == len(X_train), "y_stratified and X_train row counts differ"
X_train.info()

76
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6540376 entries, 2018-04-16 07:00:00.489000 to 2021-10-19 20:59:43.152000
Data columns (total 76 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   MeanPrice              float64
 1   close_MIN_1D_5         float64
 2   close_MIN_1D_20        float64
 3   close_MIN_1D_60        float64
 4   close_MAX_1D_5         float64
 5   close_MAX_1D_20        float64
 6   close_MAX_1D_60        float64
 7   close_AVG_1D_5         float64
 8   close_AVG_1D_20        float64
 9   close_AVG_1D_60        float64
 10  close_STDEV_1D_5       float64
 11  close_STDEV_1D_20      float64
 12  close_STDEV_1D_60      float64
 13  close_MEDIAN_1D_5      float64
 14  close_MEDIAN_1D_20     float64
 15  close_MEDIAN_1D_60     float64
 16  close_MIN_1H_5         float64
 17  close_MIN_1H_20        float64
 18  close_MIN_1H_60        float64
 19  close_MAX_1H_5         float64
 20  close_MAX_1H_20        float64
 21  clos

In [16]:
# Fit a varimax-rotated factor analysis on the numeric columns and append one
# fa_<i> column per factor to X_train.
n_factors = len(numerics)
fa = FactorAnalysis(n_components=n_factors, max_iter=100, rotation='varimax', random_state=1)
fa.fit(X_train[numerics])

extra_feats = ['fa_{}'.format(i) for i in range(n_factors)]#[:4]

print(len(extra_feats))
fa_scores = fa.transform(X_train[numerics])#[:,:4]
X_train[extra_feats] = fa_scores
#valid[extra_feats] = fa.transform(valid[numerics])[:,:4]

#fa = None

76


In [17]:
import pickle

temp_save_dir = f"{root_dir}/{instrument}"

# Persist the fitted FactorAnalysis object next to the other cached models.
fa_path = os.path.join(temp_save_dir, 'fa.pickle')
with open(fa_path, 'wb') as f:
    pickle.dump(fa, f)
    

In [18]:
# Fit the PCA on the numeric columns and append one pca_<i> column per
# component to X_train. The fit reads X_train[numerics] (the same data as
# train[numerics], since X_train was built as train[numerics]) so this cell
# keeps working after `train` is released further down.
pca = PCA(n_components=len(numerics), random_state=0)
pca.fit(X_train[numerics])

pca_feats = [f'pca_{i}' for i in range(len(numerics))]

X_train[pca_feats] = pca.transform(X_train[numerics])
#valid[pca_feats] = pca.transform(valid[numerics])

# `extra_feats += pca_feats` appended the names again on every re-run of this
# cell; rebuilding the list keeps each pca_<i> name exactly once.
extra_feats = [name for name in extra_feats if name not in pca_feats] + pca_feats


#with open(os.path.join(temp_save_dir, 'pca.pickle'), 'wb') as f:
#    pickle.dump(pca, f)
#pca = None

In [19]:
# Inspect the derived feature names and the resulting X_train layout.
print(extra_feats)
X_train.info()
print(X_train.columns.values)

['fa_0', 'fa_1', 'fa_2', 'fa_3', 'fa_4', 'fa_5', 'fa_6', 'fa_7', 'fa_8', 'fa_9', 'fa_10', 'fa_11', 'fa_12', 'fa_13', 'fa_14', 'fa_15', 'fa_16', 'fa_17', 'fa_18', 'fa_19', 'fa_20', 'fa_21', 'fa_22', 'fa_23', 'fa_24', 'fa_25', 'fa_26', 'fa_27', 'fa_28', 'fa_29', 'fa_30', 'fa_31', 'fa_32', 'fa_33', 'fa_34', 'fa_35', 'fa_36', 'fa_37', 'fa_38', 'fa_39', 'fa_40', 'fa_41', 'fa_42', 'fa_43', 'fa_44', 'fa_45', 'fa_46', 'fa_47', 'fa_48', 'fa_49', 'fa_50', 'fa_51', 'fa_52', 'fa_53', 'fa_54', 'fa_55', 'fa_56', 'fa_57', 'fa_58', 'fa_59', 'fa_60', 'fa_61', 'fa_62', 'fa_63', 'fa_64', 'fa_65', 'fa_66', 'fa_67', 'fa_68', 'fa_69', 'fa_70', 'fa_71', 'fa_72', 'fa_73', 'fa_74', 'fa_75', 'pca_0', 'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7', 'pca_8', 'pca_9', 'pca_10', 'pca_11', 'pca_12', 'pca_13', 'pca_14', 'pca_15', 'pca_16', 'pca_17', 'pca_18', 'pca_19', 'pca_20', 'pca_21', 'pca_22', 'pca_23', 'pca_24', 'pca_25', 'pca_26', 'pca_27', 'pca_28', 'pca_29', 'pca_30', 'pca_31', 'pca_32', 'pca_

In [6]:
print()

# Names of the derived columns: fa_0 .. fa_75 followed by pca_0 .. pca_75.
extra_feats = [f'{prefix}_{i}' for prefix in ('fa', 'pca') for i in range(76)]

n_columns = len(X_train.columns.tolist())
print("feats: {}, {}".format(n_columns, len(extra_feats)))


feats: 454, 128


In [8]:
import numpy as np
import itertools

# Number of mixture components to try per feature.
dists  = [14] #np.arange(8, 20, 4).tolist()
print(dists)

# Every (feature, n_components) pair to fit; built once and reused below.
gmm_jobs = list(itertools.product(numerics + extra_feats[::3], dists))
print(len(gmm_jobs))

start_time = time()
for feature, dist in gmm_jobs:

    pickle_path = os.path.join(temp_save_dir, "gmm-{}-{}.pickle".format(feature, dist))

    # The loop is resumable: a pair whose pickle already exists is skipped.
    if os.path.exists(pickle_path):
        print("already done {} {}".format(feature, dist))
        continue
    print("doing {} {}".format(feature, dist))

    # Single column as an (n_rows, 1) array, the 2-D layout passed to fit().
    x = X_train[feature].values.reshape(-1, 1)

    gmm = GaussianMixture(n_components=dist,
                          max_iter=120,
                          random_state=0).fit(x)

    # Write to a temporary name and rename afterwards. An interrupted run can
    # then never leave a truncated file under the final name, which the
    # existence check above would otherwise treat as finished.
    partial_path = pickle_path + ".tmp"
    with open(partial_path, 'wb') as f:
        pickle.dump(gmm, f)
    os.replace(partial_path, pickle_path)


end_time = time()
print(f'Execution time: {end_time - start_time:.3f} seconds')

[14]
107
already done MeanPrice_x 14
already done close_MIN_1D_5 14
already done close_MIN_1D_20 14
already done close_MIN_1D_60 14
already done close_MAX_1D_5 14
already done close_MAX_1D_20 14
already done close_MAX_1D_60 14
already done close_AVG_1D_5 14
already done close_AVG_1D_20 14
already done close_AVG_1D_60 14
already done close_STDEV_1D_5 14
already done close_STDEV_1D_20 14
already done close_STDEV_1D_60 14
already done close_MEDIAN_1D_5 14
already done close_MEDIAN_1D_20 14
already done close_MEDIAN_1D_60 14
already done close_MIN_1H_5 14
already done close_MIN_1H_20 14
doing close_MIN_1H_60 14
doing close_MAX_1H_5 14
doing close_MAX_1H_20 14
doing close_MAX_1H_60 14
doing close_AVG_1H_5 14
doing close_AVG_1H_20 14
doing close_AVG_1H_60 14
doing close_STDEV_1H_5 14
doing close_STDEV_1H_20 14
doing close_STDEV_1H_60 14
doing close_MEDIAN_1H_5 14
doing close_MEDIAN_1H_20 14
doing close_MEDIAN_1H_60 14
doing London 14
doing NY 14
doing rolling_1D_60_3_1 14
doing rolling_1D_60

In [7]:
import numpy as np
import itertools
dists  = [14] #np.arange(8, 20, 4).tolist()
print(dists)

# Directory receiving one parquet file of GMM-derived columns per feature.
gmm_temp = os.path.join(temp_save_dir, "gmm")

start_time = time()
for feature, dist in itertools.product(numerics + extra_feats[::3], dists):

    pickle_path = os.path.join(temp_save_dir, "gmm-{}-{}.pickle".format(feature, dist))

    if not os.path.exists(pickle_path):
        print("file missing; skip {} {}".format(feature, dist))
        continue
    print("doing {} {}".format(feature, dist))

    # NOTE: pickle.load can execute code embedded in the file - only load
    # pickles written by this notebook.
    with open(pickle_path, 'rb') as f:
        gmm = pickle.load(f)

    x = X_train[feature].values.reshape(-1, 1)

    # One column per mixture component: the feature value where the row is
    # assigned to that component, 0 elsewhere. The one-hot matrix is built
    # with a fixed width of gmm.n_components; pd.get_dummies only creates
    # columns for components that actually occur in the predictions, so the
    # number of columns (and which component "_gmm_dev_<i>" refers to) used
    # to depend on the data being transformed.
    component = gmm.predict(x)
    one_hot = np.zeros((len(component), gmm.n_components))
    one_hot[np.arange(len(component)), component] = 1.0
    clus = one_hot * x

    clus_feats = [f'{feature}_gmm_dev_{i}' for i in range(clus.shape[1])]
    df = pd.DataFrame(clus, columns=clus_feats)
    save_df(gmm_temp, df, "gmm-{}-{}.parquet".format(feature, dist))

    # Release the large intermediates before the next feature.
    df = None
    clus_feats = None
    clus = None
    one_hot = None
    x = None


end_time = time()
print(f'Execution time: {end_time - start_time:.3f} seconds')

[14]
doing MeanPrice_x 14
doing close_MIN_1D_5 14
doing close_MIN_1D_20 14
doing close_MIN_1D_60 14
doing close_MAX_1D_5 14
doing close_MAX_1D_20 14
doing close_MAX_1D_60 14
doing close_AVG_1D_5 14
doing close_AVG_1D_20 14
doing close_AVG_1D_60 14
doing close_STDEV_1D_5 14
doing close_STDEV_1D_20 14
doing close_STDEV_1D_60 14
doing close_MEDIAN_1D_5 14
doing close_MEDIAN_1D_20 14
doing close_MEDIAN_1D_60 14
doing close_MIN_1H_5 14
doing close_MIN_1H_20 14
doing close_MIN_1H_60 14
doing close_MAX_1H_5 14
doing close_MAX_1H_20 14
doing close_MAX_1H_60 14
doing close_AVG_1H_5 14
doing close_AVG_1H_20 14
doing close_AVG_1H_60 14
doing close_STDEV_1H_5 14
doing close_STDEV_1H_20 14
doing close_STDEV_1H_60 14
doing close_MEDIAN_1H_5 14
doing close_MEDIAN_1H_20 14
doing close_MEDIAN_1H_60 14
doing London 14
doing NY 14
doing rolling_1D_60_3_1 14
doing rolling_1D_60_3_2 14
doing rolling_1D_60_3_3 14
doing ewm_1D_60_3_1 14
doing ewm_1D_60_3_2 14
doing ewm_1D_60_3_3 14
doing rolling_1D_20_4_1 14

In [4]:
import numpy as np
import glob
import gc

gc.collect()

gmm_temp = os.path.join(temp_save_dir, "gmm")

start_time = time()
files = glob.glob(os.path.join(gmm_temp, "gmm*.parquet"))
files.sort()

# Files whose name contains one of these markers are left out.
excluded_markers = ("-fa_", "-pca_", "-London-", "-NY-")

# Every third file is considered. The frames are collected first and
# concatenated once; concatenating inside the loop recopied the growing
# frame on every iteration.
frames = []
for f in files[::3]:
    if not any(marker in f for marker in excluded_markers):
        print(f)
        frames.append(pq.ParquetFile(f).read().to_pandas())

df = pd.concat(frames, axis=1) if frames else pd.DataFrame()
frames = None

end_time = time()
# info() prints its report itself and returns None; wrapping it in print()
# only added a stray "None" line.
df.info()
print(f'Execution time: {end_time - start_time:.3f} seconds')

/home/dcai/data/features/EURCHF/gmm/gmm-close_AVG_1D_60-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-close_AVG_1H_60-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-close_MAX_1D_60-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-close_MAX_1H_60-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-close_MEDIAN_1D_60-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-close_MEDIAN_1H_60-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-close_MIN_1D_60-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-close_MIN_1H_60-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-close_STDEV_1D_60-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-close_STDEV_1H_60-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-ewm_1D_20_4_3-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-ewm_1D_60_3_2-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-ewm_4H_20_4_2-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-rolling_1D_20_4_3-14.parquet
/home/dcai/data/features/EURCHF/gmm/gmm-rolling_1D_60_20

In [14]:
# Copy the GMM-derived columns into X_train by position (raw values, not
# index-aligned). NOTE(review): this assumes df has the same row order as
# X_train - confirm against how the gmm parquet files were written.
X_train[df.columns.values] = df.values

In [17]:
# Show the first four rows as a raw array.
X_train.head(4).values

array([[1.18753   , 0.99745691, 0.98542774, ..., 0.        , 0.        ,
        0.        ],
       [1.187415  , 0.99755351, 0.98552317, ..., 0.        , 0.        ,
        0.        ],
       [1.18743   , 0.99754091, 0.98551072, ..., 0.        , 0.        ,
        0.        ],
       [1.187415  , 0.99755351, 0.98552317, ..., 0.        , 0.        ,
        0.        ]])

In [11]:
from matplotlib import pyplot as plt
import gc

# Summary of every 20th row. info() prints its report itself and returns
# None, so wrapping it in print() only added a stray "None" line.
X_train[::20].info()
start_time = time()

print(X_train.columns.tolist())


#plt.figure(figsize=(15, 10))
#for k, col_name in enumerate(numerics):
#    plt.subplot(16, 4, k+1)
#    train[col_name].plot.density()
#plt.show()

end_time = time()
print(f'Execution time: {end_time - start_time:.3f} seconds')

# Drop the reference to the full training frame and collect garbage.
train = None


gc.collect()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 654039 entries, 2018-04-16 07:00:00.668000 to 2021-10-19 20:58:17.151000
Columns: 260 entries, MeanPrice_x to close_AVG_1H_60_gmm_dev_15
dtypes: float64(254), int64(6)
memory usage: 1.3 GB
None
['MeanPrice_x', 'close_MIN_1D_5', 'close_MIN_1D_20', 'close_MIN_1D_60', 'close_MAX_1D_5', 'close_MAX_1D_20', 'close_MAX_1D_60', 'close_AVG_1D_5', 'close_AVG_1D_20', 'close_AVG_1D_60', 'close_STDEV_1D_5', 'close_STDEV_1D_20', 'close_STDEV_1D_60', 'close_MEDIAN_1D_5', 'close_MEDIAN_1D_20', 'close_MEDIAN_1D_60', 'close_MIN_1H_5', 'close_MIN_1H_20', 'close_MIN_1H_60', 'close_MAX_1H_5', 'close_MAX_1H_20', 'close_MAX_1H_60', 'close_AVG_1H_5', 'close_AVG_1H_20', 'close_AVG_1H_60', 'close_STDEV_1H_5', 'close_STDEV_1H_20', 'close_STDEV_1H_60', 'close_MEDIAN_1H_5', 'close_MEDIAN_1H_20', 'close_MEDIAN_1H_60', 'London', 'NY', 'rolling_1D_60_3_1', 'rolling_1D_60_3_2', 'rolling_1D_60_3_3', 'ewm_1D_60_3_1', 'ewm_1D_60_3_2', 'ewm_1D_60_3_3', 'rolling_1D_20_4_

0

In [7]:

# The 260 names of `old_columns`, rebuilt from their naming patterns instead of
# being listed one by one (presumably the column set of an earlier X_train).

# close_<STAT>_<TIMEFRAME>_<WINDOW> for the 1D and 1H timeframes only.
_old_close_stat_cols = [f'close_{stat}_{timeframe}_{window}'
                        for timeframe in ('1D', '1H')
                        for stat in ('MIN', 'MAX', 'AVG', 'STDEV', 'MEDIAN')
                        for window in (5, 20, 60)]

# (name prefix, number of numbered columns) for the rolling / ewm families.
_old_smoothed_cols = [f'{prefix}_{k}'
                      for prefix, count in (('rolling_1D_60_3', 3), ('ewm_1D_60_3', 3),
                                            ('rolling_1D_20_4', 4), ('ewm_1D_20_4', 4),
                                            ('rolling_4H_20_4', 4), ('ewm_4H_20_4', 4),
                                            ('rolling_1D_60_20_4', 4), ('rolling_4H_60_20_4', 4))
                      for k in range(1, count + 1)]

# (source column, number of *_gmm_dev_<i> columns); the counts differ per column.
_old_gmm_dev_cols = [f'{column}_gmm_dev_{i}'
                     for column, count in (('MeanPrice_x', 16),
                                           ('close_MAX_1D_60', 16),
                                           ('close_STDEV_1D_60', 5),
                                           ('close_MIN_1H_60', 11),
                                           ('close_AVG_1H_60', 16))
                     for i in range(count)]

old_columns = (['MeanPrice_x']
               + _old_close_stat_cols
               + ['London', 'NY']
               + _old_smoothed_cols
               + ['MeanPrice_y', 'EUR', 'CHF', 'USD', 'GBP']
               + [f'fa_{i}' for i in range(64)]
               + [f'pca_{i}' for i in range(64)]
               + _old_gmm_dev_cols)

print(len(old_columns))


260


In [20]:

# Persist the feature matrix, the label frame and the stratification labels.
for frame, file_name in ((X_train, 'X_train.parquet'), (Y_train, 'Y_train.parquet')):
    save_df(temp_save_dir, frame, file_name)

y_stratified_file = os.path.join(temp_save_dir, 'y_stratified.pickle')
with open(y_stratified_file, 'wb') as f:
    pickle.dump(y_stratified, f)

In [5]:

# Read back the feature matrix saved as 'X_train.parquet' under temp_save_dir.
X_train_old = read_parquet(temp_save_dir, 'X_train.parquet')

In [8]:
# Show the first ten rows of one GMM-derived column of the reloaded frame.
X_train_old[['close_AVG_1H_60_gmm_dev_6']].head(10)

Unnamed: 0_level_0,close_AVG_1H_60_gmm_dev_6
DateTime,Unnamed: 1_level_1
2018-04-16 07:00:00.668,0.999078
2018-04-16 07:00:02.087,0.999174
2018-04-16 07:00:02.175,0.999162
2018-04-16 07:00:02.178,0.999174
2018-04-16 07:00:02.179,0.999166
2018-04-16 07:00:04.143,0.999191
2018-04-16 07:00:09.773,0.999166
2018-04-16 07:00:14.703,0.999162
2018-04-16 07:00:15.167,0.999174
2018-04-16 07:00:15.569,0.999115


In [8]:
from xgboost.sklearn import XGBClassifier

# BorutaSHAP feature selection on a thinned copy of the training data.
# The selector is fitted once on the training part of each stratified fold;
# the per-fold selections are merged into `final_selection` at the end.

# NOTE(review): `store` and `param` are not referenced again in this cell
# (the classifier below is configured through keyword arguments instead).
# They are kept because later cells may read them -- confirm before removing.
store = {}

param = {'objective': 'multi:softprob', # Specify multiclass classification
         'num_class': 3, # Number of possible output classes
         'tree_method': 'gpu_hist', # Use GPU accelerated algorithm
         'enable_categorical': True,
         'eval_metric': 'auc',
         #'eval_metric': ['mlogloss','merror'],
         'evals_result': store,
         'verbose_eval': 100
         }

# Keep every 25th row of the features, the targets and the stratification
# labels so the three stay aligned row-for-row.
every_nth = 25
X_train_1 = X_train[old_columns][::every_nth]
Y_train_1 = Y_train[::every_nth]
y_stratified_1 = y_stratified[::every_nth]

folds = 5
# NOTE(review): shuffle=True mixes rows regardless of their position in the
# frame; the recorded output shows a DatetimeIndex, so earlier and later
# timestamps end up in the same fold -- confirm this is intended.
skf = StratifiedKFold(n_splits=folds,
                      shuffle=True,
                      random_state=0)

# One sorted list of selected column names per fold.
selected_columns = list()

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line.
X_train_1.info()

# Only the training indices of each split are used; val_idx is ignored.
for k, (train_idx, val_idx) in enumerate(skf.split(X_train_1, y_stratified_1)):

    print(f"FOLD {k+1}/{folds}")

    # Integer class label per row: the position of the largest value across
    # the columns of Y_train_1 (presumably one-hot targets -- TODO confirm).
    y = Y_train_1.iloc[train_idx, :].values.argmax(axis=1)

    xgb1 = XGBClassifier(learning_rate=0.1,
                        n_estimators=1000,
                        max_depth=3,
                        min_child_weight=1,
                        gamma=0,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        objective='multi:softmax',
                        tree_method='gpu_hist',
                        num_class=3,
                        seed=27)

    Feature_Selector = BorutaShap(model=xgb1,
                              importance_measure='shap',
                              classification=True)
    X = X_train_1.iloc[train_idx, :]
    X.info()

    Feature_Selector.fit(X=X, y=y, n_trials=50, random_state=0)

    #Feature_Selector.plot(which_features='all', figsize=(24,12))

    # Column names of the subset returned by the selector for this fold.
    selected_columns.append(sorted(Feature_Selector.Subset().columns))

    print(f"Selected features at fold {k+1} are: {selected_columns[-1]}")
    # Drop the references to this fold's data before the next iteration.
    X = None
    y = None


# Union of the per-fold selections, sorted by name.
final_selection = sorted({item for selection in selected_columns for item in selection})

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 523231 entries, 2018-04-16 07:00:00.668000 to 2021-10-19 20:55:14.242000
Columns: 260 entries, MeanPrice_x to close_AVG_1H_60_gmm_dev_15
dtypes: float64(254), int64(6)
memory usage: 1.0 GB
None
FOLD 1/5
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 418584 entries, 2018-04-16 07:00:00.668000 to 2021-10-19 20:55:14.242000
Columns: 260 entries, MeanPrice_x to close_AVG_1H_60_gmm_dev_15
dtypes: float64(254), int64(6)
memory usage: 833.5 MB
None


  0%|          | 0/50 [00:00<?, ?it/s]

133 attributes confirmed important: ['pca_46', 'close_MAX_1D_60_gmm_dev_7', 'fa_12', 'close_AVG_1H_20', 'MeanPrice_x_gmm_dev_0', 'close_MEDIAN_1D_20', 'fa_8', 'close_MAX_1D_5', 'close_MIN_1H_20', 'pca_57', 'rolling_1D_60_20_4_4', 'pca_61', 'pca_15', 'pca_30', 'pca_9', 'fa_11', 'pca_14', 'close_STDEV_1D_60_gmm_dev_1', 'pca_2', 'close_MIN_1D_20', 'fa_13', 'pca_49', 'close_MIN_1H_5', 'fa_10', 'close_MAX_1D_60_gmm_dev_3', 'pca_6', 'rolling_4H_60_20_4_1', 'close_AVG_1H_60_gmm_dev_0', 'pca_3', 'ewm_1D_60_3_2', 'close_STDEV_1D_20', 'close_MAX_1D_20', 'pca_51', 'pca_41', 'fa_1', 'MeanPrice_x', 'fa_7', 'close_MAX_1D_60_gmm_dev_10', 'pca_42', 'pca_12', 'close_MEDIAN_1H_20', 'close_MAX_1H_5', 'close_STDEV_1D_60_gmm_dev_4', 'fa_0', 'fa_9', 'pca_52', 'pca_22', 'close_STDEV_1H_20', 'close_MEDIAN_1D_5', 'close_AVG_1H_60_gmm_dev_2', 'pca_32', 'pca_16', 'pca_5', 'close_MEDIAN_1H_5', 'close_STDEV_1D_60_gmm_dev_3', 'ewm_1D_20_4_1', 'close_AVG_1H_60', 'rolling_1D_60_20_4_3', 'pca_11', 'pca_58', 'close_MED

  0%|          | 0/50 [00:00<?, ?it/s]

129 attributes confirmed important: ['pca_46', 'close_MAX_1D_60_gmm_dev_7', 'fa_12', 'close_AVG_1H_20', 'MeanPrice_x_gmm_dev_0', 'close_MEDIAN_1D_20', 'fa_8', 'close_MAX_1D_5', 'close_MIN_1H_20', 'pca_57', 'rolling_1D_60_20_4_4', 'pca_61', 'pca_15', 'pca_30', 'pca_9', 'fa_11', 'pca_14', 'pca_2', 'close_MIN_1D_20', 'fa_13', 'pca_49', 'close_MIN_1H_5', 'fa_10', 'pca_6', 'rolling_4H_60_20_4_1', 'close_AVG_1H_60_gmm_dev_0', 'pca_3', 'close_STDEV_1D_20', 'close_AVG_1H_60_gmm_dev_11', 'close_MAX_1D_20', 'pca_51', 'pca_41', 'fa_1', 'MeanPrice_x', 'fa_7', 'close_MAX_1D_60_gmm_dev_10', 'pca_42', 'pca_12', 'close_MEDIAN_1H_20', 'close_MAX_1H_5', 'close_STDEV_1D_60_gmm_dev_4', 'fa_0', 'fa_9', 'pca_52', 'pca_22', 'close_STDEV_1H_20', 'close_MEDIAN_1D_5', 'close_AVG_1H_60_gmm_dev_2', 'pca_32', 'ewm_4H_20_4_3', 'pca_16', 'pca_5', 'close_MEDIAN_1H_5', 'close_STDEV_1D_60_gmm_dev_3', 'rolling_1D_60_20_4_2', 'close_AVG_1H_60', 'rolling_1D_60_20_4_3', 'pca_11', 'pca_58', 'close_MEDIAN_1D_60', 'London', '

  0%|          | 0/50 [00:00<?, ?it/s]

131 attributes confirmed important: ['pca_46', 'close_MAX_1D_60_gmm_dev_7', 'fa_12', 'close_AVG_1H_20', 'close_MEDIAN_1D_20', 'fa_8', 'close_MAX_1D_5', 'close_MIN_1H_20', 'pca_57', 'rolling_1D_60_20_4_4', 'pca_61', 'pca_15', 'pca_30', 'pca_9', 'fa_11', 'pca_14', 'pca_2', 'close_MIN_1D_20', 'fa_13', 'pca_49', 'close_MIN_1H_5', 'fa_10', 'pca_6', 'rolling_4H_60_20_4_1', 'close_AVG_1H_60_gmm_dev_0', 'pca_3', 'ewm_1D_20_4_4', 'close_STDEV_1D_20', 'close_AVG_1H_60_gmm_dev_11', 'close_MAX_1D_20', 'pca_51', 'pca_41', 'fa_1', 'MeanPrice_x', 'fa_7', 'close_MAX_1D_60_gmm_dev_10', 'pca_42', 'pca_12', 'close_MEDIAN_1H_20', 'close_MAX_1H_5', 'close_STDEV_1D_60_gmm_dev_4', 'fa_0', 'fa_9', 'pca_52', 'pca_22', 'close_STDEV_1H_20', 'close_MEDIAN_1D_5', 'close_AVG_1H_60_gmm_dev_2', 'pca_32', 'pca_16', 'pca_5', 'close_MEDIAN_1H_5', 'close_STDEV_1D_60_gmm_dev_3', 'ewm_1D_20_4_1', 'close_AVG_1H_60', 'pca_11', 'pca_58', 'close_MEDIAN_1D_60', 'London', 'pca_56', 'pca_36', 'fa_3', 'pca_28', 'pca_37', 'pca_55',

  0%|          | 0/50 [00:00<?, ?it/s]

134 attributes confirmed important: ['pca_46', 'close_MAX_1D_60_gmm_dev_7', 'fa_12', 'close_AVG_1H_20', 'MeanPrice_x_gmm_dev_0', 'close_MEDIAN_1D_20', 'fa_8', 'close_MAX_1D_5', 'close_MIN_1H_20', 'pca_57', 'rolling_1D_60_20_4_4', 'pca_61', 'pca_15', 'pca_30', 'pca_9', 'fa_11', 'pca_14', 'pca_2', 'close_MIN_1D_20', 'fa_13', 'pca_49', 'close_MIN_1H_5', 'fa_10', 'close_MAX_1D_60_gmm_dev_3', 'pca_6', 'rolling_4H_60_20_4_1', 'close_AVG_1H_60_gmm_dev_0', 'pca_3', 'ewm_1D_20_4_4', 'close_STDEV_1D_20', 'close_MAX_1D_20', 'pca_51', 'pca_41', 'fa_1', 'MeanPrice_x', 'fa_7', 'close_MAX_1D_60_gmm_dev_10', 'pca_42', 'pca_12', 'close_MEDIAN_1H_20', 'close_MAX_1H_5', 'close_STDEV_1D_60_gmm_dev_4', 'fa_0', 'fa_9', 'pca_52', 'pca_22', 'close_STDEV_1H_20', 'close_MEDIAN_1D_5', 'close_AVG_1H_60_gmm_dev_2', 'pca_32', 'ewm_4H_20_4_3', 'pca_16', 'pca_5', 'close_MEDIAN_1H_5', 'close_STDEV_1D_60_gmm_dev_3', 'ewm_1D_20_4_1', 'close_AVG_1H_60', 'rolling_1D_60_20_4_3', 'pca_11', 'pca_58', 'London', 'pca_56', 'pca

  0%|          | 0/50 [00:00<?, ?it/s]

132 attributes confirmed important: ['pca_46', 'close_MAX_1D_60_gmm_dev_7', 'fa_12', 'close_AVG_1H_20', 'MeanPrice_x_gmm_dev_0', 'close_MEDIAN_1D_20', 'fa_8', 'close_MAX_1D_5', 'close_MIN_1H_20', 'pca_57', 'rolling_1D_60_20_4_4', 'pca_61', 'pca_15', 'pca_30', 'pca_9', 'fa_11', 'pca_14', 'pca_2', 'close_MIN_1D_20', 'fa_13', 'pca_49', 'close_MIN_1H_5', 'fa_10', 'close_MAX_1D_60_gmm_dev_3', 'pca_6', 'rolling_4H_60_20_4_1', 'close_AVG_1H_60_gmm_dev_0', 'pca_3', 'close_STDEV_1D_20', 'close_MAX_1D_20', 'pca_51', 'pca_41', 'fa_1', 'MeanPrice_x', 'fa_7', 'close_MAX_1D_60_gmm_dev_10', 'pca_42', 'pca_12', 'close_MEDIAN_1H_20', 'close_MAX_1H_5', 'close_STDEV_1D_60_gmm_dev_4', 'fa_0', 'fa_9', 'pca_52', 'pca_22', 'close_STDEV_1H_20', 'close_MEDIAN_1D_5', 'close_AVG_1H_60_gmm_dev_2', 'pca_32', 'pca_16', 'pca_5', 'close_MEDIAN_1H_5', 'ewm_1D_20_4_1', 'close_AVG_1H_60', 'rolling_1D_60_20_4_3', 'pca_11', 'pca_58', 'London', 'pca_56', 'pca_36', 'close_MIN_1H_60_gmm_dev_8', 'fa_3', 'pca_28', 'pca_37', 'p

In [9]:
# Show the feature names collected in final_selection.
print(final_selection)

['London', 'MeanPrice_x', 'MeanPrice_x_gmm_dev_0', 'MeanPrice_x_gmm_dev_11', 'MeanPrice_x_gmm_dev_13', 'MeanPrice_x_gmm_dev_14', 'MeanPrice_x_gmm_dev_4', 'close_AVG_1D_5', 'close_AVG_1H_20', 'close_AVG_1H_5', 'close_AVG_1H_60', 'close_AVG_1H_60_gmm_dev_0', 'close_AVG_1H_60_gmm_dev_10', 'close_AVG_1H_60_gmm_dev_11', 'close_AVG_1H_60_gmm_dev_2', 'close_MAX_1D_20', 'close_MAX_1D_5', 'close_MAX_1D_60', 'close_MAX_1D_60_gmm_dev_0', 'close_MAX_1D_60_gmm_dev_1', 'close_MAX_1D_60_gmm_dev_10', 'close_MAX_1D_60_gmm_dev_15', 'close_MAX_1D_60_gmm_dev_3', 'close_MAX_1D_60_gmm_dev_4', 'close_MAX_1D_60_gmm_dev_7', 'close_MAX_1D_60_gmm_dev_9', 'close_MAX_1H_20', 'close_MAX_1H_5', 'close_MAX_1H_60', 'close_MEDIAN_1D_20', 'close_MEDIAN_1D_5', 'close_MEDIAN_1D_60', 'close_MEDIAN_1H_20', 'close_MEDIAN_1H_5', 'close_MEDIAN_1H_60', 'close_MIN_1D_20', 'close_MIN_1D_5', 'close_MIN_1D_60', 'close_MIN_1H_20', 'close_MIN_1H_5', 'close_MIN_1H_60', 'close_MIN_1H_60_gmm_dev_10', 'close_MIN_1H_60_gmm_dev_8', 'close_

In [10]:
# Echo the selected feature names and how many there are.
print(final_selection)
print(len(final_selection))

# Persist the selection with pickle under temp_save_dir.
selected_feats_path = os.path.join(temp_save_dir, 'selected_feats-2.pickle')
with open(selected_feats_path, 'wb') as f:
    pickle.dump(final_selection, f)

['London', 'MeanPrice_x', 'MeanPrice_x_gmm_dev_0', 'MeanPrice_x_gmm_dev_11', 'MeanPrice_x_gmm_dev_13', 'MeanPrice_x_gmm_dev_14', 'MeanPrice_x_gmm_dev_4', 'close_AVG_1D_5', 'close_AVG_1H_20', 'close_AVG_1H_5', 'close_AVG_1H_60', 'close_AVG_1H_60_gmm_dev_0', 'close_AVG_1H_60_gmm_dev_10', 'close_AVG_1H_60_gmm_dev_11', 'close_AVG_1H_60_gmm_dev_2', 'close_MAX_1D_20', 'close_MAX_1D_5', 'close_MAX_1D_60', 'close_MAX_1D_60_gmm_dev_0', 'close_MAX_1D_60_gmm_dev_1', 'close_MAX_1D_60_gmm_dev_10', 'close_MAX_1D_60_gmm_dev_15', 'close_MAX_1D_60_gmm_dev_3', 'close_MAX_1D_60_gmm_dev_4', 'close_MAX_1D_60_gmm_dev_7', 'close_MAX_1D_60_gmm_dev_9', 'close_MAX_1H_20', 'close_MAX_1H_5', 'close_MAX_1H_60', 'close_MEDIAN_1D_20', 'close_MEDIAN_1D_5', 'close_MEDIAN_1D_60', 'close_MEDIAN_1H_20', 'close_MEDIAN_1H_5', 'close_MEDIAN_1H_60', 'close_MIN_1D_20', 'close_MIN_1D_5', 'close_MIN_1D_60', 'close_MIN_1H_20', 'close_MIN_1H_5', 'close_MIN_1H_60', 'close_MIN_1H_60_gmm_dev_10', 'close_MIN_1H_60_gmm_dev_8', 'close_

In [14]:
# Hard-coded list of feature names to compare against the current selection
# (presumably the result of a previous selection run -- TODO confirm).
old_selected_feats = ['MeanPrice_x', 'MeanPrice_x_gmm_dev_10', 'MeanPrice_x_gmm_dev_3', 'MeanPrice_x_gmm_dev_5', 'close_AVG_1D_5', 'close_AVG_1D_60', 'close_AVG_1H_20', 'close_AVG_1H_5', 'close_AVG_1H_60', 'close_AVG_1H_60_gmm_dev_10', 'close_AVG_1H_60_gmm_dev_2', 'close_MAX_1D_5', 'close_MAX_1D_60_gmm_dev_1', 'close_MAX_1D_60_gmm_dev_10', 'close_MAX_1D_60_gmm_dev_3', 'close_MAX_1D_60_gmm_dev_4', 'close_MAX_1D_60_gmm_dev_9', 'close_MAX_1H_20', 'close_MAX_1H_5', 'close_MAX_1H_60', 'close_MEDIAN_1D_5', 'close_MEDIAN_1D_60', 'close_MEDIAN_1H_20', 'close_MEDIAN_1H_5', 'close_MEDIAN_1H_60', 'close_MIN_1D_5', 'close_MIN_1H_20', 'close_MIN_1H_5', 'close_MIN_1H_60', 'close_MIN_1H_60_gmm_dev_9', 'close_STDEV_1D_60', 'close_STDEV_1H_20', 'close_STDEV_1H_5', 'close_STDEV_1H_60', 'fa_0', 'fa_10', 'fa_11', 'fa_12', 'fa_13', 'fa_2', 'fa_3', 'fa_4', 'fa_5', 'fa_6', 'fa_7', 'fa_8', 'fa_9', 'pca_0', 'pca_10', 'pca_11', 'pca_12', 'pca_13', 'pca_14', 'pca_15', 'pca_16', 'pca_17', 'pca_18', 'pca_19', 'pca_2', 'pca_20', 'pca_21', 'pca_22', 'pca_23', 'pca_24', 'pca_25', 'pca_26', 'pca_27', 'pca_28', 'pca_29', 'pca_30', 'pca_31', 'pca_32', 'pca_33', 'pca_34', 'pca_35', 'pca_36', 'pca_37', 'pca_38', 'pca_39', 'pca_4', 'pca_40', 'pca_41', 'pca_42', 'pca_43', 'pca_44', 'pca_45', 'pca_46', 'pca_47', 'pca_48', 'pca_49', 'pca_5', 'pca_50', 'pca_51', 'pca_52', 'pca_53', 'pca_54', 'pca_55', 'pca_56', 'pca_57', 'pca_6', 'pca_7', 'pca_8', 'pca_9', 'rolling_1D_60_20_4_1', 'rolling_1D_60_3_1', 'rolling_1D_60_3_2', 'rolling_4H_60_20_4_1', 'rolling_4H_60_20_4_2', 'rolling_4H_60_20_4_4']

# Names present in both lists. Sorted so the result has the same order on
# every run; the iteration order of a set of strings is not guaranteed to be
# stable between interpreter sessions.
intersect = sorted(set(old_selected_feats) & set(final_selection))

# Print the shared names and their count, as is done for final_selection.
print(intersect)
print(len(intersect))

59
