In [1]:
from datetime import date, timedelta

from featureeng.times import get_trading_times_for_london_ny
from featureeng.features import *
import featureeng as feng
import data.readers as rd
import joblib


import matplotlib.pyplot as plt

import multiprocessing
import time
import threading
import json
import os

import pyarrow as pa
import pyarrow.parquet as pq

# Importing core libraries
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib
from functools import partial

# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Classifier/Regressor
from xgboost import XGBClassifier

# Feature selection
from BorutaShap import BorutaShap

# Data processing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.mixture import GaussianMixture

import pickle
# Validation
from sklearn.model_selection import StratifiedKFold

# Notebook configuration.
# root_dir:    base directory of the stored feature files.
# instrument:  instrument whose files are loaded below.
# feature_set: not referenced in the visible cells -- presumably selects a
#              feature-set variant elsewhere; TODO confirm.
root_dir = '/home/dcai/data/features'
instrument = 'EURCHF'
feature_set = "2"

# Per-instrument directory ("<root_dir>/<instrument>") that the parquet and
# pickle files are read from.
temp_save_dir = "/".join((root_dir, instrument))

def save_df(save_dir, df, name):
    """Write ``df`` as a Parquet file called ``name`` inside ``save_dir``.

    The target directory is created first when it does not exist yet,
    including any missing parent directories.

    Args:
        save_dir: Directory that will contain the file.
        df: pandas DataFrame to store (converted via ``pa.Table.from_pandas``).
        name: File name, joined onto ``save_dir``.
    """
    # makedirs(exist_ok=True) copes with nested paths and avoids the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    os.makedirs(save_dir, exist_ok=True)
    pq.write_table(pa.Table.from_pandas(df), os.path.join(save_dir, name))


def read_parquet(save_dir, name):
    """Read the Parquet file ``name`` in ``save_dir`` into a pandas object.

    Args:
        save_dir: Directory containing the file.
        name: File name, joined onto ``save_dir``.

    Returns:
        The result of ``ParquetFile(...).read().to_pandas()`` for that file.
    """
    return pq.ParquetFile(os.path.join(save_dir, name)).read().to_pandas()


# Load the pickled `y_stratified` object from the instrument directory when the
# cell runs.
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted files.
with open(os.path.join(temp_save_dir, 'y_stratified.pickle'), 'rb') as f:
    y_stratified = pickle.load(f)

    
def read_pickle(path, file_name):
    """Unpickle and return the object stored in ``file_name`` under ``path``.

    Args:
        path: Directory containing the pickle file.
        file_name: Name of the pickle file inside ``path``.

    Returns:
        Whatever object the file deserialises to.

    Note:
        Unpickling can run arbitrary code; only load trusted files.
    """
    full_path = os.path.join(path, file_name)
    with open(full_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded


# Plot train-vs-validation curves (e.g. logloss, classification error) for each
# iteration of the xgb model.
def plot_compare(metrics,eval_results,epochs):
    """Draw one train/validation comparison figure per metric.

    Args:
        metrics: Iterable of metric names; each must be a key of both
            ``eval_results['val']`` and ``eval_results['train']``.
        eval_results: Mapping with ``'val'`` and ``'train'`` entries, each
            mapping a metric name to its sequence of per-iteration scores.
        epochs: Number of iterations; the x axis is ``range(0, epochs)``.
            # NOTE(review): presumably must equal the length of every score
            # sequence, otherwise plt.plot rejects the x/y pair -- confirm.

    Side effects:
        Sets ``plt.rcParams["figure.figsize"]`` to ``[6, 6]`` and shows one
        figure per metric via ``plt.show()``.
    """
    for metric in metrics:
        # Validation scores are looked up first, then training scores.
        val_curve = eval_results['val'][metric]
        train_curve = eval_results['train'][metric]
        iterations = range(epochs)

        plt.rcParams["figure.figsize"] = [6,6]
        curves = (
            (val_curve, "c", "Val"),
            (train_curve, "orange", "Train"),
        )
        for scores, colour, tag in curves:
            plt.plot(iterations, scores, colour, label=tag)

        plt.title(metric + " plot")
        plt.xlabel('Iterations')
        plt.ylabel(metric)
        plt.legend()
        plt.show()

In [2]:
### Load the train / validation / test splits
def _load_split(x_file, y_file):
    """Read one split from ``temp_save_dir``.

    ``x_file`` is read with ``read_parquet``; ``y_file`` is unpickled with
    ``read_pickle`` and wrapped in a one-column ('target') DataFrame whose
    'target' column is returned alongside the features.
    """
    features = read_parquet(temp_save_dir, x_file)
    labels = pd.DataFrame(read_pickle(temp_save_dir, y_file), columns=['target'])
    return features, labels.target


# NOTE(review): the ".parqet" spelling is kept as-is; presumably it matches the
# file names on disk -- confirm before renaming.
X_train, y_train = _load_split("trainX.parqet", "trainY.pickle")

X_valid, y_valid = _load_split("validateX.parqet", "validateY.pickle")

# Single (features, target) pair built from the validation split.
eval_set = [(X_valid, y_valid)]


X_test, y_test = _load_split("testX.parqet", "testY.pickle")

In [5]:
# Display the column names of the training feature frame as an array.
X_train.columns.values

array(['MeanPrice_x', 'close_AVG_1D_20', 'close_AVG_1D_5',
       'close_AVG_1D_60', 'close_AVG_1H_20', 'close_AVG_1H_5',
       'close_AVG_1H_60', 'close_MAX_1D_20', 'close_MAX_1D_5',
       'close_MAX_1D_60', 'close_MAX_1H_20', 'close_MAX_1H_5',
       'close_MAX_1H_60', 'close_MEDIAN_1D_20', 'close_MEDIAN_1D_5',
       'close_MEDIAN_1D_60', 'close_MEDIAN_1H_20', 'close_MEDIAN_1H_5',
       'close_MEDIAN_1H_60', 'close_MIN_1D_20', 'close_MIN_1D_5',
       'close_MIN_1D_60', 'close_MIN_1H_20', 'close_MIN_1H_5',
       'close_MIN_1H_60', 'close_STDEV_1D_20', 'close_STDEV_1D_5',
       'close_STDEV_1D_60', 'close_STDEV_1H_20', 'close_STDEV_1H_5',
       'close_STDEV_1H_60', 'ewm_1D_20_4_1', 'ewm_1D_20_4_4',
       'ewm_4H_20_4_1', 'ewm_4H_20_4_3', 'ewm_4H_20_4_4', 'fa_0', 'fa_1',
       'fa_10', 'fa_11', 'fa_12', 'fa_2', 'fa_3', 'fa_4', 'fa_5', 'fa_6',
       'fa_7', 'pca_0', 'pca_1', 'pca_10', 'pca_11', 'pca_12', 'pca_13',
       'pca_14', 'pca_15', 'pca_16', 'pca_17', 'pca_18', 'pca_1