In [33]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
import os

In [34]:
# Currency pairs of interest, written without a separator between the two codes.
pairs = [
    'AUDJPY', 'AUDNZD', 'AUDUSD', 'CADJPY',
    'CHFJPY', 'EURGBP', 'EURJPY', 'EURUSD',
    'GBPJPY', 'GBPUSD', 'NZDUSD', 'USDCAD',
]
# Last expression of the cell: displays the per-pair pickle files found on disk.
os.listdir('data/per_pair')

['AUD_USD.pickle',
 'EUR_USD.pickle',
 'CAD_JPY.pickle',
 'EUR_GBP.pickle',
 'NZD_USD.pickle',
 'EUR_JPY.pickle',
 'AUD_JPY.pickle',
 'USD_CAD.pickle',
 'CHF_JPY.pickle',
 'GBP_USD.pickle',
 'AUD_NZD.pickle',
 'GBP_JPY.pickle']

In [114]:
class DataStore:
    """Load per-pair market data and assemble the combined state space.

    After ``prepare_state_space`` has run the instance exposes:

    * ``clean_df`` - the frame read by ``load_data``; the methods below read
      its ``pair``, ``open`` and ``volume`` columns.
    * ``pairs``    - unique values of ``clean_df.pair``.
    * ``dfps``     - dict mapping each pair to its market-feature frame.
    * ``state``    - all per-pair feature frames joined on the index, plus
      the temporal encodings and the position one-hot columns.
    """

    # Natural cycle length of every calendar field that ``encode_temporal``
    # knows how to extract.  Used as the period of the sine encoding.
    TEMPORAL_PERIODS = {"day_of_week": 7, "hour": 24, "minute": 60}

    def __init__(self, prepare_data=True):
        """Create the store and, optionally, build the state space right away.

        Parameters
        ----------
        prepare_data : bool, default True
            When true, read ``'data/master.pickle'`` and run
            ``prepare_state_space`` immediately.
        """
        # Kept for backward compatibility; no method in this class reads it.
        self.df = pd.DataFrame([])

        if prepare_data:
            self.load_data('data/master.pickle')
            self.prepare_state_space()

    def load_data(self, file):
        """Read the pickled master frame from ``file`` into ``self.clean_df``.

        Returns the loaded frame.
        """
        # NOTE: unpickling can execute arbitrary code - only load trusted files.
        self.clean_df = pd.read_pickle(file)
        return self.clean_df

    def get_unique_pairs(self):
        """Store and return the unique values of ``clean_df.pair``."""
        self.pairs = self.clean_df.pair.unique()
        return self.pairs

    def slice_data_by_pair(self, pair):
        """Return the rows of ``clean_df`` whose ``pair`` column equals ``pair``."""
        return self.clean_df[self.clean_df.pair == pair]

    def gen_next_open(self, d):
        """Add a ``next_open`` column holding the following row's ``open``.

        The frame ``d`` is modified in place and also returned; the last row
        gets NaN because there is no following row.
        """
        d['next_open'] = d.open.shift(-1)
        return d

    def sin_encode(self, t, T):
        """Return ``sin(2 * pi * t / T)`` for value(s) ``t`` and period ``T``."""
        return np.sin(2 * np.pi * (t / T))

    def encode_temporal(self, index, name):
        """Sine-encode one calendar component of ``index``.

        Parameters
        ----------
        index : pandas.DatetimeIndex (or any Index for unrecognised names)
            Index to encode; also used as the index of the result.
        name : str
            ``"day_of_week"``, ``"hour"`` or ``"minute"`` select the matching
            datetime field.  Any other name encodes the raw index values.

        Returns
        -------
        pandas.Series
            Named ``name`` and indexed by the original ``index``.

        Notes
        -----
        For the three recognised fields the period is the calendar cycle
        length (7, 24, 60).  Deriving it from the number of distinct observed
        values is only correct when every value 0..T-1 is present: minutes
        {0, 15, 30, 45} would yield a period of 4 and weekdays
        {0, 1, 2, 3, 4, 6} a period of 6, which maps Sunday onto Monday.
        Unrecognised names keep the data-derived period.
        """
        original_index = index
        if name == "day_of_week":
            values = index.dayofweek
        elif name == "hour":
            values = index.hour
        elif name == "minute":
            values = index.minute
        else:
            values = index

        period = self.TEMPORAL_PERIODS.get(name)
        if period is None:
            # Fallback for custom names: period = number of distinct values.
            period = len(values.unique())

        # Plain ndarray so the Series constructor assigns values positionally.
        encoded = np.asarray(self.sin_encode(values, period))
        return pd.Series(encoded, index=original_index, name=name)

    def log_return(self, series):
        """Return the one-step log return ``log(x_t) - log(x_{t-1})``."""
        return np.log(series) - np.log(series.shift(1))

    def rolling_zscore(self, arr):
        """Return the z-score of the last element of ``arr`` within ``arr``."""
        return zscore(arr)[-1]

    def gen_pos_encoding(self, dff):
        """Return an all-zero frame with the three position columns.

        Columns are ``pos_short``, ``pos_neutral`` and ``pos_long``; the index
        is ``dff.index``.
        """
        return pd.DataFrame(np.zeros(shape=(dff.index.size, 3)),
                            columns=['pos_short', 'pos_neutral', 'pos_long'],
                            index=dff.index)

    def gen_log_return(self, series, zscore_roll=96, lookback=8, clip=(-10, 10)):
        """Build lagged, rolling-normalised log-return features for ``series``.

        Parameters
        ----------
        series : pandas.Series
            Input values; ``series.name`` is used in the column names.
        zscore_roll : int, default 96
            Rolling window length over which each log return is z-scored.
        lookback : int, default 8
            Number of lag columns to emit (lags ``0 .. lookback - 1``).
        clip : tuple, default (-10, 10)
            Lower and upper bound applied to the z-scores.

        Returns
        -------
        pandas.DataFrame
            One column per lag, named ``'<series.name>_log_returns_<lag>'``.
        """
        log_ret = self.log_return(series)
        # raw=True hands each window over as an ndarray instead of building a
        # Series per window; the result is the same but far cheaper.
        norms = log_ret.rolling(zscore_roll).apply(self.rolling_zscore, raw=True)
        norms = norms.clip(*clip)

        log_returns = {
            '{}_log_returns_{}'.format(series.name, lag): norms.shift(lag)
            for lag in range(lookback)
        }
        return pd.DataFrame(log_returns)

    def calculate_meta_variables(self, dff):
        """Join the day-of-week, hour and minute sine encodings onto ``dff``."""
        # Time Encoding
        for field in ('day_of_week', 'hour', 'minute'):
            dff = dff.join(self.encode_temporal(dff.index, name=field))

        return dff

    def calculate_market_variables(self, dff, label=''):
        """Return the log-return features of ``next_open`` and ``volume``.

        Every column name gets ``'_' + label`` appended.
        """
        # Market Feature
        _df = self.gen_log_return(dff.next_open)
        _df = _df.join(self.gen_log_return(dff.volume))
        _df.columns = [str(col) + '_' + label for col in _df.columns]

        return _df

    def prepare_state_space(self):
        """Build ``self.dfps`` and ``self.state`` from ``self.clean_df``.

        For every pair the market features are computed and stored in
        ``self.dfps[pair]``; the frames are joined on the index of the first
        pair's frame (default left join), then the temporal encodings and the
        position columns are appended.  Each pair is printed as it is
        processed.
        """
        self.state = None

        self.dfps = {}  # DataFrame Dictionary per Pair
        pairs = self.get_unique_pairs()

        for pair in pairs:
            print(pair)
            # Copy so that gen_next_open does not write into a slice of clean_df.
            _df = self.slice_data_by_pair(pair).copy()
            _df = self.gen_next_open(_df)
            _df = self.calculate_market_variables(_df, pair.replace('/', '_'))
            self.dfps[pair] = _df
            if self.state is None:
                self.state = _df
            else:
                self.state = self.state.join(_df)

        # Position Encoding
        self.state = self.calculate_meta_variables(self.state)
        self.state = self.state.join(self.gen_pos_encoding(self.state))

In [115]:
# Build the store; prepare_data=True makes the constructor load
# 'data/master.pickle' and assemble the state space (one pair printed per step).
ds = DataStore(prepare_data=True)

USD/CAD
GBP/JPY
AUD/USD
AUD/JPY
EUR/GBP
EUR/JPY
CHF/JPY
GBP/USD
EUR/USD
AUD/NZD
NZD/USD
CAD/JPY


In [132]:
# Rebuild the combined feature frame from the per-pair frames kept in ds.dfps,
# following the same steps as DataStore.prepare_state_space.
df = None
for pair_features in ds.dfps.values():
    # First frame seeds the result; every later frame is joined on the index.
    df = pair_features if df is None else df.join(pair_features)

# Append the temporal sine encodings, then the zero-initialised position columns.
df = ds.calculate_meta_variables(df)
position_columns = ds.gen_pos_encoding(df)
df = df.join(position_columns)

In [136]:
# Persist the assembled state space to disk.
state_space_path = 'data/state_space.pickle'
df.to_pickle(state_space_path)