In [1]:
"""
Contains a set of utility functions to process data
"""

import csv
import datetime
import numpy as np
import pandas as pd
import h5py

# Boundaries of the data window, written as 12-hour-clock hour stamps ("%I-%p").
date_format = '%Y-%m-%d %I-%p'
start_date = '2020-03-20 02-PM'
end_date = '2020-06-20 01-PM'
start_datetime, end_datetime = (
    datetime.datetime.strptime(stamp, date_format) for stamp in (start_date, end_date)
)
# Number of hourly stamps from start to end, both ends included (a float because of "/").
_window = end_datetime - start_datetime
number_datetime = _window.days * 24 + _window.seconds / 3600 + 1

def normalize(x):
    """ Rescale a close/open ratio into a percentage change

    A ratio of 1 (no move) maps to 0; a ratio of 2 maps to 100.

    Args:
        x: ratio value(s) of any shape; anything that supports ``- 1`` and ``* 100``

    Returns: ``(x - 1) * 100``, with the same shape as the input

    """
    offset_from_par = x - 1
    return offset_from_par * 100

def _load_hourly_prices(currency, start_datetime):
    """ Load one 'Bitstamp_<currency>USD_1h.csv' file as a frame indexed by ascending Date """
    # skiprows=1: the first line of the file is skipped; the next line is the header.
    prices = pd.read_csv('Bitstamp_' + currency + 'USD_1h.csv', skiprows=1)
    # The module-level date_format describes how the 'Date' column is written.
    prices['Date'] = pd.to_datetime(prices['Date'], format=date_format)
    # Chained, non-inplace operations: nothing mutates a filtered slice, so pandas
    # has no reason to emit SettingWithCopyWarning.
    return (
        prices[prices['Date'] >= start_datetime]
        .sort_values(by='Date')
        .set_index('Date')
    )


def create_spread(beta, alpha, first_currency, second_currency, start_datetime):
    """ Build the OHLC spread ``first + beta * second + alpha`` of two currencies

    Both price files are read from the current working directory, restricted to
    rows at or after ``start_datetime`` and sorted by date. The combination is
    computed column by column and aligned on the timestamp; a timestamp of the
    first currency that is missing from the second one yields NaN.

    Args:
        beta: multiplier applied to the second currency's prices
        alpha: constant added to every value
        first_currency: symbol used in the first file name, e.g. 'BTC'
        second_currency: symbol used in the second file name, e.g. 'XRP'
        start_datetime: earliest timestamp to keep (inclusive)

    Returns: DataFrame indexed by the first currency's dates with the columns
             'Open', 'High', 'Low' and 'Close'

    """
    first_df = _load_hourly_prices(first_currency, start_datetime)
    second_df = _load_hourly_prices(second_currency, start_datetime)

    price_columns = ['Open', 'High', 'Low', 'Close']
    # The result keeps the first currency's index; each assignment below is
    # re-aligned to it by pandas.
    spread_df = pd.DataFrame(index=first_df.index, columns=price_columns)
    for column in price_columns:
        spread_df[column] = first_df[column] + beta * second_df[column] + alpha

    return spread_df

def create_dataset(spread_df, filepath='datasets/stocks_history_target.h5'):
    """ Turn a spread frame into a (1, T, 4) history array and write it to h5

    Args:
        spread_df: DataFrame whose first four columns are copied, row by row,
                   into the history array
        filepath: destination path handed to write_to_h5py

    Returns:
        history: float64 array of shape (1, len(spread_df), 4)
        abbreviation: the list ['BTCXRP']

    Raises:
        ValueError: if spread_df does not provide four columns

    """
    # np.float was only an alias of the builtin float and no longer exists since
    # NumPy 1.24; np.float64 is the same dtype. copy=True keeps the returned array
    # independent of the frame's own memory.
    values = spread_df.iloc[:, :4].to_numpy(dtype=np.float64, copy=True)
    if values.shape[1] != 4:
        raise ValueError('spread_df must provide 4 columns, got %d' % values.shape[1])
    # Prepend the single-asset axis: (T, 4) -> (1, T, 4).
    history = values[np.newaxis, :, :]
    abbreviation = ['BTCXRP']
    write_to_h5py(history, abbreviation, filepath)
    return history, abbreviation

def write_to_h5py(history, abbreviation, filepath='datasets/stocks_history_target.h5'):
    """ Store a history array and its list of abbreviations in an h5 file

    The file is opened in 'w' mode, so an existing file at ``filepath`` is replaced.

    Args:
        history: numpy array stored under the key 'history'
                 (create_dataset passes shape (1, T, 4))
        abbreviation: a list of abbreviation strings stored under the key 'abbreviation'
        filepath: destination path of the h5 file

    Returns: None

    """
    with h5py.File(filepath, 'w') as h5_file:
        h5_file.create_dataset('history', data=history)
        # The abbreviations are written with h5py's variable-length string dtype.
        vlen_str = h5py.special_dtype(vlen=str)
        labels = np.array(abbreviation, dtype=object)
        h5_file.create_dataset("abbreviation", data=labels, dtype=vlen_str)

def create_target_dataset(target_list, filepath='datasets/stocks_history_target.h5'):
    """ Write an h5 file that holds only the histories of the requested targets

    The source data always comes from read_stock_history() called with its
    default path.

    Args:
        target_list: abbreviations to keep, in the order they are stored
        filepath: destination path of the h5 file

    Returns: None

    """
    history_all, abbreviation_all = read_stock_history()
    # One (1, T, F) slice per requested target, in request order.
    selected = [
        history_all[abbreviation_all.index(name)][np.newaxis]
        for name in target_list
    ]
    # With no targets there is nothing to join and None is passed through.
    history = np.concatenate(selected, axis=0) if selected else None
    write_to_h5py(history, target_list, filepath=filepath)


def read_stock_history(filepath='datasets/stocks_history_target.h5'):
    """ Read the history array and the abbreviations from an h5 file

    Args:
        filepath: path of file

    Returns:
        history: the full content of the 'history' dataset
        abbreviation: the entries of the 'abbreviation' dataset as a list of str

    """
    with h5py.File(filepath, 'r') as f:
        history = f['history'][:]
        # h5py 3 hands variable-length strings back as bytes, older releases as
        # str; accept both so the function does not fail on ``.decode``.
        abbreviation = [
            abbr.decode('utf-8') if isinstance(abbr, bytes) else str(abbr)
            for abbr in f['abbreviation']
        ]
    return history, abbreviation


def index_to_date(index):
    """ Convert an hour offset into the timestamp it refers to

    Args:
        index: number of hours after the module-level start_datetime

    Returns: str() of the resulting datetime, e.g. '2020-03-20 14:00:00'

    """
    hours_after_start = datetime.timedelta(hours=index)
    return str(start_datetime + hours_after_start)

def date_to_index(dt):
    """ Convert a timestamp into its hour offset from the module-level start_datetime

    Args:
        dt: a datetime.datetime (it is subtracted from start_datetime directly,
            so strings are not accepted)

    Returns: the elapsed hours as an int; a fractional hour is dropped by int()

    """
    elapsed = dt - start_datetime
    # timedelta stores whole days and the remaining seconds separately.
    hours = elapsed.days * 24 + elapsed.seconds / 3600
    return int(hours)

def create_optimal_imitation_dataset(history, training_data_ratio=0.8, is_normalize=True):
    """ Build the imitation dataset whose label is the best asset of the same time step

    A synthetic cash asset (all ones, so its ratio is always 1) is placed in
    front of the real assets and therefore has label index 0.

    Args:
        history: size of (num_stocks, T, num_features); feature 0 is read as open
                 and feature 3 as close
        training_data_ratio: the ratio of training data
        is_normalize: whether to pass the close/open ratio through normalize()

    Returns: ((train_ratio, train_labels), (val_ratio, val_labels)) where the ratio
             arrays have shape (samples, num_stocks + 1) and each label is the
             argmax over the assets of that time step; the first
             int(T * training_data_ratio) steps form the training part

    """
    _, T, num_features = history.shape
    cash = np.ones((1, T, num_features))
    with_cash = np.concatenate((cash, history), axis=0)
    opens = with_cash[:, :, 0]
    closes = with_cash[:, :, 3]
    # Transposed so that rows are time steps and columns are assets.
    ratio = (closes / opens).T
    if is_normalize:
        ratio = normalize(ratio)
    best_asset = np.argmax(ratio, axis=1)
    split = int(T * training_data_ratio)
    training_part = (ratio[:split], best_asset[:split])
    validation_part = (ratio[split:], best_asset[split:])
    return training_part, validation_part


def create_imitation_dataset(history, window_length, training_data_ratio=0.8, is_normalize=True):
    """ Create dataset for imitation optimal action given past observations

    A synthetic cash asset (all ones, so its ratio is always 1) is placed in
    front of the real assets and therefore has label index 0. Sample ``k`` uses
    the ``window_length`` steps before time ``window_length + k`` as features and
    the best asset at that time as label.

    Args:
        history: size of (num_stocks, T, num_features); feature 0 is read as open
                 and feature 3 as close
        window_length: length of window as feature
        training_data_ratio: for splitting training data and validation data
        is_normalize: whether to normalize the data

    Returns: ((train_X, train_Y), (val_X, val_Y)); X has shape
             (samples, num_stocks + 1, window_length) and Y has shape (samples,).
             There are T - window_length samples in total and the first
             int(samples * training_data_ratio) of them form the training part.

    Raises:
        ValueError: if window_length leaves no sample (window_length >= T)

    """
    num_stocks, T, num_features = history.shape
    cash_history = np.ones((1, T, num_features))
    history = np.concatenate((cash_history, history), axis=0)
    close_open_ratio = history[:, :, 3] / history[:, :, 0]
    if is_normalize:
        close_open_ratio = normalize(close_open_ratio)

    steps = range(window_length, T)
    if len(steps) == 0:
        raise ValueError('window_length (%d) must be smaller than the number of '
                         'time steps (%d)' % (window_length, T))
    Xs = np.stack([close_open_ratio[:, i - window_length:i] for i in steps])
    Ys = np.array([np.argmax(close_open_ratio[:, i]) for i in steps])

    # Split on the number of samples, not on T: only T - window_length samples
    # exist, so a split based on T would give the training part more than its
    # share and could leave the validation part empty.
    num_training_sample = int(len(Xs) * training_data_ratio)
    return (Xs[:num_training_sample], Ys[:num_training_sample]), \
           (Xs[num_training_sample:], Ys[num_training_sample:])


In [6]:
# Spread definition: BTC + beta * XRP + alpha (see create_spread).
# NOTE(review): where these two coefficients come from is not shown in this notebook.
first_currency, second_currency = 'BTC', 'XRP'
beta = -54832.949188
alpha = 4830.852766
spread_df = create_spread(
    beta=beta,
    alpha=alpha,
    first_currency=first_currency,
    second_currency=second_currency,
    start_datetime=start_datetime,
)
# Also writes the history to the default h5 path as a side effect.
history, abbre = create_dataset(spread_df)

In [3]:
# Inspect the spread frame returned by create_spread.
spread_df

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-20 14:00:00,293.109906,198.688340,278.593201,198.887648
2020-03-20 15:00:00,198.887648,195.084584,221.413662,190.474353
2020-03-20 16:00:00,190.474353,197.897994,239.048638,276.419214
2020-03-20 17:00:00,276.419214,241.922970,237.258753,232.952625
2020-03-20 18:00:00,232.952625,240.839330,434.129398,375.436679
...,...,...,...,...
2020-06-20 09:00:00,1877.834448,1865.757973,1865.487742,1857.091268
2020-06-20 10:00:00,1857.091268,1864.027973,1865.251153,1855.527973
2020-06-20 11:00:00,1855.527973,1930.291498,1857.656936,1885.641383
2020-06-20 12:00:00,1885.641383,1863.834908,1868.091268,1876.171268


In [4]:
# Show all columns and all rows whenever a frame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [5]:
# Display the spread frame again, now without the row/column display limits.
spread_df

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-03-20 14:00:00,293.109906,198.68834,278.593201,198.887648
2020-03-20 15:00:00,198.887648,195.084584,221.413662,190.474353
2020-03-20 16:00:00,190.474353,197.897994,239.048638,276.419214
2020-03-20 17:00:00,276.419214,241.92297,237.258753,232.952625
2020-03-20 18:00:00,232.952625,240.83933,434.129398,375.436679
2020-03-20 19:00:00,375.436679,317.727831,360.596679,357.246794
2020-03-20 20:00:00,357.246794,362.826794,550.075134,468.777784
2020-03-20 21:00:00,468.777784,484.476333,528.910388,509.389283
2020-03-20 22:00:00,509.389283,448.110665,511.88813,493.855642
2020-03-20 23:00:00,493.855642,436.596564,473.411886,448.669628
