# Gathering Data V2


In [None]:
from IPython.display import FileLink, FileLinks


In [None]:
import os
import shutil
import sys
from time import time
from uuid import uuid4

import numpy as np
import pandas as pd

from data_manager import file_processor
# from returns_quantization import add_returns_in_place
# from utils import *
import datetime
import matplotlib


In [None]:
# Optional numpy/pandas display tweaks for inspecting large frames (left disabled).
# np.set_printoptions(threshold=np.nan)
# pd.set_option('display.height', 1000)
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)

# Select matplotlib's non-interactive Agg backend so figures can be rendered to image files.
matplotlib.use('Agg')

In [None]:
# Base directory for the input CSVs and the generated images. The trailing
# slash matters: paths below are built by plain string interpolation.
PATH = 'data/btc/'

# Functions

In [None]:
def compute_returns(p):
    """Return the percentage change of ``price_close`` versus the previous row.

    The forward-looking change of every row is computed first and then shifted
    down by one row, so the value at row ``t`` is
    ``100 * (close[t] - close[t-1]) / close[t-1]``. Missing values, including
    the first row (which has no predecessor), are replaced by 0.

    Args:
        p: DataFrame with a ``price_close`` column.

    Returns:
        Series of percentage returns sharing ``p``'s index.
    """
    closes = p['price_close']
    following = closes.shift(-1)
    forward_change = ((following - closes) / closes).fillna(0.0)
    forward_pct = 100 * forward_change
    # Shift so that each row carries the return realised *at* that row.
    return forward_pct.shift(1).fillna(0)

In [None]:
def plot_p(df):
    """Draw a candlestick chart of ``df`` and pass it to ``plt.show()``.

    Args:
        df: DataFrame with ``price_open``, ``price_high``, ``price_low`` and
            ``price_close`` columns.
    """
    # NOTE(review): `matplotlib.finance` only exists in older matplotlib
    # releases (it was split out as `mpl_finance`) -- confirm the pinned version.
    import matplotlib.pyplot as plt
    from matplotlib.finance import candlestick2_ohlc

    _, axes = plt.subplots()

    # candlestick2_ohlc takes the four price arrays positionally, in this order.
    ohlc_columns = ('price_open', 'price_high', 'price_low', 'price_close')
    ohlc_values = [df[column].values for column in ohlc_columns]
    candlestick2_ohlc(axes, *ohlc_values,
                      width=0.6, colorup='g', colordown='r', alpha=1)

    plt.show()
    print('Done.')

In [None]:
def save_to_file(df, filename):
    """Render ``df`` as a candlestick chart and save it to ``filename``.

    The figure is closed after saving so that calling this in a long loop
    does not accumulate open figures.

    Args:
        df: DataFrame with ``price_open``, ``price_high``, ``price_low`` and
            ``price_close`` columns.
        filename: Destination path handed to ``plt.savefig``.
    """
    # NOTE(review): `matplotlib.finance` only exists in older matplotlib
    # releases (it was split out as `mpl_finance`) -- confirm the pinned version.
    import matplotlib.pyplot as plt
    from matplotlib.finance import candlestick2_ohlc

    figure, axes = plt.subplots()

    # candlestick2_ohlc takes the four price arrays positionally, in this order.
    ohlc_columns = ('price_open', 'price_high', 'price_low', 'price_close')
    ohlc_values = [df[column].values for column in ohlc_columns]
    candlestick2_ohlc(axes, *ohlc_values,
                      width=0.6, colorup='g', colordown='r', alpha=1)

    plt.savefig(filename)
    plt.close(figure)

In [None]:
def mkdir_p(path):
    """Create ``path`` and any missing parent directories, like ``mkdir -p``.

    Succeeds silently when ``path`` already exists as a directory.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, e.g. because ``path``
            already exists but is not a directory.
    """
    import os

    # exist_ok=True replaces the manual errno.EEXIST / os.path.isdir check.
    os.makedirs(path, exist_ok=True)

# Process Tick Data

In [None]:
# Tick-data CSVs for the training and test periods (date ranges are encoded in the file names).
data_file = f'{PATH}train2015_01_27__2018_02_06.csv'
data_file_test = f'{PATH}test2018_02_06__2018_03_06.csv'
# Root folder that the generated candlestick images are written to.
data_output_folder = f'{PATH}btcgraphs/1/'

### This is the file_processor function
def file_processor(data_file):

In [None]:
print('Reading bitcoin market data file here: {}.'.format(data_file))

# Build a DataFrame from the tick data. Each row is
# [unix timestamp, price, volume]; the timestamp column becomes the index.
d = pd.read_csv(data_file, header=None, index_col=0, names=['price', 'volume'])

# Convert the epoch-second index to naive UTC datetimes. Using
# datetime.fromtimestamp() here would yield local wall-clock time, which
# contradicts the 'DateTime_UTC' label and repeats/skips times around DST changes.
d.index = pd.to_datetime(d.index.astype('int64'), unit='s')
d.index.name = 'DateTime_UTC'

# Aggregate prices into 5-minute open/high/low/close bars. Buckets without
# any trade are back-filled from the next bucket that has data.
p = pd.DataFrame(d['price'].resample('5Min').ohlc().bfill())
p.columns = ['price_open', 'price_high', 'price_low', 'price_close']

# Total traded volume per 5-minute bucket.
v = pd.DataFrame(d['volume'].resample('5Min').sum())
v.columns = ['volume']
p['volume'] = v['volume']

print('Done')

In [None]:
# Count the NaN values left in each column of the resampled training frame.
print(p.isnull().sum())

In [None]:
print('Reading bitcoin market data file here: {}.'.format(data_file_test))

# Build a DataFrame from the test-period tick data. Each row is
# [unix timestamp, price, volume]; the timestamp column becomes the index.
# This must read data_file_test: loading data_file here would make p_test a
# copy of the training data.
d = pd.read_csv(data_file_test, header=None, index_col=0, names=['price', 'volume'])

# Convert the epoch-second index to naive UTC datetimes. Using
# datetime.fromtimestamp() here would yield local wall-clock time, which
# contradicts the 'DateTime_UTC' label and repeats/skips times around DST changes.
d.index = pd.to_datetime(d.index.astype('int64'), unit='s')
d.index.name = 'DateTime_UTC'

# Aggregate prices into 5-minute open/high/low/close bars. Buckets without
# any trade are back-filled from the next bucket that has data.
p_test = pd.DataFrame(d['price'].resample('5Min').ohlc().bfill())
p_test.columns = ['price_open', 'price_high', 'price_low', 'price_close']

# Total traded volume per 5-minute bucket.
v = pd.DataFrame(d['volume'].resample('5Min').sum())
v.columns = ['volume']
p_test['volume'] = v['volume']

# Report any NaN values left in each column.
print(p_test.isnull().sum())
print('Done')

# Generate the Data

### This is the generate_cnn_dataset function
def generate_cnn_dataset(data_folder, bitcoin_file, get_class_name):

In [None]:
# Root directory for generated images; the loops below write into its train/valid/test sub-folders.
data_folder = data_output_folder

In [None]:
# Per-bar percentage returns of the close price, computed by the shared
# compute_returns helper from the Functions section instead of a second
# inline copy of the same formula.
close_prices = p['price_close']
close_prices_returns = compute_returns(p)
close_prices_returns.head(n=5)

In [None]:
# Inline form of add_returns_in_place(p): quantile-bin the returns and attach
# the results to the price frame.
num_bins = 10

# Same quantile cut twice: once as interval categories, once as integer bin indices.
returns_bins = pd.qcut(close_prices_returns, num_bins)
returns_labels = pd.qcut(close_prices_returns, num_bins, labels=False)
bins_categories = returns_bins.cat.categories

p['close_price_returns'] = close_prices_returns
p['close_price_returns_bins'] = returns_bins
p['close_price_returns_labels'] = returns_labels

In [None]:
# Inspect the last 20 rows of the price frame.
p.tail(n=20)
# Presumably a leftover from the function form of this code: return df, bins_categories

In [None]:
# Sanity check on the quantile bins: every label should cover roughly the
# same share of rows.
levels = bins_categories
separator = '-' * 80
labels_column = p['close_price_returns_labels']

print(separator)
print('Those values should be roughly equal to 1/len(levels):')
for ii, _level in enumerate(levels):
    # Fraction of rows whose label equals this bin index.
    print(ii, np.mean((labels_column == ii).values))
print(levels)
print(separator)

Two class UP/DOWN version

In [None]:
def get_price_direction(btc_df, btc_slice, i, slice_size):
    """Label a sample 'UP' or 'DOWN' from the close price that follows it.

    Args:
        btc_df: Full DataFrame with a ``price_close`` column.
        btc_slice: Window of ``btc_df`` being labeled (the loops in this
            notebook pass ``btc_df[i:i + slice_size]``).
        i: Positional row index in ``btc_df`` where the window starts.
        slice_size: Number of rows in the window.

    Returns:
        'UP' if the close of the row right after the window is strictly
        higher than the reference close, otherwise 'DOWN' (ties are 'DOWN').
    """
    # NOTE(review): the reference is the second-to-last row of the window
    # ([-2:-1]), not its final row -- confirm this offset is intended.
    reference_close = btc_slice[-2:-1]['price_close'].values[0]
    following_row = btc_df[i + slice_size:i + slice_size + 1]
    following_close = following_row['price_close'].values[0]
    return 'UP' if reference_close < following_close else 'DOWN'

Three class version UP/DOWN/HOLD

# Half-width of the 'HOLD' band: a difference between the two compared closes
# that stays within +/- this value is not counted as a move.
# NOTE(review): this was described as "a $0.10 movement at BTC = $10,000", but
# the value is compared against the raw price difference -- confirm whether a
# relative threshold was intended.
movement_threshold = 1e-4


def get_price_direction2(btc_df, btc_slice, i, slice_size):
    """Label a sample 'UP', 'DOWN' or 'HOLD' from the close that follows it.

    Args:
        btc_df: Full DataFrame with a ``price_close`` column.
        btc_slice: Window of ``btc_df`` being labeled.
        i: Positional row index in ``btc_df`` where the window starts.
        slice_size: Number of rows in the window.

    Returns:
        'UP' when the following close exceeds the reference close by more
        than ``movement_threshold``, 'DOWN' when it is lower by more than
        ``movement_threshold``, and 'HOLD' otherwise.
    """
    # NOTE(review): the reference is the second-to-last row of the window
    # ([-2:-1]), not its final row -- confirm this offset is intended.
    reference_close = btc_slice[-2:-1]['price_close'].values[0]
    following_close = btc_df[i + slice_size:i + slice_size + 1]['price_close'].values[0]
    change = following_close - reference_close
    if change > movement_threshold:
        return 'UP'
    if change < -movement_threshold:
        return 'DOWN'
    return 'HOLD'

In [None]:
# number of 5-minute periods (rows) in each input sample
slice_size = 40
# every 10th generated sample goes to the validation set instead of training
test_every_steps = 10
# number of valid starting rows: a sample needs `slice_size` rows plus the row
# after them for its label, so it cannot start within the last `slice_size` rows
n = len(p) - slice_size

# start from a clean output folder (removes any previously generated images)
shutil.rmtree(data_folder, ignore_errors=True)

# number of random samples to generate from the training data
cycles = 1e6

In [None]:
btc_df = p
for epoch in range(int(cycles)):
    st = time()

    # Draw a random starting row and cut a window of `slice_size` rows from it.
    i = np.random.choice(n)
    btc_slice = btc_df[i:i + slice_size]

    # A NaN means a 5-minute bucket had no data. Such a window cannot be
    # rendered, so abort instead of producing a broken sample; this is most
    # likely near the start of the data set, where volumes are low.
    if btc_slice.isnull().values.any():
        raise Exception('NaN values detected. Please remove them.')

    class_name = get_price_direction(btc_df, btc_slice, i, slice_size)

    # Every `test_every_steps`-th sample is routed to the validation split.
    split = 'valid' if epoch % test_every_steps == 0 else 'train'
    save_dir = os.path.join(data_folder, split, class_name)
    mkdir_p(save_dir)

    # Random file name so repeated draws never overwrite each other.
    filename = '{}/{}.png'.format(save_dir, uuid4())
    save_to_file(btc_slice, filename=filename)
    print('epoch = {0}, time = {1:.3f}, filename = {2}'.format(str(epoch).zfill(8), time() - st, filename))

In [None]:
btc_df = p_test
nrow = len(p_test.index)
# Walk every window start in order (no random sampling for the test set).
# The last usable start is nrow - slice_size - 1, because the row right after
# the window is needed for the label; this matches `n = len(p) - slice_size`
# in the training loop.
for epoch in range(nrow - slice_size):
    st = time()

    # sequential starting point: one sample per row
    i = epoch
    # take the following `slice_size` rows as the sample window
    btc_slice = btc_df[i:i + slice_size]

    if btc_slice.isnull().values.any():
        # A NaN means a 5-minute bucket had no data. Such a window cannot be
        # rendered, so abort instead of producing a broken sample.
        raise Exception('NaN values detected. Please remove them.')

    class_name = get_price_direction(btc_df, btc_slice, i, slice_size)
    save_dir = os.path.join(data_folder, 'test', class_name)
    mkdir_p(save_dir)
    filename = save_dir + '/' + str(uuid4()) + '.png'
    save_to_file(btc_slice, filename=filename)
    print('epoch = {0}, time = {1:.3f}, filename = {2}'.format(str(epoch).zfill(8), time() - st, filename))