# Gathering Data and Generating Images for Classification

In [None]:
from IPython.display import FileLink, FileLinks


In [6]:
import os
import shutil
import sys
from time import time
from uuid import uuid4

import numpy as np
import pandas as pd

from data_manager import file_processor
# from returns_quantization import add_returns_in_place
# from utils import *
import datetime
import matplotlib


In [7]:
# np.set_printoptions(threshold=np.nan)
# pd.set_option('display.height', 1000)
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)

matplotlib.use('Agg')

In [8]:
PATH = 'data/btc/'

## Gather Cryptocurrency Data from Exchange APIs

In [9]:
# TODO?

# Functions

In [10]:
def compute_returns(p):
    close_prices = p['price_close']
    close_prices_returns = 100 * ((close_prices.shift(-1) - close_prices) / close_prices).fillna(0.0)
    return close_prices_returns.shift(1).fillna(0)

In [11]:
def plot_p(df):
    import matplotlib.pyplot as plt
    from matplotlib.finance import candlestick2_ohlc
    fig, ax = plt.subplots()
    candlestick2_ohlc(ax,
                      df['price_open'].values,
                      df['price_high'].values,
                      df['price_low'].values,
                      df['price_close'].values,
                      width=0.6,
                      colorup='g',
                      colordown='r',
                      alpha=1)
    plt.show()
    print('Done.')

In [12]:
def save_to_file(df, filename):
    import matplotlib.pyplot as plt
    from matplotlib.finance import candlestick2_ohlc
    fig, ax = plt.subplots()
    candlestick2_ohlc(ax,
                      df['price_open'].values,
                      df['price_high'].values,
                      df['price_low'].values,
                      df['price_close'].values,
                      width=0.6,
                      colorup='g',
                      colordown='r',
                      alpha=1)
    plt.savefig(filename)
    plt.close(fig)

In [13]:
def mkdir_p(path):
    import os
    import errno
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

# Process Tick Data

In [14]:
data_file = f'{PATH}coinbaseUSD.csv'
data_output_folder = f'{PATH}btcgraphs/'

### This is def file_processor Fucntion
def file_processor(data_file):

In [15]:
print('Reading bitcoin market data file here: {}.'.format(data_file))

# create df from tick data
# [unix timestamp, price, volume]
# use the timestamp as the index
d = pd.read_table(data_file, sep=',', header=None, index_col=0, names=['price', 'volume'])

# map the index to datetime
d.index = d.index.map(lambda ts: datetime.datetime.fromtimestamp(int(ts)))
d.index.names = ['DateTime_UTC']

# split the prices into 5 minute groups 
p = pd.DataFrame(d['price'].resample('5Min').ohlc().bfill())
p.columns = ['price_open', 'price_high', 'price_low', 'price_close']

# sum volume by 5 minute chunks
v = pd.DataFrame(d['volume'].resample('5Min').sum())
v.columns = ['volume']
p['volume'] = v['volume']

# # drop NaN values.
# p = p.dropna()
p.isnull().sum()
print('Done')

Reading bitcoin market data file here: data/btc/coinbaseUSD.csv.
Done


In [16]:
print(p.isnull().sum())

price_open     0
price_high     0
price_low      0
price_close    0
volume         0
dtype: int64


In [None]:
# choosing everything starting after 2015.... no data for first 6 days unfortunately... might need to find new data source
# p = p.loc[p.index >= datetime.datetime(2015,1,1,0,0,0)]
# p.head(n=5)

# Generate the Data

### This is the generate_cnn_dataset function
def generate_cnn_dataset(data_folder, bitcoin_file, get_class_name):

In [17]:
data_folder = data_output_folder

In [18]:
# compute_returns(p)
close_prices = p['price_close']
close_prices_returns = 100 * ((close_prices.shift(-1) - close_prices) / close_prices).fillna(0.0)
close_prices_returns = close_prices_returns.shift(1).fillna(0)
close_prices_returns.head(n=5)

DateTime_UTC
2014-12-01 00:30:00     0.000000
2014-12-01 00:35:00     0.000000
2014-12-01 00:40:00     0.000000
2014-12-01 00:45:00    23.333333
2014-12-01 00:50:00     0.000000
Freq: 5T, Name: price_close, dtype: float64

In [19]:
# def add_returns_in_place(p):  
# close_prices_returns = compute_returns(p)
num_bins = 10
returns_bins = pd.qcut(close_prices_returns, num_bins)
bins_categories = returns_bins.values.categories
returns_labels = pd.qcut(close_prices_returns, num_bins, labels=False)

p['close_price_returns'] = close_prices_returns
p['close_price_returns_bins'] = returns_bins
p['close_price_returns_labels'] = returns_labels

In [20]:
bins_categories

IntervalIndex([(-42.016, -0.175], (-0.175, -0.0683], (-0.0683, -0.025], (-0.025, -0.00282], (-0.00282, 0.0], (0.0, 0.00881], (0.00881, 0.0344], (0.0344, 0.0786], (0.0786, 0.181], (0.181, 67.889]]
              closed='right',
              dtype='interval[float64]')

In [22]:
p.tail(n=20)
# return df, bins_categories

Unnamed: 0_level_0,price_open,price_high,price_low,price_close,volume,close_price_returns,close_price_returns_bins,close_price_returns_labels
DateTime_UTC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-03-06 17:15:00,10755.01,10755.01,10726.0,10727.79,49.783898,-0.252999,"(-42.016, -0.175]",0
2018-03-06 17:20:00,10727.78,10730.0,10725.33,10726.06,42.211314,-0.016126,"(-0.025, -0.00282]",3
2018-03-06 17:25:00,10726.06,10745.0,10726.05,10745.0,35.748204,0.176579,"(0.0786, 0.181]",8
2018-03-06 17:30:00,10745.0,10765.2,10745.0,10758.87,29.571526,0.129083,"(0.0786, 0.181]",8
2018-03-06 17:35:00,10750.42,10788.47,10750.0,10782.98,69.172431,0.224094,"(0.181, 67.889]",9
2018-03-06 17:40:00,10782.98,10799.0,10782.97,10793.53,42.246305,0.097839,"(0.0786, 0.181]",8
2018-03-06 17:45:00,10793.54,10798.3,10793.53,10798.3,29.784662,0.044193,"(0.0344, 0.0786]",7
2018-03-06 17:50:00,10798.29,10798.3,10765.07,10777.56,59.524953,-0.192067,"(-42.016, -0.175]",0
2018-03-06 17:55:00,10777.55,10785.72,10765.7,10773.02,20.400061,-0.042125,"(-0.0683, -0.025]",2
2018-03-06 18:00:00,10773.02,10778.1,10773.01,10778.09,8.18742,0.047062,"(0.0344, 0.0786]",7


In [23]:
p.to_csv(f"{PATH}btc-out.csv", sep = "\t")

In [None]:
# btc_df, levels = add_returns_in_place(btc_df)
levels = bins_categories

print('-' * 80)
print('Those values should be roughly equal to 1/len(levels):')
for ii in range(len(levels)):
    print(ii, np.mean((p['close_price_returns_labels'] == ii).values))
print(levels)
print('-' * 80)

In [None]:
def get_price_direction(btc_df, btc_slice, i, slice_size):
    last_price = btc_slice[-2:-1]['price_close'].values[0]
    next_price = btc_df[i + slice_size:i + slice_size + 1]['price_close'].values[0]
    if last_price < next_price:
        class_name = 'UP'
    else:
        class_name = 'DOWN'
    return class_name

In [None]:
# number of periods in our input samples
slice_size = 40
# 1/10 data "chuncks" will be for testing
test_every_steps = 10
# number of 5-minute periods we are creating chunks from, 
# need to not start chunk within last 40 or will run out of space
n = len(p) - slice_size

shutil.rmtree(data_folder, ignore_errors=True)

# this is the number of samples we are going to make from the data
cycles = 1e6 

In [None]:
btc_df = p
for epoch in range(int(cycles)):
    st = time()
    
    # choose a random starting point
    i = np.random.choice(n)
    # take following 40 time periods (total 41)
    btc_slice = btc_df[i:i + slice_size]

    if btc_slice.isnull().values.any():
        # sometimes prices are discontinuous and nothing happened in one 5min bucket.
        # in that case, we consider this slice as wrong and we raise an exception.
        # it's likely to happen at the beginning of the data set where the volumes are low.
        raise Exception('NaN values detected. Please remove them.')

    class_name = get_price_direction(btc_df, btc_slice, i, slice_size)
    save_dir = os.path.join(data_folder, 'train', class_name)
    if epoch % test_every_steps == 0:
        save_dir = os.path.join(data_folder, 'test', class_name)
    mkdir_p(save_dir)
    filename = save_dir + '/' + str(uuid4()) + '.png'
    save_to_file(btc_slice, filename=filename)
    print('epoch = {0}, time = {1:.3f}, filename = {2}'.format(str(epoch).zfill(8), time() - st, filename))