In [1]:
from functools import reduce
from dask import delayed
from ipywidgets import interact, SelectionSlider
from plotly import offline as plotly
import plotly.graph_objs as go
import pandas as pd
import fastparquet
import distributed
import dask
import json
import gzip
import os

  (fname, cnt))
  (fname, cnt))


In [2]:
# Root directory for all notebook data; derived outputs are written here.
OUTPUT_DIRECTORY = '/home/ec2-user/data'
# Template for one pair's partition directory inside the trades dataset;
# fill the placeholder with the pair name via BASE_PATH.format(pair).
BASE_PATH = OUTPUT_DIRECTORY + '/trades.parquet/pair={}'

In [3]:
# Start a local dask cluster and attach a client to it; later cells submit
# work through `client`.
cluster = distributed.LocalCluster()
client = distributed.Client(cluster)
# Enable inline plotly rendering for this notebook.
plotly.init_notebook_mode(connected=True)
# Last expression of the cell: display the cluster object.
cluster

In [4]:
def all_pairs(root=None):
    """Return the names of all pairs that have a ``pair=<name>`` directory.

    Parameters
    ----------
    root : str, optional
        Directory that holds the ``pair=<name>`` partition folders. Defaults
        to the parent directory of ``BASE_PATH``.

    Returns
    -------
    list of str
        The text following ``pair=`` for every matching sub-directory,
        sorted alphabetically.

    Raises
    ------
    FileNotFoundError
        If ``root`` does not exist.
    """
    if root is None:
        root = os.path.dirname(BASE_PATH)
    prefix = 'pair='
    # Pure-Python listing instead of the `!ls ... | grep pair` shell magic:
    # it works outside IPython and skips entries that merely contain "pair"
    # without being `pair=<name>` directories (which used to raise IndexError).
    return sorted(
        name[len(prefix):]
        for name in os.listdir(root)
        if name.startswith(prefix) and os.path.isdir(os.path.join(root, name))
    )

def all_parts_for_pair(pair, base_path=None):
    """List the ``part.0.parquet`` files stored for one pair.

    Parameters
    ----------
    pair : str
        Pair name, substituted into the path template.
    base_path : str, optional
        Path template with a single ``{}`` placeholder for the pair name.
        Defaults to the module-level ``BASE_PATH``.

    Returns
    -------
    list of str
        Paths ``<pair dir>/<sub-directory>/part.0.parquet`` that exist on
        disk, ordered by sub-directory name. Empty if the pair directory
        does not exist.
    """
    if base_path is None:
        base_path = BASE_PATH
    pair_dir = base_path.format(pair)
    if not os.path.isdir(pair_dir):
        return []
    # Enumerate the sub-directories of the *requested* pair. The previous
    # version always listed the 'ADX-USD' directory, so partitions present
    # only for other pairs were silently skipped.
    candidates = (
        os.path.join(pair_dir, sub, 'part.0.parquet')
        for sub in sorted(os.listdir(pair_dir))
    )
    return [path for path in candidates if os.path.exists(path)]

def read_pair(pair, columns=None):
    """Load every parquet part of a pair into one frame ordered by timestamp.

    Parameters
    ----------
    pair : str
        Pair name whose parts are located with ``all_parts_for_pair``.
    columns : list of str, optional
        Columns to read from each part (passed to ``pd.read_parquet``).
        Must include ``'timestamp'`` when given, because the result is
        sorted on that column.

    Returns
    -------
    pandas.DataFrame
        Concatenated parts sorted by ``timestamp``, with a fresh 0..n-1
        index and a constant ``pair`` column added.

    Raises
    ------
    ValueError
        If no part files are found for ``pair``.
    """
    all_paths = all_parts_for_pair(pair)
    if not all_paths:
        # pd.concat would raise a ValueError here anyway; say which pair failed.
        raise ValueError(f'no parquet parts found for pair {pair!r}')
    frames = [pd.read_parquet(p, columns=columns) for p in all_paths]
    df = pd.concat(frames).sort_values('timestamp')
    # drop=True discards the old index directly, whatever it is named, rather
    # than materialising it as a column and dropping a column called 'index'.
    return df.reset_index(drop=True).assign(pair=pair)

def create_returns_series(pair, write=True):
    """Compute return columns for a pair and optionally persist them.

    Columns added to the frame returned by ``read_pair(pair)``:

    * ``relative_returns``: price change from the previous row divided by
      the current row's price (NaN on the first row).
    * ``relative_returns_int16``: ``relative_returns`` scaled by 2**14,
      NaN replaced by 0, saturated to the int16 range and cast to int16.
    * ``normalized_relative_returns``: ``(price - mean(price)) / std(price)``.

    Parameters
    ----------
    pair : str
        Pair name to load.
    write : bool, default True
        When true, write the frame to ``{OUTPUT_DIRECTORY}/returns.parquet``
        (hive layout, partitioned on ``pair``) and return ``None``.
        When false, return the frame without writing.

    Returns
    -------
    pandas.DataFrame or None
    """
    df = read_pair(pair)
    price = df['price']
    df['relative_returns'] = price.diff() / price
    scaled = (df['relative_returns'] * (2**14)).fillna(0)
    # Saturate before casting: |relative_returns| >= 2 (or +/-inf from a zero
    # price) falls outside int16 and would otherwise overflow in astype.
    df['relative_returns_int16'] = scaled.clip(-(2**15), 2**15 - 1).astype('int16')
    # NOTE(review): despite its name this column is the z-score of `price`,
    # not of `relative_returns` -- confirm which one was intended.
    df['normalized_relative_returns'] = (price - price.mean()) / price.std()
    if write:
        fastparquet.write(f'{OUTPUT_DIRECTORY}/returns.parquet',
                          df, compression='snappy', file_scheme='hive',
                          partition_on=['pair'], write_index=False)
    else:
        return df

In [5]:
pairs = all_pairs()


@interact(pair=SelectionSlider(options=pairs))
def plot(pair):
    """Plot price, returns and normalized price for the selected pair.

    The trades are resampled into 500-second buckets, keeping the first
    trade of each bucket and dropping empty buckets, before the derived
    series are computed.
    """
    trades = read_pair(pair, ['timestamp', 'price'])
    df = trades.set_index('timestamp').resample('500S').first().dropna()
    price = df['price']
    # Change from the previous bucket, relative to the current bucket's price.
    df['returns'] = price.diff() / price
    df['normalized_returns'] = (price - price.mean()) / price.std()
    traces = [
        go.Scatter(x=df.index, y=df[column], name=column)
        for column in ('price', 'returns', 'normalized_returns')
    ]
    plotly.iplot(traces)

In [6]:
@interact(pair=SelectionSlider(options=pairs))
def show_returns_series(pair):
    """Preview the first 20 rows of the returns frame for the selected pair.

    The frame is computed in memory only; nothing is written to disk.
    """
    returns = create_returns_series(pair, write=False)
    return returns.iloc[:20]

Unnamed: 0,id,price,quantity,timestamp,buy,time,pair,relative_returns,relative_returns_int16,normalized_relative_returns
0,182508999,1.359129,100.0,2018-02-05 01:01:06.477,True,20180205010100,ADX-USD,,0,0.904206
1,182508998,1.359128,200.0,2018-02-05 01:01:06.477,True,20180205010100,ADX-USD,-7.357659e-07,0,0.904203
2,182508997,1.359126,100.0,2018-02-05 01:01:06.477,True,20180205010100,ADX-USD,-1.471534e-06,0,0.904197
3,182509000,1.359131,300.0,2018-02-05 01:01:06.477,True,20180205010100,ADX-USD,3.678821e-06,0,0.904212
4,182839945,1.050031,200.0,2018-02-05 08:22:49.794,False,20180205082300,ADX-USD,-0.2943723,-4822,-0.031724
5,182839946,1.05003,100.0,2018-02-05 08:22:49.794,False,20180205082300,ADX-USD,-9.523537e-07,0,-0.031727
6,182839947,1.05,100.0,2018-02-05 08:22:49.794,False,20180205082300,ADX-USD,-2.857143e-05,0,-0.031818
7,182839948,1.00028,100.0,2018-02-05 08:22:49.794,False,20180205082300,ADX-USD,-0.04970608,-814,-0.182367
8,182839950,1.000004,700.0,2018-02-05 08:22:49.849,True,20180205082300,ADX-USD,-0.0002759989,-4,-0.183203
9,182839964,1.000004,200.0,2018-02-05 08:22:49.919,True,20180205082300,ADX-USD,0.0,0,-0.183203


In [7]:
### Write Returns Series
# Build one lazy task per pair. create_returns_series runs with its default
# write=True, so each task writes its pair's partition and returns None.
# NOTE(review): every task writes into the same returns.parquet directory;
# confirm the hive writer tolerates concurrent writers to one dataset.
tasks = list(map(delayed(create_returns_series), pairs))
futures = client.compute(tasks)
# Text progress bar (notebook=False) that tracks the submitted futures.
distributed.progress(futures, notebook=False)

[########################################] | 100% Completed | 19.3s