In [None]:
%matplotlib inline
import sys
import errno
import pandas as pd
import numpy as np
from scipy import stats
import scipy.cluster.hierarchy as hac
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import fcluster
from collections import OrderedDict
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
%matplotlib inline
# Display NumPy arrays with 5 digits of precision and without scientific
# notation for small values, so printed linkage matrices are easier to read.
np.set_printoptions(precision=5, suppress=True)

Pulls OHLCV candle data for a currency pair from the Poloniex public API and returns it as a pandas DataFrame.

In [None]:
import requests

# Base URL of Poloniex's public `returnChartData` endpoint. The currency pair is
# appended directly to it, followed by the remaining query parameters.
POLONIEX_OHLCV_BASEURL = 'https://poloniex.com/public?command=returnChartData&currencyPair='

def get_ohlcv_poloniex(pair='BTC_ETH', start=1435699200, end=9999999999, period=14400, timeout=30):
    """
    returns ohlcv data for poloniex as pandas dataframe
    convert to unix timestamp using https://coderstoolbox.net/unixtimestamp/
    :param pair: str pair on poloniex
    :param start: int unix timestamp of beginning time
    :param end: int unix timestamp of ending time
    :param period: int candle width in seconds
    :param timeout: seconds to wait for the server before giving up (passed to requests.get)
    :return: pandas df of ohlcv data from poloniex for specified pair, times, and period
    :raises requests.HTTPError: if the response status code is not 200
    :raises ValueError: if the response body is an ``{"error": ...}`` object instead of candle data
    """
    query = '{}{}&start={}&end={}&period={}'.format(
        POLONIEX_OHLCV_BASEURL, pair, start, end, period)
    # Without a timeout an unresponsive server would block the notebook forever.
    resp = requests.get(query, timeout=timeout)

    if resp.status_code != 200:
        # `requests.ApiError` does not exist; HTTPError is the library's own
        # exception type for an unsuccessful HTTP status.
        raise requests.HTTPError(
            'GET {} returned HTTP {}'.format(query, resp.status_code), response=resp)

    payload = resp.json()
    # NOTE(review): assumes the API reports failures such as an unknown pair as a
    # JSON object with an "error" key and status 200 -- confirm against the API docs.
    # Handing such a dict to pd.DataFrame would only fail with an opaque pandas error.
    if isinstance(payload, dict) and 'error' in payload:
        raise ValueError('Poloniex error for pair {}: {}'.format(pair, payload['error']))

    return pd.DataFrame(payload)

In [None]:
def get_pairs(pairs=('USDT_BTC', 'USDT_ETH', 'USDT_LTC', 'USDT_DASH', 'BTC_ETH', 'BTC_LTC', 'BTC_DASH'),
              start=1483600958, end=9999999999, period=86400, step=7, fetch=None):
    """
    Lazily yields, for each currency pair, the log returns of its weighted-average price.

    Candles are fetched for the requested range, only rows whose index is a
    multiple of `step` are kept, and the log return between consecutive kept
    rows is computed. The first kept row has no predecessor and is dropped.

    :param pairs: iterable of str pair names to fetch
    :param start: int unix timestamp of beginning time
    :param end: int unix timestamp of ending time
    :param period: int candle width in seconds
    :param step: int keep every `step`-th candle (rows where index % step == 0)
    :param fetch: callable(pair=, start=, end=, period=) returning a DataFrame with a
                  'weightedAverage' column; defaults to get_ohlcv_poloniex
    :return: generator of (single-column DataFrame named after the pair, pair) tuples
    """
    if fetch is None:
        fetch = get_ohlcv_poloniex

    for pair in pairs:
        candles = fetch(pair=pair, start=start, end=end, period=period)
        # Select the price column of the sampled rows in a single .loc call so that
        # nothing is ever assigned onto a filtered slice of the fetched frame.
        sampled_price = candles.loc[candles.index % step == 0, 'weightedAverage']
        # log(p_t) - log(p_{t-1}) between consecutive sampled candles.
        log_ret = np.log(sampled_price).diff()
        # Drop the leading NaN produced by the first difference.
        yield log_ret.iloc[1:].to_frame(name=pair), pair
    

In [None]:
def generate_clusters(timeSeries, pair, p=12, max_d=50):
    """
    Plots hierarchical-clustering dendrograms of the rows of `timeSeries` under
    several distance metrics, prints each linkage's cophenetic correlation
    coefficient, then scatter-plots the flat clusters of the Euclidean linkage.

    Calls `fancy_dendrogram`, which is not defined in this section and must be
    available in the notebook's namespace.

    :param timeSeries: pandas df of observations, one row per observation; must contain column `pair`
    :param pair: str name of the column plotted on the x axis of the cluster scatter
    :param p: int number of leaves kept by the truncated ('lastp') dendrogram
    :param max_d: distance cut-off passed to the dendrogram and used to form the flat clusters
    :raises ValueError: if `timeSeries` contains non-finite values
    """
    observations = np.asarray(timeSeries, dtype=float)
    if not np.all(np.isfinite(observations)):
        raise ValueError("timeSeries must contain only finite values")

    # (label, pdist metric, linkage method). Ward linkage is only defined for
    # Euclidean distances and scipy rejects it with any other metric, so the
    # remaining metrics use average linkage, which accepts arbitrary distances.
    metric_specs = [
        ("Euclidean", "euclidean", "ward"),
        ("Minkowski", "minkowski", "average"),
        ("Cityblock", "cityblock", "average"),
        ("Cosine", "cosine", "average"),
        ("Correlation", "correlation", "average"),
    ]

    linkages = {}
    for name, metric, method in metric_specs:
        # Compute the pairwise distances once and reuse them for both the
        # linkage and its cophenetic correlation.
        dists = pdist(observations, metric=metric)
        if not np.all(np.isfinite(dists)):
            # Some metrics are undefined for some data (e.g. correlation distance
            # between single-feature observations is NaN); linkage would raise.
            print(name + " distances are not finite for this data; skipping")
            continue

        z = hac.linkage(dists, method=method)
        linkages[name] = z

        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print(name)
        fancy_dendrogram(
            z,
            truncate_mode='lastp',
            p=p,
            leaf_rotation=90.,
            leaf_font_size=12.,
            show_contracted=True,
            annotate_above=10,  # useful in small plots so annotations don't overlap
            max_d=max_d,
        )
        plt.show()

        # Compare this metric's own linkage against this metric's own distances.
        c, coph_dists = cophenet(z, dists)
        print(name + " Cophenetic Correlation Coefficient: " + str(c))

    print("Euclidean Clusters")
    clusters = fcluster(linkages["Euclidean"], max_d, criterion='distance')
    plt.figure(figsize=(10, 8))
    # All points sit on y = 0; only the x position and the colour carry information.
    plt.scatter(timeSeries[pair], np.zeros(len(timeSeries)), c=clusters, cmap='prism')  # plot points with cluster dependent colors
    plt.show()

In [None]:
# Driver: pull each pair's series from get_pairs() and run the clustering and
# plotting report on it. `series` and `pair` remain bound in the notebook
# namespace to the last pair processed once the loop finishes.
for series, pair in get_pairs():
    generate_clusters(series, pair)