<a id="project2">
### Predict Crypto Currency Price Movement from Reddit Sentiment  
</a>

**Overview**: 

Using Reddit posts from a given cryptocurrency's subreddit, I want to predict the probability that the currency's price will spike the following day. 

**Data Source**: [Poloniex Exchange](https://poloniex.com/support/api/) and [Reddit](https://bigquery.cloud.google.com/results/omega-wind-88718:bquijob_2add0008_15797586a12)

In [2]:
import json
import time
import requests
%matplotlib inline
import pylab
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [3]:
def get_poloniex_data(command, args=None):
    """Call a Poloniex public API command and return the decoded JSON body.

    Parameters
    ----------
    command : str
        Name of the public API command, e.g. ``'return24hVolume'``.
    args : dict, optional
        Extra query-string parameters for the command. ``None`` (the
        default) sends only ``command``.

    Returns
    -------
    dict or list
        Whatever JSON document the endpoint answered with.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        On connection failures or when the request times out.
    """
    # Pass the query as a dict so requests URL-encodes every name and value;
    # `command` is inserted first to keep the original parameter order.
    params = {'command': command}
    params.update(args or {})
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get('https://poloniex.com/public', params=params, timeout=30)
    # Surface HTTP errors directly instead of trying to parse an error page.
    response.raise_for_status()
    return json.loads(response.content.decode('utf-8'))

Select the markets with the highest daily volume

In [4]:
# Fetch the 24h volume report, keep only the BTC-quoted pairs together with
# their BTC volume, and pick the 20 pairs that traded the most.
market_volumes = get_poloniex_data('return24hVolume')
market_volumes = [
    (pair, float(volume['BTC']))
    for pair, volume in market_volumes.items()
    if pair[:3] == 'BTC'
]
markets = [
    pair
    for pair, _btc_volume in sorted(market_volumes, key=lambda entry: entry[1], reverse=True)[:20]
]
markets

['BTC_ETH',
 'BTC_REP',
 'BTC_FCT',
 'BTC_XMR',
 'BTC_XRP',
 'BTC_ETC',
 'BTC_NXT',
 'BTC_CGA',
 'BTC_AMP',
 'BTC_DASH',
 'BTC_STEEM',
 'BTC_MAID',
 'BTC_SC',
 'BTC_SDC',
 'BTC_1CR',
 'BTC_LBC',
 'BTC_DOGE',
 'BTC_STR',
 'BTC_NAUT',
 'BTC_LTC']

For each market, collect one day of trade history

In [15]:
def unix_time(dt):
    """Convert a datetime to whole seconds since the Unix epoch.

    Naive datetimes are interpreted as UTC, so the result does not depend on
    the time zone of the machine running the notebook (the previous
    ``time.mktime`` based version interpreted them as local time).
    Timezone-aware datetimes are converted using their own UTC offset.

    Parameters
    ----------
    dt : datetime.datetime
        The moment to convert.

    Returns
    -------
    int
        Seconds since 1970-01-01 00:00:00 UTC; sub-second precision is dropped.
    """
    offset = dt.utcoffset()  # None for naive datetimes
    # Normalise to a naive UTC datetime so the subtraction below is valid for
    # both naive and aware inputs.
    naive_utc = dt.replace(tzinfo=None) - (offset or timedelta(0))
    # Floor division by one second yields an int and discards microseconds.
    return (naive_utc - datetime(1970, 1, 1)) // timedelta(seconds=1)

def get_trade_history(currency_pair, start, end, save_file=False, file_path='data/',
                      page_size=50000):
    """Download the trade history of one market, paging backwards in time.

    The API is queried for ``[start, end]``. Whenever a response contains at
    least ``page_size`` rows it is assumed to be truncated, and the next
    request ends at the oldest trade seen so far.

    Parameters
    ----------
    currency_pair : str
        Market identifier, e.g. ``'BTC_XMR'``.
    start, end : datetime.datetime
        Bounds of the requested window (converted with ``unix_time``).
    save_file : bool, optional
        If true, write the trades to a CSV file and return its path instead
        of returning the data.
    file_path : str, optional
        Directory prefix (including trailing separator) for the CSV file.
    page_size : int, optional
        Number of rows that marks a response as "full", i.e. more (older)
        trades may remain.

    Returns
    -------
    str or pandas.DataFrame
        The CSV path when ``save_file`` is true (even if no trades were
        returned and therefore nothing was written), otherwise a DataFrame
        of all fetched trades (empty if there were none).

    Raises
    ------
    ValueError
        If the API answers with an ``{'error': ...}`` payload.

    Notes
    -----
    NOTE(review): trades stamped exactly at a page-boundary second may show up
    in two consecutive pages - confirm against the API and de-duplicate if so.
    """
    date_fmt = '%Y-%m-%dT%H:%M:%S'
    file_name = file_path+'Trades|{}|{}|{}.csv'.format(
        currency_pair, start.strftime(date_fmt), end.strftime(date_fmt))

    start_ts = unix_time(start)
    end_ts = unix_time(end)
    pages = []          # fetched pages, only kept when returning a DataFrame
    first_page = True   # controls CSV truncation and header emission

    while True:
        print('fetching trades...')
        payload = get_poloniex_data('returnTradeHistory', {
            'currencyPair': currency_pair,
            'start': start_ts,
            'end': end_ts,
        })
        # Report API-level failures explicitly rather than letting the
        # DataFrame constructor choke on a dict of scalars.
        if isinstance(payload, dict) and 'error' in payload:
            raise ValueError('Poloniex returned an error for {}: {}'.format(
                currency_pair, payload['error']))

        new_trades = pd.DataFrame(payload)
        if new_trades.empty:
            break

        for col in ['amount','rate','total']:
            new_trades[col] = new_trades[col].astype(float)
        new_trades['date'] = pd.to_datetime(new_trades.date)

        if save_file:
            # Truncate on the first page so re-running does not duplicate
            # rows, and write the header only once so the file stays a
            # well-formed CSV when several pages are appended.
            with open(file_name, 'w' if first_page else 'a') as f:
                new_trades.to_csv(f, header=first_page, index=False)
        else:
            pages.append(new_trades)
        first_page = False

        if len(new_trades) < page_size:
            break  # short page: nothing older is left in the window

        # Continue from the oldest trade of this page. The epoch seconds are
        # taken straight from the timestamp so no local-time conversion can
        # shift the cursor.
        next_end = int(pd.Timestamp(new_trades.date.min()).value // 10**9)
        if next_end >= end_ts:
            # The cursor did not move; asking again would return the same
            # page forever.
            break
        end_ts = next_end

        time.sleep(2)  # pause between consecutive requests

    if save_file:
        return file_name

    return pd.concat(pages) if pages else pd.DataFrame()

In [16]:
# Download the trades of 2016-10-02 for every selected market and remember
# where each CSV file was written.
file_paths = {}

for market in markets:
    print(':'.join(market.split('_')))
    file_paths[market] = get_trade_history(
        market,
        start=datetime(2016, 10, 2),
        end=datetime(2016, 10, 3),
        save_file=True,
    )

BTC:ETH
fetching trades...
BTC:REP
fetching trades...
BTC:FCT
fetching trades...
BTC:XMR
fetching trades...
BTC:XRP
fetching trades...
BTC:ETC
fetching trades...
BTC:NXT
fetching trades...
BTC:CGA
fetching trades...
BTC:AMP
fetching trades...
BTC:DASH
fetching trades...
BTC:STEEM
fetching trades...
BTC:MAID
fetching trades...
BTC:SC
fetching trades...
BTC:SDC
fetching trades...
BTC:1CR
fetching trades...
BTC:LBC
fetching trades...
BTC:DOGE
fetching trades...
BTC:STR
fetching trades...
BTC:NAUT
fetching trades...
BTC:LTC
fetching trades...


In [91]:
# Load the BTC_XMR trades from the CSV path recorded in `file_paths`.
xmr_trades = pd.read_csv(file_paths['BTC_XMR'])
# Helpers used further down to derive the extra columns
# (15-minute time bucket and volume level).
def ceil_dt(dt, delta):
    """Round a timestamp string up to the next multiple of `delta`.

    `dt` is a string in '%Y-%m-%d %H:%M:%S' format; multiples of `delta` are
    counted from `datetime.min`. A timestamp that already sits on a multiple
    is returned unchanged. Returns a `datetime`.
    """
    moment = datetime.strptime(dt, '%Y-%m-%d %H:%M:%S')
    # How far `moment` is past the previous multiple of `delta`.
    overshoot = (moment - datetime.min) % delta
    if not overshoot:
        return moment
    return moment + (delta - overshoot)

def get_volume_level(total):
    """Map a trade total onto a coarse volume label.

    Labels by range of `total`:
        below 0.2 (or NaN)  -> 'very low'
        0.2 up to 1         -> 'low'
        above 1 up to 5     -> 'medium'
        above 5 up to 10    -> 'high'
        above 10            -> 'very high'
    """
    # Checked from the top band downwards; a value on a shared boundary
    # (1, 5, 10) therefore lands in the lower band.
    if total > 10:
        return 'very high'
    if total > 5:
        return 'high'
    if total > 1:
        return 'medium'
    if total >= 0.2:
        return 'low'
    return 'very low'
        

# Add the 15-minute bucket, the parsed trade time and the volume label, then
# keep only the columns used for aggregation, indexed by trade time.
xmr_trades = (
    xmr_trades
    .assign(
        time_bucket=xmr_trades['date'].apply(
            lambda stamp: ceil_dt(stamp, timedelta(minutes=15))),
        time=xmr_trades['date'].apply(
            lambda stamp: datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')),
        volume_level=xmr_trades['total'].apply(get_volume_level),
    )
    .loc[:, ['time', 'type', 'rate', 'total', 'volume_level', 'time_bucket']]
    .set_index('time')
)
xmr_trades.head()

Unnamed: 0_level_0,type,rate,total,volume_level,time_bucket
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-10-02 23:59:52,sell,0.013562,0.077188,very low,2016-10-03
2016-10-02 23:59:16,sell,0.013562,0.11103,very low,2016-10-03
2016-10-02 23:59:11,buy,0.0136,0.0,very low,2016-10-03
2016-10-02 23:59:10,buy,0.0136,0.00013,very low,2016-10-03
2016-10-02 23:59:05,buy,0.0136,0.0,very low,2016-10-03


In [105]:
# Group the trades by 15-minute bucket, trade type and volume level.
grouped_trades = xmr_trades.groupby(['time_bucket','type','volume_level'])

In [142]:
# Summary statistics of every numeric column per group, reshaped so that each
# time bucket is one row and the columns are keyed by
# (field, type, volume_level, statistic).
stats = grouped_trades.describe()
if stats.index.nlevels == 4:
    # Older pandas: the statistic names form a fourth, unnamed index level
    # that is moved into the columns together with the two group keys.
    stats = stats.unstack(['type','volume_level',3])
else:
    # pandas >= 0.20 already puts the statistic names in the columns, so only
    # the two group keys are unstacked; the column levels are then reordered
    # to the (field, type, volume_level, statistic) layout used downstream.
    stats = (
        stats
        .unstack(['type','volume_level'])
        .reorder_levels([0, 2, 3, 1], axis=1)
        .sort_index(axis=1)
    )
stats

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,...,total,total,total,total,total,total,total,total,total,total
type,buy,buy,buy,buy,buy,buy,buy,buy,buy,buy,...,sell,sell,sell,sell,sell,sell,sell,sell,sell,sell
volume_level,very low,very low,very low,very low,very low,very low,very low,very low,low,low,...,high,high,very high,very high,very high,very high,very high,very high,very high,very high
Unnamed: 0_level_3,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
time_bucket,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
2016-10-02 00:00:00,1,0.012010,,0.012010,0.012010,0.012010,0.012010,0.012010,,,...,,,,,,,,,,
2016-10-02 00:15:00,96,0.011965,0.000023,0.011931,0.011948,0.011969,0.011974,0.012010,7,0.011970,...,,,,,,,,,,
2016-10-02 00:30:00,159,0.011993,0.000055,0.011917,0.011930,0.012000,0.012045,0.012081,26,0.012012,...,,,,,,,,,,
2016-10-02 00:45:00,367,0.012258,0.000093,0.012060,0.012180,0.012280,0.012340,0.012420,55,0.012239,...,5.634839,5.634839,,,,,,,,
2016-10-02 01:00:00,192,0.012311,0.000037,0.012205,0.012300,0.012318,0.012340,0.012359,36,0.012328,...,,,,,,,,,,
2016-10-02 01:15:00,120,0.012230,0.000036,0.012138,0.012205,0.012231,0.012248,0.012330,8,0.012217,...,8.742833,8.742833,,,,,,,,
2016-10-02 01:30:00,185,0.012342,0.000075,0.012195,0.012285,0.012301,0.012420,0.012460,37,0.012340,...,,,,,,,,,,
2016-10-02 01:45:00,311,0.012564,0.000085,0.012304,0.012508,0.012588,0.012630,0.012700,48,0.012571,...,6.639960,9.771359,2,12.014295,1.704397,10.809104,11.411699,12.014295,12.616890,13.219485
2016-10-02 02:00:00,86,0.012496,0.000048,0.012390,0.012472,0.012493,0.012522,0.012599,11,0.012506,...,,,1,15.000000,,15.000000,15.000000,15.000000,15.000000,15.000000
2016-10-02 02:15:00,48,0.012406,0.000029,0.012360,0.012392,0.012400,0.012418,0.012489,5,0.012415,...,,,,,,,,,,


In [149]:
# Flatten the statistics table into one record per time bucket, keeping only
# the 'mean' statistic for every (field, trade type, volume level) combination.
# Missing statistics are replaced by 0 first.
results = []

for bucket, row in stats.fillna(0).iterrows():
    fields = row.index.get_level_values(0).unique()
    sides = row.index.get_level_values(1).unique()
    levels = row.index.get_level_values(2).unique()

    result = {'date': bucket}
    result.update({
        '_'.join((field, side, level)): row[field, side, level]['mean']
        for field in fields
        for side in sides
        for level in levels
    })
    results.append(result)

  user_expressions, allow_stdin)


In [157]:
# One row per time bucket, indexed by the bucket's timestamp.
trades = pd.DataFrame(results).set_index('date')
# Keep the columns whose name contains 'high' (this matches both the 'high'
# and the 'very high' volume levels).
high_vol = trades.loc[:, [name for name in trades.columns if 'high' in name]]

In [158]:
# Preview the first rows of the high-volume mean columns.
high_vol.head()

Unnamed: 0_level_0,rate_buy_high,rate_buy_very high,rate_sell_high,rate_sell_very high,total_buy_high,total_buy_very high,total_sell_high,total_sell_very high
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-10-02 00:00:00,0.0,0.0,0.0,0,0.0,0.0,0.0,0
2016-10-02 00:15:00,0.0,0.0,0.0,0,0.0,0.0,0.0,0
2016-10-02 00:30:00,0.0,0.0,0.0,0,0.0,0.0,0.0,0
2016-10-02 00:45:00,0.012288,0.01226,0.01218,0,6.07751,10.650957,5.634839,0
2016-10-02 01:00:00,0.0,0.0,0.0,0,0.0,0.0,0.0,0


In [116]:
# Scratch check: show the `date` value of the last row and of the first row of `trades`.
# NOTE(review): this needs a `trades` frame that still has a `date` column -
# presumably a raw trade-history frame from an earlier kernel state. The cell
# above that rebinds `trades` with `date` as its index would break it; confirm or remove.
trades.date.tail(1).values[0], trades.date.head(1).values[0]

(numpy.datetime64('2016-08-10T00:00:13.000000000+0000'),
 numpy.datetime64('2016-09-10T23:53:33.000000000+0000'))

In [76]:
# Scratch cell: wrap whatever `trades` currently holds in a DataFrame and fix its dtypes.
# NOTE(review): presumably `trades` was a raw API payload here - depends on
# earlier kernel state; the same conversions are applied inside get_trade_history.
trades = pd.DataFrame(trades)
# Cast the numeric columns to float.
for col in ['amount','rate','total']:
    trades[col] = trades[col].astype(float)
# Parse the `date` column into pandas datetimes.
trades['date'] = pd.to_datetime(trades.date)
# Inspect the element type of the underlying values array.
type(trades.date.values[0])

numpy.datetime64

In [31]:
# Inspect the smallest representable datetime (the origin `ceil_dt` counts
# its multiples from). Subtracting the numpy module from it raises TypeError.
datetime.min

datetime.datetime(1, 1, 1, 0, 0)

In [68]:
import asyncio
from os import environ
from autobahn.asyncio.wamp import ApplicationSession, ApplicationRunner


class Component(ApplicationSession):
    """
    An application component that subscribes to the ``trollbox`` topic,
    prints every event it receives, and leaves the session after having
    received 5 events.
    """

    async def onJoin(self, details):
        """Reset the event counter and subscribe ``on_event`` to ``trollbox``.

        `details` is not used here.
        """
        self.received = 0

        def on_event(i):
            # Print the event payload and count it.
            print("Got event: {}".format(i))
            self.received += 1
            # Leave on the 5th event, as the class docstring promises
            # (the previous `> 5` check only left after the 6th).
            if self.received >= 5:
                self.leave()

        await self.subscribe(on_event, u'trollbox')

    def onDisconnect(self):
        """Stop the asyncio event loop."""
        asyncio.get_event_loop().stop()


if __name__ == '__main__':
    # Router URL: taken from AUTOBAHN_DEMO_ROUTER when set, otherwise the
    # Poloniex websocket endpoint.
    router_url = environ.get("AUTOBAHN_DEMO_ROUTER", u"wss://api.poloniex.com")
    # NOTE(review): confirm that "crossbardemo" is the realm this router expects.
    runner = ApplicationRunner(router_url, u"crossbardemo")
    runner.run(Component)

ImportError: No module named 'autobahn'

In [159]:
# Stage everything under the current directory, commit it, and push the
# commit to the `project` branch on `origin`.
# NOTE(review): `git add .` also stages generated files below this directory
# (e.g. CSVs written to data/) unless they are ignored - confirm that is intended.
!git add .
!git commit -m 'acquired trade data from poloniex'
!git push origin project

ExceptionPexpect: pty.fork() failed: [Errno 12] Cannot allocate memory