# Machine Learning meetup code samples

#### by Makis Tsantekids

### Tasks

- Parse Bitcoin price data.
- Extract features
- Normalize the features
- Create sample windows
- Construct the ML model
- Simple Price model
- Sentiment model

## Parsing Bitcoin price Data

First, we need to load the tick data we have saved.

In [1]:
import pandas as pd
from utils import coinbase_ticks_path, load_ticks

# Load BTC-USD trade ticks from the saved Coinbase tick file.
# NOTE(review): the negative `start` presumably selects the most recent rows —
# confirm against utils.load_ticks.
ticks = load_ticks(coinbase_ticks_path, symbol='BTC-USD', start=-10000)
# Bare expression so the notebook displays the last 10 ticks.
ticks.tail(10)

Unnamed: 0_level_0,price,side,size,time
trade_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
43802476,8198.88,sell,0.00178,2018-05-22T18:17:58.734Z
43802477,8198.88,sell,0.00237807,2018-05-22T18:17:58.734Z
43802478,8199.93,sell,0.00128017,2018-05-22T18:17:58.734Z
43802479,8199.94,sell,0.0012802,2018-05-22T18:17:58.734Z
43802480,8199.95,sell,0.00201211,2018-05-22T18:17:58.734Z
43802481,8199.96,sell,0.00128048,2018-05-22T18:17:58.734Z
43802482,8199.97,sell,0.00115834,2018-05-22T18:17:58.734Z
43802483,8199.98,sell,0.00225591,2018-05-22T18:17:58.734Z
43802484,8199.99,sell,0.00164629,2018-05-22T18:17:58.734Z
43802485,8200.0,sell,0.00120303,2018-05-22T18:17:58.734Z


In [2]:
from utils import efficient_candle_load

# Build 30-minute candles for BTC-USD from the saved tick file.
# NOTE(review): `start=-20000000` presumably bounds how many trailing ticks are
# read — confirm against utils.efficient_candle_load.
candles = efficient_candle_load(coinbase_ticks_path, window='30min', symbol='BTC-USD', start=-20000000)
# Bare expression so the notebook displays the 10 most recent candles.
candles.tail(10)

40it [02:30,  3.77s/it]


Unnamed: 0_level_0,open,high,low,close
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-05-22 13:30:00,8190.839844,8232.650391,8190.839844,8224.150391
2018-05-22 14:00:00,8224.139648,8224.139648,8198.509766,8220.0
2018-05-22 14:30:00,8220.0,8231.839844,8219.990234,8228.0
2018-05-22 15:00:00,8229.980469,8239.849609,8205.910156,8239.839844
2018-05-22 15:30:00,8239.849609,8240.0,8224.0,8230.349609
2018-05-22 16:00:00,8230.360352,8239.990234,8228.040039,8239.980469
2018-05-22 16:30:00,8239.990234,8239.990234,8216.0,8220.490234
2018-05-22 17:00:00,8220.480469,8235.200195,8215.0,8233.459961
2018-05-22 17:30:00,8228.990234,8228.990234,8215.0,8215.0
2018-05-22 18:00:00,8215.0,8215.009766,8192.530273,8200.0


In [3]:
# Report the candle table dimensions as (rows, columns).
print(candles.shape)

(9178, 4)


In [4]:
from plotly.offline import init_notebook_mode, plot, iplot
from plotly import graph_objs as go
# Set up Plotly so figures render inline in the notebook.
init_notebook_mode(connected=True)

# Chart only the 500 most recent candles.
sample_candles = candles.iloc[-500:]
candlestick_trace = go.Candlestick(
    x=sample_candles.index,
    open=sample_candles.open,
    high=sample_candles.high,
    low=sample_candles.low,
    close=sample_candles.close,
)
iplot([candlestick_trace])

## Extracting features

A set of features is then derived from the constructed candles.

In [5]:
# Features derived from the candle close price. Leading NaNs produced by
# pct_change / rolling are back-filled so every candle has a value.

# Candle-to-candle return.
returns = candles.close.pct_change().bfill()
# 10-candle (resp. 50-candle) change of the 10-candle (resp. 50-candle) moving average.
returns_10 = candles.close.rolling(window=10, min_periods=1).mean().pct_change(10).bfill()
returns_50 = candles.close.rolling(window=50, min_periods=1).mean().pct_change(50).bfill()
# Rolling 100-candle standard deviation of the 10-candle return.
volatility = candles.close.pct_change(10).bfill().rolling(window=100, min_periods=1).std().bfill()

features = [returns, returns_10, returns_50, volatility]

# x and y must be sliced identically: passing the full index with only the
# last 500 values would draw the points against the wrong timestamps.
recent = slice(-500, None)
iplot([
    go.Scatter(y=returns_10.iloc[recent], x=returns_10.index[recent], name='returns'),
    go.Scatter(y=volatility.iloc[recent], x=volatility.index[recent], name='volatility'),
])

In [6]:
# Plot the last 500 values of the 50-candle moving-average return.
iplot([go.Scatter(y=returns_50.iloc[-500:])])

## Normalization of Features

In [7]:
# Chronological 60/40 train/test split over the candles.
n_candles = candles.shape[0]
n_train = int(n_candles * 0.6)
n_test = n_candles - n_train


def _standardize(feat, n_fit):
    """Return `feat` z-scored with the mean/std of its first `n_fit` rows.

    Only the training part is used to estimate the statistics, so no
    information from the test period enters the scaling. A new Series is
    returned; `feat` itself is left untouched so the raw feature series stay
    available to the other cells.
    """
    fit_part = feat.iloc[:n_fit]
    return (feat - fit_part.mean()) / fit_part.std()


normalized_features = pd.concat(
    [_standardize(feat, n_train) for feat in features], axis=1
)
normalized_features.columns = ['returns', 'returns10', 'returns50', 'volatility']

In [8]:
# Bare expression so the notebook displays the last 10 rows of the normalized features.
normalized_features.tail(10)

Unnamed: 0_level_0,returns,returns10,returns50,volatility
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-05-22 13:30:00,0.335214,-0.455962,-0.411592,-1.43734
2018-05-22 14:00:00,-0.058359,-0.458231,-0.428373,-1.43982
2018-05-22 14:30:00,0.068877,-0.436818,-0.445503,-1.441834
2018-05-22 15:00:00,0.108975,-0.41863,-0.455813,-1.443037
2018-05-22 15:30:00,-0.114067,-0.408306,-0.465603,-1.444841
2018-05-22 16:00:00,0.085832,-0.354264,-0.47408,-1.446482
2018-05-22 16:30:00,-0.21855,-0.291948,-0.483834,-1.445961
2018-05-22 17:00:00,0.120921,-0.20077,-0.490259,-1.443551
2018-05-22 17:30:00,-0.207938,-0.113161,-0.49801,-1.442625
2018-05-22 18:00:00,-0.17211,-0.054349,-0.502259,-1.457333


## Creating Targets

The target of the model is to be able to predict whether the price of Bitcoin will increase or decrease in the near future.

In [9]:
# Forward-looking target price. For candle t this is the mean close over
# candles t+15 .. t+24: the close is shifted 15 candles back, then a 10-candle
# rolling mean is taken on the reversed series so the window looks forward.
# The trailing NaNs left by the shift are forward-filled first.
shifted_close = candles.close.shift(-15).ffill()
backward_mean = shifted_close.iloc[::-1].rolling(window=10, min_periods=1).mean()
future_price = backward_mean.iloc[::-1]

recent_index = candles.index[-500:]
close_trace = go.Scatter(x=recent_index, y=candles.close.iloc[-500:], name='Close price')
future_trace = go.Scatter(x=recent_index, y=future_price.iloc[-500:], name='Future price')
iplot([close_trace, future_trace])

In [10]:
# Relative change from the current close to the forward-looking average price.
future_direction = (future_price / candles.close) - 1

# Both traces share the timestamps of the last 500 candles.
recent_index = candles.index[-500:]
data = [
    go.Scatter(y=candles.close.iloc[-500:], x=recent_index, name='Close price'),
    # Shown in percent on a secondary y axis; the legend entry names what is
    # actually plotted (the direction, not the future price).
    go.Scatter(y=100. * future_direction.iloc[-500:], x=recent_index,
               name='Future direction (%)', yaxis='y2'),
]
layout = go.Layout(
    title='Bitcoin close price vs. future direction',
    yaxis=dict(
        title='Bitcoin Price'
    ),
    yaxis2=dict(
        title='Future Direction (%)',
        overlaying='y',
        side='right'
    )
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [11]:
# numpy is used from this cell onwards but is not imported by any earlier cell.
import numpy as np

# A move counts as up/down only if the future direction exceeds +/-1%.
threshold = .01
ups = future_direction > threshold
downs = future_direction < -threshold
# Everything that is neither up nor down, including moves exactly at
# +/-threshold, so this mask matches the candles that keep label 0 below.
stationary = ~(ups | downs)

# Class labels per candle: 0 = stationary, 1 = up, 2 = down.
labels = pd.DataFrame(np.zeros(candles.shape[0]), index=candles.index)
labels[ups] = 1
labels[downs] = 2
print(np.unique(labels, return_counts=True))

marker_up = dict(color='green', size=8, symbol='triangle-up')
marker_down = dict(color='red', size=8, symbol='triangle-down')
iplot([
    go.Scatter(y=candles.close, x=candles.index),
    go.Scatter(y=candles.close[ups], x=candles.index[ups], mode='markers', marker=marker_up),
    go.Scatter(y=candles.close[downs], x=candles.index[downs], mode='markers', marker=marker_down),
])

(array([0., 1., 2.]), array([2706, 3395, 3077]))


## Create sample windows

### Why sample windows?

![image](http://karpathy.github.io/assets/rnn/diags.jpeg)

In [29]:
from utils import roll_window
from keras.utils import to_categorical

window_size = 50
step_size = 20


def _windows(array):
    """Apply roll_window to `array` with this cell's window and step sizes."""
    return roll_window(array, window_size=window_size, step_size=step_size)


# Chronological split: the first n_train candles train, the rest test.
train_rows = normalized_features.iloc[:n_train]
test_rows = normalized_features.iloc[n_train:]

# Feature windows.
train_features = _windows(train_rows.values.copy())
test_features = _windows(test_rows.values.copy())

# Matching timestamp windows, used later to map predictions back to candles.
train_idx = _windows(train_rows.index.values.copy())
test_idx = _windows(test_rows.index.values.copy())

# One-hot label windows.
train_labels = _windows(to_categorical(labels.iloc[:n_train].values.copy()))
test_labels = _windows(to_categorical(labels.iloc[n_train:].values.copy()))

print(*(arr.shape for arr in (train_features, train_labels, test_features, test_labels)))

(273, 50, 4) (273, 50, 3) (182, 50, 4) (182, 50, 3)


## LSTM network training

In [13]:
from keras.models import Sequential, load_model, Model
from keras.layers import PReLU, Conv1D, Dense, Activation, Flatten, LSTM, InputLayer, TimeDistributed, Lambda, \
    CuDNNLSTM, Dropout
from keras.optimizers import RMSprop

# Sequence-to-sequence classifier: one 3-way softmax per time step of the window.
n_steps, n_features = train_features.shape[-2], train_features.shape[-1]
lstm_model = Sequential([
    LSTM(6, input_shape=(n_steps, n_features), return_sequences=True),
    TimeDistributed(Dense(3)),
    TimeDistributed(Activation('softmax')),
])

optimizer = RMSprop()
loss = 'categorical_crossentropy'
lstm_model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])


In [14]:
# Train for 200 epochs in batches of 10 windows. The test windows are passed as
# validation data, so val_loss / val_acc are reported after every epoch
# (verbose=2 prints one line per epoch).
lstm_model.fit(train_features, train_labels, batch_size=10,
               epochs=200, verbose=2,
               validation_data=(test_features, test_labels)
              )

Train on 273 samples, validate on 182 samples
Epoch 1/200
 - 3s - loss: 1.1026 - acc: 0.3486 - val_loss: 1.1000 - val_acc: 0.3129
Epoch 2/200
 - 1s - loss: 1.0905 - acc: 0.3640 - val_loss: 1.1029 - val_acc: 0.3022
Epoch 3/200
 - 1s - loss: 1.0817 - acc: 0.3747 - val_loss: 1.1063 - val_acc: 0.2926
Epoch 4/200
 - 1s - loss: 1.0745 - acc: 0.3993 - val_loss: 1.1112 - val_acc: 0.2891
Epoch 5/200
 - 1s - loss: 1.0687 - acc: 0.4226 - val_loss: 1.1170 - val_acc: 0.3029
Epoch 6/200
 - 2s - loss: 1.0642 - acc: 0.4280 - val_loss: 1.1214 - val_acc: 0.2982
Epoch 7/200
 - 1s - loss: 1.0610 - acc: 0.4267 - val_loss: 1.1245 - val_acc: 0.2955
Epoch 8/200
 - 1s - loss: 1.0584 - acc: 0.4300 - val_loss: 1.1288 - val_acc: 0.2953
Epoch 9/200
 - 1s - loss: 1.0562 - acc: 0.4302 - val_loss: 1.1311 - val_acc: 0.2934
Epoch 10/200
 - 1s - loss: 1.0543 - acc: 0.4335 - val_loss: 1.1328 - val_acc: 0.2922
Epoch 11/200
 - 1s - loss: 1.0528 - acc: 0.4351 - val_loss: 1.1332 - val_acc: 0.2919
Epoch 12/200
 - 1s - loss: 1

Epoch 97/200
 - 1s - loss: 0.9960 - acc: 0.4905 - val_loss: 1.1686 - val_acc: 0.2932
Epoch 98/200
 - 2s - loss: 0.9958 - acc: 0.4900 - val_loss: 1.1702 - val_acc: 0.2911
Epoch 99/200
 - 1s - loss: 0.9947 - acc: 0.4919 - val_loss: 1.1716 - val_acc: 0.2860
Epoch 100/200
 - 1s - loss: 0.9944 - acc: 0.4929 - val_loss: 1.1685 - val_acc: 0.2948
Epoch 101/200
 - 1s - loss: 0.9948 - acc: 0.4916 - val_loss: 1.1711 - val_acc: 0.2896
Epoch 102/200
 - 1s - loss: 0.9948 - acc: 0.4919 - val_loss: 1.1747 - val_acc: 0.2885
Epoch 103/200
 - 2s - loss: 0.9942 - acc: 0.4927 - val_loss: 1.1722 - val_acc: 0.2914
Epoch 104/200
 - 1s - loss: 0.9930 - acc: 0.4948 - val_loss: 1.1709 - val_acc: 0.2956
Epoch 105/200
 - 2s - loss: 0.9935 - acc: 0.4935 - val_loss: 1.1681 - val_acc: 0.2926
Epoch 106/200
 - 2s - loss: 0.9933 - acc: 0.4939 - val_loss: 1.1738 - val_acc: 0.2890
Epoch 107/200
 - 2s - loss: 0.9928 - acc: 0.4938 - val_loss: 1.1718 - val_acc: 0.2946
Epoch 108/200
 - 2s - loss: 0.9934 - acc: 0.4908 - val_lo

Epoch 193/200
 - 2s - loss: 0.9737 - acc: 0.5098 - val_loss: 1.1859 - val_acc: 0.3170
Epoch 194/200
 - 1s - loss: 0.9723 - acc: 0.5109 - val_loss: 1.1908 - val_acc: 0.3155
Epoch 195/200
 - 1s - loss: 0.9722 - acc: 0.5135 - val_loss: 1.1964 - val_acc: 0.3144
Epoch 196/200
 - 1s - loss: 0.9723 - acc: 0.5142 - val_loss: 1.1945 - val_acc: 0.3144
Epoch 197/200
 - 2s - loss: 0.9725 - acc: 0.5126 - val_loss: 1.1944 - val_acc: 0.3157
Epoch 198/200
 - 1s - loss: 0.9721 - acc: 0.5119 - val_loss: 1.1884 - val_acc: 0.3146
Epoch 199/200
 - 1s - loss: 0.9713 - acc: 0.5127 - val_loss: 1.1870 - val_acc: 0.3158
Epoch 200/200
 - 1s - loss: 0.9712 - acc: 0.5119 - val_loss: 1.1846 - val_acc: 0.3184


<keras.callbacks.History at 0x1359cfcf8>

In [30]:
def _per_candle_predictions(model, windows, flat_index):
    """Predict a class for every window step and return one value per timestamp.

    The windows overlap, so the same timestamp can appear several times; the
    last occurrence is the one that is kept.
    """
    classes = np.argmax(model.predict(windows), axis=-1).flatten()
    series = pd.Series(classes, index=flat_index)
    return series[~series.index.duplicated(keep='last')]


train_idx_flat = train_idx.flatten()
test_idx_flat = test_idx.flatten()

train_preds = _per_candle_predictions(lstm_model, train_features, train_idx_flat)
test_preds = _per_candle_predictions(lstm_model, test_features, test_idx_flat)

In [16]:
# Overlay the model's up (1) / down (2) predictions on the training-period closes.
closes = candles.close.iloc[:n_train]
ups = train_preds.loc[train_preds == 1].index
downs = train_preds.loc[train_preds == 2].index

price_trace = go.Scatter(x=closes.index, y=closes)
up_trace = go.Scatter(
    x=ups, y=closes[ups], mode='markers',
    marker={'color': 'green', 'size': 8, 'symbol': 'triangle-up'},
)
down_trace = go.Scatter(
    x=downs, y=closes[downs], mode='markers',
    marker={'color': 'red', 'size': 8, 'symbol': 'triangle-down'},
)
iplot([price_trace, up_trace, down_trace])

In [17]:
# Overlay the model's up (1) / down (2) predictions on the test-period closes.
closes = candles.close.iloc[n_train:]
ups = test_preds.loc[test_preds == 1].index
downs = test_preds.loc[test_preds == 2].index

price_trace = go.Scatter(x=closes.index, y=closes)
up_trace = go.Scatter(
    x=ups, y=closes[ups], mode='markers',
    marker={'color': 'green', 'size': 8, 'symbol': 'triangle-up'},
)
down_trace = go.Scatter(
    x=downs, y=closes[downs], mode='markers',
    marker={'color': 'red', 'size': 8, 'symbol': 'triangle-down'},
)
iplot([price_trace, up_trace, down_trace])

## Backtesting

In [18]:
from backtest import backtest_preds

# Backtest the training-period predictions with no slippage and plot cumulative PnL.
closes = candles.close[train_preds.index]
train_signals = train_preds.values.astype(np.int32)
pnl = backtest_preds(train_signals, closes, slippage=0.)

equity_trace = go.Scatter(x=train_preds.index, y=pnl.cumsum())
iplot([equity_trace])

In [33]:
# Backtest the test-period predictions with slippage=0.01 and plot cumulative PnL.
closes = candles.close[test_preds.index]
test_signals = test_preds.values.astype(np.int32)
pnl = backtest_preds(test_signals, closes, slippage=0.01)

equity_trace = go.Scatter(x=test_preds.index, y=pnl.cumsum())
iplot([equity_trace])

## Sentiment Analysis

In [20]:
from tweets import db_path, dataset
# Open the tweets SQLite database and collect tweet texts from the 'tweets' table.
db = dataset.connect("sqlite:///" + str(db_path))
tweets_table = db['tweets']
tweets = []
for position, row in enumerate(tweets_table):
    text = row['text']
    tweets.append(text)
    # Print a handful of samples (rows 301-304) with their timestamps.
    if 300 < position < 305:
        posted_at = pd.to_datetime(row['timestamp'], unit='ms')
        print(posted_at.strftime('%B %d, %Y, %r'), ':', text)
        print("_____________________________________________________")
    # Stop once the row at position 1001 has been collected.
    if position >= 1001:
        break



April 24, 2018, 10:00:44 AM : I've been saying for a few days that #btc needs more volume at 9K to prove it to me re: long price. So far it hasn'… https://t.co/amFLkrGuj5
_____________________________________________________
April 24, 2018, 10:00:44 AM : RT @murthaburke: Huge savings on premium cigars and tobacco products!
@bnbtobacco 
https://t.co/e6jlt8KlTd 
#blockchain #cryptocurrency #cr…
_____________________________________________________
April 24, 2018, 10:00:44 AM : RT @murthaburke: Great Clothes At Great Prices! @riachuelo 
https://t.co/b9675obY57  
#blockchain #cryptocurrency #crypto #ethereum #trapad…
_____________________________________________________
April 24, 2018, 10:00:44 AM : World’s First Crypto Commodity Coin

https://t.co/QKID6Ys7lB
_____________________________________________________


In [21]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit a word-level tokenizer on the collected tweets and turn every tweet into a
# list of integer word ids. `num_words=1000` limits the vocabulary that
# texts_to_sequences emits (see the Keras Tokenizer documentation).
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)
# Show the first two encoded tweets as examples.
print(sequences[0])
print(sequences[1])

[5, 468, 31, 19, 208, 759, 599, 760, 761, 19, 31, 5, 360, 2, 1, 3]
[4, 8, 361, 762]


In [22]:
# Force every sequence to length 15. As the printed examples show, shorter
# sequences are left-padded with zeros and longer ones lose their leading ids.
padded_sequences = np.asarray(pad_sequences(sequences, maxlen=15))
print(padded_sequences[0])
print(padded_sequences[1])

[468  31  19 208 759 599 760 761  19  31   5 360   2   1   3]
[  0   0   0   0   0   0   0   0   0   0   0   4   8 361 762]


In [23]:
from sentiment_model import load_train_test
from keras.models import Sequential
from keras.optimizers import Adam, RMSprop
from keras.layers import GlobalAveragePooling1D, Embedding, Dense, PReLU
from keras.layers.embeddings import Embedding

# Load the prepared sentiment dataset for 30-minute windows. This rebinds the
# train/test names used by the price model above.
(train_features, train_labels, train_idx,
 test_features, test_labels, test_idx) = load_train_test(window='30min')

# Embedding -> average over the sequence -> small dense head with a 3-way softmax.
model = Sequential([
    Embedding(10000, 200, input_length=10000),
    GlobalAveragePooling1D(),
    Dense(64),
    PReLU(),
    Dense(3, activation='softmax'),
])
model.compile(
    optimizer=RMSprop(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    train_features, train_labels,
    batch_size=16,
    epochs=120,
    verbose=2,
    validation_data=(test_features, test_labels),
)

# Most likely class per sample.
train_preds = model.predict(train_features).argmax(axis=-1)
test_preds = model.predict(test_features).argmax(axis=-1)

Train on 949 samples, validate on 408 samples
Epoch 1/120
 - 15s - loss: 1.0864 - acc: 0.3741 - val_loss: 1.1002 - val_acc: 0.3186
Epoch 2/120
 - 13s - loss: 1.0834 - acc: 0.3709 - val_loss: 1.1135 - val_acc: 0.3186
Epoch 3/120
 - 13s - loss: 1.0831 - acc: 0.3920 - val_loss: 1.1158 - val_acc: 0.3186
Epoch 4/120
 - 13s - loss: 1.0824 - acc: 0.3888 - val_loss: 1.1141 - val_acc: 0.3186
Epoch 5/120
 - 14s - loss: 1.0807 - acc: 0.3994 - val_loss: 1.1066 - val_acc: 0.3186
Epoch 6/120
 - 13s - loss: 1.0762 - acc: 0.3973 - val_loss: 1.1112 - val_acc: 0.3186
Epoch 7/120
 - 13s - loss: 1.0740 - acc: 0.4268 - val_loss: 1.0979 - val_acc: 0.3799
Epoch 8/120
 - 13s - loss: 1.0700 - acc: 0.4341 - val_loss: 1.1049 - val_acc: 0.3799
Epoch 9/120
 - 13s - loss: 1.0545 - acc: 0.4626 - val_loss: 1.1379 - val_acc: 0.3186
Epoch 10/120
 - 14s - loss: 1.0418 - acc: 0.4721 - val_loss: 1.1048 - val_acc: 0.3799
Epoch 11/120
 - 13s - loss: 1.0316 - acc: 0.5005 - val_loss: 1.1047 - val_acc: 0.3922
Epoch 12/120
 - 1

Epoch 96/120
 - 12s - loss: 0.1119 - acc: 0.9526 - val_loss: 2.0768 - val_acc: 0.3750
Epoch 97/120
 - 13s - loss: 0.1039 - acc: 0.9579 - val_loss: 2.2700 - val_acc: 0.3578
Epoch 98/120
 - 13s - loss: 0.1055 - acc: 0.9579 - val_loss: 2.0919 - val_acc: 0.3922
Epoch 99/120
 - 12s - loss: 0.1137 - acc: 0.9515 - val_loss: 2.1338 - val_acc: 0.3529
Epoch 100/120
 - 12s - loss: 0.1175 - acc: 0.9547 - val_loss: 3.0479 - val_acc: 0.3284
Epoch 101/120
 - 12s - loss: 0.1140 - acc: 0.9494 - val_loss: 2.7482 - val_acc: 0.3922
Epoch 102/120
 - 12s - loss: 0.1160 - acc: 0.9505 - val_loss: 2.3915 - val_acc: 0.3995
Epoch 103/120
 - 12s - loss: 0.1137 - acc: 0.9505 - val_loss: 1.9230 - val_acc: 0.3554
Epoch 104/120
 - 12s - loss: 0.1181 - acc: 0.9494 - val_loss: 1.9611 - val_acc: 0.3922
Epoch 105/120
 - 12s - loss: 0.1175 - acc: 0.9536 - val_loss: 1.9878 - val_acc: 0.3431
Epoch 106/120
 - 14s - loss: 0.1113 - acc: 0.9494 - val_loss: 1.8241 - val_acc: 0.3578
Epoch 107/120
 - 13s - loss: 0.1205 - acc: 0.94

## Backtesting

In [24]:
# Overlay the sentiment model's up (1) / down (2) predictions on the training-period closes.
train_idx = pd.to_datetime(train_idx)
ups = train_idx[train_preds == 1]
downs = train_idx[train_preds == 2]

first_ts, last_ts = train_idx[0], train_idx[-1]
closes = candles.close.loc[first_ts:last_ts]

price_trace = go.Scatter(x=closes.index, y=closes)
up_trace = go.Scatter(
    x=ups, y=closes[ups], mode='markers',
    marker={'color': 'green', 'size': 8, 'symbol': 'triangle-up'},
)
down_trace = go.Scatter(
    x=downs, y=closes[downs], mode='markers',
    marker={'color': 'red', 'size': 8, 'symbol': 'triangle-down'},
)
iplot([price_trace, up_trace, down_trace])

In [25]:
# Overlay the sentiment model's up (1) / down (2) predictions on the test-period closes.
test_idx = pd.to_datetime(test_idx)
ups = test_idx[test_preds == 1]
downs = test_idx[test_preds == 2]

first_ts, last_ts = test_idx[0], test_idx[-1]
closes = candles.close.loc[first_ts:last_ts]

price_trace = go.Scatter(x=closes.index, y=closes)
up_trace = go.Scatter(
    x=ups, y=closes[ups], mode='markers',
    marker={'color': 'green', 'size': 8, 'symbol': 'triangle-up'},
)
down_trace = go.Scatter(
    x=downs, y=closes[downs], mode='markers',
    marker={'color': 'red', 'size': 8, 'symbol': 'triangle-down'},
)
iplot([price_trace, up_trace, down_trace])

In [26]:
from backtest import backtest_preds

# Backtest the sentiment model's training-period predictions (no slippage)
# and plot the cumulative PnL against the candle timestamps.
span_start, span_end = train_idx[0], train_idx[-1]
closes = candles.close.loc[span_start:span_end]
train_signals = train_preds.astype(np.int32)
pnl = backtest_preds(train_signals, closes, slippage=0.)

equity_trace = go.Scatter(x=closes.index, y=pnl.cumsum())
iplot([equity_trace])

In [27]:
from backtest import backtest_preds

# Backtest the sentiment model's test-period predictions (no slippage)
# and plot the cumulative PnL against the candle timestamps.
span_start, span_end = test_idx[0], test_idx[-1]
closes = candles.close.loc[span_start:span_end]
test_signals = test_preds.astype(np.int32)
pnl = backtest_preds(test_signals, closes, slippage=0.)

equity_trace = go.Scatter(x=closes.index, y=pnl.cumsum())
iplot([equity_trace])

# Thank You