In [35]:
import pandas as pd
import datetime as dt

dataset = pd.read_csv('./data/BTC-Daily.csv')

# convert column date of dataset to datetime
dataset['date'] = pd.to_datetime(dataset['date'])

# filter dataset to get data from 2015-01-01 to 2016-12-31
dataset = dataset[(dataset['date'] >= dt.datetime(2015, 1, 1)) & (dataset['date'] <= dt.datetime(2020, 12, 31))]

# describe dataset
dataset.describe()

Unnamed: 0,unix,open,high,low,close,Volume BTC,Volume USD
count,2192.0,2192.0,2192.0,2192.0,2192.0,2192.0,2192.0
mean,1514722000.0,5123.166638,5272.377637,4960.78266,5136.057441,16040490.0,34885620.0
std,54684310.0,4798.694398,4955.22607,4633.449376,4824.461973,51436560.0,56146240.0
min,1420070000.0,162.0,212.84,152.4,162.0,0.0,0.0
25%,1467396000.0,579.7375,588.0375,572.8525,581.76,7283.284,8078.573
50%,1514722000.0,4097.235,4208.365,3958.025,4099.775,932872.8,27596.54
75%,1562047000.0,8629.675,8835.3425,8314.7575,8629.715,4648965.0,52259200.0
max,1609373000.0,28893.21,29300.0,27930.75,28992.79,772329500.0,516210800.0


In [36]:
import numpy as np

# preserve only open, high, low, close, volume usd columns
dataset = dataset[['open', 'high', 'low', 'close', 'Volume USD']]
dataset.describe()

Unnamed: 0,open,high,low,close,Volume USD
count,2192.0,2192.0,2192.0,2192.0,2192.0
mean,5123.166638,5272.377637,4960.78266,5136.057441,34885620.0
std,4798.694398,4955.22607,4633.449376,4824.461973,56146240.0
min,162.0,212.84,152.4,162.0,0.0
25%,579.7375,588.0375,572.8525,581.76,8078.573
50%,4097.235,4208.365,3958.025,4099.775,27596.54
75%,8629.675,8835.3425,8314.7575,8629.715,52259200.0
max,28893.21,29300.0,27930.75,28992.79,516210800.0


In [37]:
# extract close column
close = dataset['close'].copy()

# drop close column from dataset
dataset.drop('close', axis=1, inplace=True)

In [38]:
dataset.describe()

Unnamed: 0,open,high,low,Volume USD
count,2192.0,2192.0,2192.0,2192.0
mean,5123.166638,5272.377637,4960.78266,34885620.0
std,4798.694398,4955.22607,4633.449376,56146240.0
min,162.0,212.84,152.4,0.0
25%,579.7375,588.0375,572.8525,8078.573
50%,4097.235,4208.365,3958.025,27596.54
75%,8629.675,8835.3425,8314.7575,52259200.0
max,28893.21,29300.0,27930.75,516210800.0


In [39]:
close.describe()

count     2192.000000
mean      5136.057441
std       4824.461973
min        162.000000
25%        581.760000
50%       4099.775000
75%       8629.715000
max      28992.790000
Name: close, dtype: float64

In [40]:
# convert labels to boolean
close = close > close.shift(-1)
close = np.array([int(c) for c in close])

In [41]:
# enhance the dataset with the data from the previous n days
n = 15

dataset_enhanced = np.array(
    [ 
        np.column_stack([dataset.to_numpy()[j,:] for j in range(i-n, i)]).flatten()
        for i in range(n, len(dataset))
    ]
)
labels = close[n:]

# print shape of dataset_enhanced and labels
print(dataset_enhanced.shape)
print(labels.shape)

(2177, 60)
(2177,)


In [42]:
# split dataset into training and test set and classify them with a linear SVM
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset_enhanced, labels, test_size=0.2, random_state=42)



In [43]:
# try a linear SVC classifier

from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=0, tol=1e-5, max_iter=1e4)
clf.fit(X_train, y_train)

# print accuracy of the classifier
print(clf.score(X_test, y_test))

0.48394495412844035




In [44]:
# try a hoeffding tree classifier

from skmultiflow.trees import HoeffdingTree

clf = HoeffdingTree()
clf.fit(X_train, y_train)

# print accuracy of the classifier
print(clf.score(X_test, y_test))

The old name will be removed in v0.7.0


0.518348623853211


In [45]:
# naive model that predicts the last observation from the training data

y_pred = [y_test[i-1] for i in range(0, len(y_test))]

acc = sum([1 if y_pred[i] == y_test[i] else 0 for i in range(0, len(y_test))]) / len(y_test)

print('Accuracy:', acc)

Accuracy: 0.5045871559633027


In [59]:
# try a random forest classifier

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=1000, max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# print accuracy of the classifier
print(clf.score(X_test, y_test))

0.5206422018348624


In [65]:
# build a streaming random patches model

from skmultiflow.meta import StreamingRandomPatchesClassifier

clf = StreamingRandomPatchesClassifier()
clf.fit(X_train, y_train)

# print accuracy of the classifier
print(clf.score(X_test, y_test))

0.5160550458715596
