In [3]:
import pandas as pd
import numpy as np
import ta
import matplotlib.pyplot as plt

In [4]:
df = pd.DataFrame()

In [5]:
#merge data and set up original data
years = [2017, 2018, 2019, 2020, 2021]
df = pd.DataFrame()
for y in years:
    # might need to adjust file paths
    df_y =  pd.read_csv('Bitstamp_BTCUSD_{0}_minute.csv'.format(str(y)),)
    df_y = df_y.sort_values('date')
    df = df.append(df_y)

df = df.drop('symbol',axis = 1)
df['dates'] = pd.to_datetime(df['date']).dt.date
df['time'] = pd.to_datetime(df['date']).dt.time

#find missing dates
df_miss = pd.read_csv('BTC-USD.csv')
df_miss['Date'] = pd.to_datetime(df_miss['Date'])
df_miss.columns = ['date', 'open', 'high', 'low', 'close', 'adj close', 'Volume USD']
df_miss = df_miss.set_index('date')
df_miss['Volume BTC'] = df_miss['Volume USD'] / df_miss['close']
df_miss = df_miss[['open', 'close', 'high', 'low', 'Volume BTC', 'Volume USD']]

missing_dates = df_miss.index.tolist()

#get bigger dataset
bigdata = pd.read_csv('bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv')
bigdata=bigdata.drop('Weighted_Price',axis=1)
bigdata['datetime'] = pd.to_datetime(bigdata['Timestamp'],unit='s')
bigdata['dates'] = pd.to_datetime(bigdata['datetime']).dt.date
bigdata['time'] = pd.to_datetime(bigdata['datetime']).dt.time

cols = ['Timestamp', 'datetime', 'Open', 'High', 'Low', 'Close', 'Volume_(BTC)',
        'Volume_(Currency)', 'dates', 'time']

bigdata_rearranged = bigdata[cols]

missing_dates_df = pd.DataFrame(missing_dates)
missing_dates_df['dates'] = pd.to_datetime(missing_dates_df[0]).dt.date

missingvals = bigdata_rearranged.loc[bigdata_rearranged['dates'].isin(missing_dates_df['dates'])]

missingvals = missingvals.rename(columns=dict(zip(missingvals.columns,bigdata_rearranged.columns)))

missingvals = missingvals.rename(columns=dict(zip(missingvals.columns,df.columns)))

df = df.append(missingvals)

df['date'] = pd.to_datetime(df['date'])

df = df.sort_values(by=['date'])

df.dropna(inplace=True)

#now lets get the actual technical features we want to use

df['SMA200'] = ta.trend.sma_indicator(close=df['close'],window=288000) #200 day simple moving average
df['SMA20'] = ta.trend.sma_indicator(close=df['close'],window=28800) #20 day simple moving average

df['next_min_change'] = ta.momentum.roc(df['close'],window=1).shift(-1) #minute over minute return

df['next_min_upordown'] = np.where(df['next_min_change']>0,1,0) #minute over minute up or down

#features related to volume
df['change_in_volume'] = df['Volume BTC']- df['Volume BTC'].shift(1) #simple absolute change
df['PVO'] = ta.momentum.PercentageVolumeOscillator(df['Volume BTC']).pvo_signal() #percentage volume oscillator, see ta docs
#df['PVO_positive'] = np.where(df['PVO']>0,1,0) # variable toget whether pvo was positive or negative. 
df['ADI'] = ta.volume.AccDistIndexIndicator(df['high'], df['low'], df['close'], df['Volume BTC']).acc_dist_index() #see ta docs
df['MFI'] = ta.volume.MFIIndicator(df['high'], df['low'], df['close'], df['Volume BTC'], 30).money_flow_index() #see ta docs
df['OBV'] = ta.volume.OnBalanceVolumeIndicator(df['close'], df['Volume BTC']).on_balance_volume() #see ta docs

#momentum indicators
df['RSI'] = ta.momentum.RSIIndicator(df['close'], 30).rsi() #using last 30 minutes
df['TSI'] = ta.momentum.TSIIndicator(df['close'], 30, 15).tsi()

#crossover features

df['crossed_SMA200_breakdown']= np.where((df['open'] > df['SMA200']) & (df['close'] < df['SMA200']),1,0)#if price goes from above to below
df['crossed_SMA200_breakout'] = np.where((df['open'] < df['SMA200']) & (df['close'] > df['SMA200']),1,0)#if price goes from below to above




In [6]:
dataset = df.set_index(df['date'])
dataset = dataset.dropna()
X = dataset.drop(['date','unix','open','high','low','close','Volume BTC','Volume USD','dates','time','next_min_change','next_min_upordown','SMA200', 'SMA20'],axis = 1)

y = dataset['next_min_upordown']

print(X.shape,y.shape)


(2225700, 9) (2225700,)


In [7]:
#separate into train and validation using sklearn
from sklearn.model_selection import TimeSeriesSplit


ts_cv = TimeSeriesSplit(n_splits=4)

all_splits = list(ts_cv.split(X, y))

In [19]:
#lets try gradient boosting since its an ensemble method that can handle categorical and numerical values

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, GridSearchCV

gbclassifier = HistGradientBoostingClassifier()
scaler = MinMaxScaler()

pipeline = make_pipeline(scaler, gbclassifier)

def evaluate(model, X, y, cv):
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring= ["accuracy"]
    )
    acc = cv_results["test_accuracy"]
    print(f"mean:     {acc.mean():.3f} +/- {acc.std():.3f}\n")
    return cv_results


defaulthgb = evaluate(pipeline, X, y, cv=ts_cv)



mean:     0.544 +/- 0.015



In [17]:
gb = gbclassifier.fit(X,y)

score = gb.train_score_
print(score)

[-0.69160581 -0.69058283 -0.68969381 -0.68896011 -0.68831249 -0.68774181
 -0.68724666 -0.68679967 -0.68639675 -0.68604571 -0.68573198 -0.68546272
 -0.685207   -0.68498693 -0.68478668 -0.68460761 -0.68444189 -0.68428477
 -0.68414899 -0.6840114  -0.68389245 -0.68378909 -0.68366026 -0.68357153
 -0.68346591 -0.68338464 -0.68330285 -0.68323463 -0.68315737 -0.68309549
 -0.68303559 -0.6829671  -0.68290803 -0.68285231 -0.68280306 -0.6827548
 -0.68269258 -0.68265088 -0.68260106 -0.68256153 -0.6825146  -0.68247278
 -0.68243419 -0.68240051 -0.68236647 -0.68233415 -0.68230447 -0.68227301
 -0.68224637 -0.68220833 -0.68214806 -0.68212    -0.68208992 -0.68206902
 -0.68204221 -0.68202523 -0.68199995 -0.68194828 -0.68193186 -0.6819096
 -0.681891   -0.68186898 -0.68184718 -0.68182336 -0.68180603 -0.68179258
 -0.68177568 -0.68173642 -0.68170955 -0.68169545 -0.68168382 -0.68166525
 -0.68164674 -0.68163022 -0.68161556 -0.68160333 -0.68158122 -0.68155038
 -0.68152588 -0.68151127 -0.68148955 -0.68147298 -0.6

In [42]:
#trying to tune model parameters
import warnings
from sklearn.model_selection import GridSearchCV

warnings.filterwarnings("ignore")

gbclassifier = HistGradientBoostingClassifier()

parameters = {
    'max_iter': [1200],
    'learning_rate': [.05,0.1],
    'max_depth' : [25, 50, 75],
    'l2_regularization': [.5,1.5,2],
    'scoring': ['accuracy']
}
#start with gridsearch
hgb_grid = GridSearchCV(gbclassifier, parameters, n_jobs=5, cv=ts_cv, scoring='accuracy',refit=True)

hgb_grid.fit(X, y)

print(hgb_grid.best_params_)
# Print the best scores found
print()
print(hgb_grid.best_score_)


#THIS TAKES APROX 20-25 MINS TO RUN



{'l2_regularization': 2, 'learning_rate': 0.1, 'max_depth': 75, 'max_iter': 1200, 'scoring': 'accuracy'}

0.5452071258480478


In [35]:
from sklearn.model_selection import RandomizedSearchCV


gbclassifier = HistGradientBoostingClassifier()

parameters = {
    'max_iter': [1200],
    'learning_rate': np.linspace(0,.1,5),
    'max_depth' : [25, 50, 75],
    'l2_regularization': [.5,1.5,2],
    'scoring': ['accuracy']
}
#start with gridsearch
hgb_grid = GridSearchCV(gbclassifier, parameters, n_jobs=5, cv=ts_cv, scoring='accuracy',refit=True)

hgb_grid.fit(X, y)

print(hgb_grid.best_params_)
# Print the best scores found
print()
print(hgb_grid.best_score_)


sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier

In [8]:
X


Unnamed: 0_level_0,change_in_volume,PVO,ADI,MFI,OBV,RSI,TSI,crossed_SMA200_breakdown,crossed_SMA200_breakout
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-07-20 00:00:00,1.694373,0.181176,88201.924832,64.163608,100673.053895,53.148599,7.235874,0,0
2017-07-20 00:01:00,-1.325690,-3.138730,88201.924832,63.406532,100673.422579,53.148599,7.974957,0,0
2017-07-20 00:02:00,0.899670,-6.786668,88203.193185,64.848825,100674.690932,53.215311,8.634439,0,0
2017-07-20 00:03:00,-0.668278,-10.824877,88203.193185,64.806681,100675.291008,56.514148,10.814191,0,0
2017-07-20 00:04:00,4.031428,-13.418827,88207.824689,65.889557,100679.922511,59.523902,14.216984,0,0
...,...,...,...,...,...,...,...,...,...
2021-10-13 08:14:00,0.312500,22.139374,486458.730778,52.973447,321457.078265,48.518396,-1.533382,0,0
2021-10-13 08:15:00,0.639219,17.696744,486459.782895,53.327325,321458.130383,49.128386,-0.634889,0,0
2021-10-13 08:16:00,2.132965,13.601141,486462.967978,54.447195,321461.315466,50.292905,0.623182,0,0
2021-10-13 08:17:00,-2.970556,8.736866,486463.159241,54.698170,321461.529993,50.699139,1.855500,0,0


In [8]:
testdata = pd.read_csv('btc_test.csv',header=1)
testdata['dates'] = pd.to_datetime(testdata['date']).dt.date
testdata['time'] = pd.to_datetime(testdata['date']).dt.time

testdata = testdata.reindex(index=testdata.index[::-1])



In [9]:
testdata

Unnamed: 0,unix,date,symbol,open,high,low,close,Volume BTC,Volume USD,dates,time
482832,1609459260,2021-01-01 00:01:00,BTC/USD,29007.31,29086.90,29007.31,29083.47,14.561951,423512.060397,2021-01-01,00:01:00
482831,1609459320,2021-01-01 00:02:00,BTC/USD,29069.80,29073.02,29028.14,29035.89,3.030301,87987.499279,2021-01-01,00:02:00
482830,1609459380,2021-01-01 00:03:00,BTC/USD,29037.68,29069.39,29019.00,29048.13,2.189621,63604.382387,2021-01-01,00:03:00
482829,1609459440,2021-01-01 00:04:00,BTC/USD,29048.13,29057.73,29035.61,29045.19,1.446538,42014.959434,2021-01-01,00:04:00
482828,1609459500,2021-01-01 00:05:00,BTC/USD,29021.86,29023.38,28982.33,28999.50,1.062360,30807.899250,2021-01-01,00:05:00
...,...,...,...,...,...,...,...,...,...,...,...
4,1638428940,2021-12-02 07:09:00,BTC/USD,56820.07,56820.07,56709.10,56736.26,8.958970,508298.451252,2021-12-02,07:09:00
3,1638429000,2021-12-02 07:10:00,BTC/USD,56734.81,56771.36,56701.98,56750.03,5.959220,338185.935909,2021-12-02,07:10:00
2,1638429060,2021-12-02 07:11:00,BTC/USD,56743.11,56755.59,56709.34,56716.94,3.183078,180534.447912,2021-12-02,07:11:00
1,1638429120,2021-12-02 07:12:00,BTC/USD,56714.44,56745.75,56700.00,56718.09,5.755797,326457.817940,2021-12-02,07:12:00


In [10]:
#now lets get the actual technical features we want to use

testdata['SMA200'] = ta.trend.sma_indicator(close=testdata['close'],window=288000) #200 day simple moving average
testdata['SMA20'] = ta.trend.sma_indicator(close=testdata['close'],window=28800) #20 day simple moving average

testdata['next_min_change'] = ta.momentum.roc(testdata['close'],window=1).shift(-1) #minute over minute return

testdata['next_min_upordown'] = np.where(testdata['next_min_change']>0,1,0) #minute over minute up or down

#features related to volume
testdata['change_in_volume'] = testdata['Volume BTC']- testdata['Volume BTC'].shift(1) #simple absolute change
testdata['PVO'] = ta.momentum.PercentageVolumeOscillator(testdata['Volume BTC']).pvo_signal() #percentage volume oscillator, see ta docs
#testdata['PVO_positive'] = np.where(testdata['PVO']>0,1,0) # variable toget whether pvo was positive or negative. 
testdata['ADI'] = ta.volume.AccDistIndexIndicator(testdata['high'], testdata['low'], testdata['close'], testdata['Volume BTC']).acc_dist_index() #see ta docs
testdata['MFI'] = ta.volume.MFIIndicator(testdata['high'], testdata['low'], testdata['close'], testdata['Volume BTC'], 30).money_flow_index() #see ta docs
testdata['OBV'] = ta.volume.OnBalanceVolumeIndicator(testdata['close'], testdata['Volume BTC']).on_balance_volume() #see ta docs

#momentum indicators
testdata['RSI'] = ta.momentum.RSIIndicator(testdata['close'], 30).rsi() #using last 30 minutes
testdata['TSI'] = ta.momentum.TSIIndicator(testdata['close'], 30, 15).tsi()

#crossover features

testdata['crossed_SMA200_breakdown']= np.where((testdata['open'] > testdata['SMA200']) & (testdata['close'] < testdata['SMA200']),1,0)#if price goes from above to below
testdata['crossed_SMA200_breakout'] = np.where((testdata['open'] < testdata['SMA200']) & (testdata['close'] > testdata['SMA200']),1,0)#if price goes from below to above

In [11]:
dataset1 = testdata.set_index(testdata['date'])
dataset1 = dataset1.dropna()



dataset1 = dataset1.loc['2021-10-13 08:19:00':,:]

xTe= dataset1.drop(['date','unix','open','high','low','close','Volume BTC','Volume USD','dates','time','next_min_change','next_min_upordown','SMA200', 'SMA20','symbol'],axis = 1)

yTe = dataset1['next_min_upordown']

print(xTe.shape,yTe.shape)

(71871, 9) (71871,)


In [12]:
xTe

Unnamed: 0_level_0,change_in_volume,PVO,ADI,MFI,OBV,RSI,TSI,crossed_SMA200_breakdown,crossed_SMA200_breakout
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-10-13 08:19:00,-10.882752,6.610585,39768.267983,59.436018,-12847.657124,57.611844,9.261903,0,0
2021-10-13 08:20:00,2.981771,5.442149,39767.830649,60.836503,-12842.628371,58.013711,12.392487,0,0
2021-10-13 08:21:00,1.022570,4.902918,39767.587499,58.527773,-12848.679694,57.347593,14.687582,0,0
2021-10-13 08:22:00,-5.250151,2.997298,39768.388671,59.645820,-12847.878522,58.053943,16.916681,0,0
2021-10-13 08:23:00,3.682714,1.587217,39772.706228,61.168693,-12843.394635,59.314480,19.342718,0,0
...,...,...,...,...,...,...,...,...,...
2021-12-02 07:08:00,16.046290,8.672490,43367.396755,78.479629,-12452.026760,54.712761,16.236022,0,0
2021-12-02 07:09:00,-7.088676,20.814225,43362.823216,62.886867,-12460.985730,51.574731,14.988832,0,0
2021-12-02 07:10:00,-2.999750,29.474769,43365.118263,56.069260,-12455.026510,52.249915,14.238016,0,0
2021-12-02 07:11:00,-2.776142,34.641093,43362.981300,52.421711,-12458.209588,50.499569,12.699682,0,0


In [13]:
gb.score(xTe,yTe)

NameError: name 'gb' is not defined

In [56]:
'''
X.to_csv('xTr_minute_classiification.csv')
y.to_csv('yTr_minute_classification.csv')
xTe.to_csv('xTe_minute_classification.csv')
yTe.to_csv('yTe_minute_classification.csv')
'''

In [59]:
from sklearn.model_selection import RandomizedSearchCV
import warnings

warnings.filterwarnings("ignore")

gbclassifier = HistGradientBoostingClassifier()

parameters = {
    'max_iter': [1200],
    'learning_rate': np.linspace(0,.1,5),
    'max_depth' : [25, 50, 75],
    'l2_regularization': [.5,1.5,2],
    'scoring': ['accuracy']
}
#start with gridsearch
hgb_grid = RandomizedSearchCV(gbclassifier, parameters, n_jobs=5, cv=ts_cv, scoring='accuracy',refit=True)

hgb_grid.fit(X, y)

print(hgb_grid.best_params_)
# Print the best scores found
print()
print(hgb_grid.best_score_)



{'scoring': 'accuracy', 'max_iter': 1200, 'max_depth': 25, 'learning_rate': 0.07500000000000001, 'l2_regularization': 2}

0.5457137080469066


In [17]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=5,max_depth=3).fit(X,y)

#print('training score: ', rfc.score(X,y))
#print('test score: ', rfc.score(xTe,yTe))

In [19]:
rfc.score(X,y)


0.5410158601788202