In [2]:
import pandas as pd
import numpy as np
import ta
import matplotlib.pyplot as plt

In [3]:
# Start from an empty frame; the data-loading cell below reassigns `df`.
df = pd.DataFrame()

In [4]:
#merge data and set up original data
years = [2017, 2018, 2019, 2020, 2021]

# Read each yearly minute-bar file, order it by its `date` column, and collect
# the pieces so they can be concatenated in a single call. DataFrame.append
# was removed in pandas 2.0, and calling it inside the loop re-copied the
# growing frame on every iteration.
yearly_frames = []
for y in years:
    # might need to adjust file paths
    df_y = pd.read_csv('Bitstamp_BTCUSD_{0}_minute.csv'.format(str(y)))
    df_y = df_y.sort_values('date')
    yearly_frames.append(df_y)

# As with the previous append chain, every file keeps its own row labels.
df = pd.concat(yearly_frames)

df = df.drop('symbol',axis = 1)

# Parse the timestamp column once, then split it into a calendar-date column
# and a time-of-day column. `date` itself is left unparsed here; it is
# converted later, after the back-fill rows have been added.
parsed_dates = pd.to_datetime(df['date'])
df['dates'] = parsed_dates.dt.date
df['time'] = parsed_dates.dt.time

# Load the daily BTC-USD file; only its dates are used downstream, as the
# list of days to back-fill from the larger minute-level dataset.
df_miss = pd.read_csv('BTC-USD.csv')
df_miss = df_miss.assign(Date=pd.to_datetime(df_miss['Date']))

# Relabel the seven columns positionally so they follow the naming used by
# the minute-level frame.
df_miss.columns = ['date', 'open', 'high', 'low', 'close', 'adj close', 'Volume USD']
df_miss = df_miss.set_index('date')

# Derive a BTC-denominated volume by dividing the USD volume by the close.
df_miss['Volume BTC'] = df_miss['Volume USD'].div(df_miss['close'])
df_miss = df_miss.loc[:, ['open', 'close', 'high', 'low', 'Volume BTC', 'Volume USD']]

# Every date present in the daily file is treated as a date to back-fill.
missing_dates = df_miss.index.to_list()

# Load the long-history 1-minute Bitstamp file, minus the weighted-price column.
bigdata = pd.read_csv(
    'bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv'
).drop(columns='Weighted_Price')

# `Timestamp` is interpreted as seconds since the Unix epoch; the calendar
# date and time of day are derived from the same parsed values.
bar_times = pd.to_datetime(bigdata['Timestamp'], unit='s')
bigdata['datetime'] = bar_times
bigdata['dates'] = bar_times.dt.date
bigdata['time'] = bar_times.dt.time

# Column order matters: these names are later mapped positionally onto the
# columns of `df`.
cols = [
    'Timestamp', 'datetime',
    'Open', 'High', 'Low', 'Close',
    'Volume_(BTC)', 'Volume_(Currency)',
    'dates', 'time',
]

bigdata_rearranged = bigdata.loc[:, cols]

# Calendar dates (taken from the daily file's index) whose minute bars are
# pulled from the larger dataset.
missing_dates_df = pd.DataFrame(missing_dates)
missing_dates_df['dates'] = pd.to_datetime(missing_dates_df[0]).dt.date

# Keep only the minute bars that fall on one of those dates.
on_missing_date = bigdata_rearranged['dates'].isin(missing_dates_df['dates'])
missingvals = bigdata_rearranged.loc[on_missing_date]

# Relabel positionally onto df's column names. The earlier self-mapping rename
# (columns -> the same columns) was a no-op and has been removed.
# NOTE(review): this relies on df's columns being ordered like `cols`
# (timestamp, datetime, OHLC, the two volumes, dates, time) - confirm against the CSV header.
missingvals = missingvals.rename(columns=dict(zip(missingvals.columns,df.columns)))

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# in the same way and likewise keeps the existing row labels.
# NOTE(review): nothing here checks for timestamps already present in df -
# confirm the two sources do not overlap.
df = pd.concat([df, missingvals])

# `date` now mixes values from both sources; normalise it to datetimes before
# sorting chronologically.
df['date'] = pd.to_datetime(df['date'])

df = df.sort_values(by=['date'])

# Discard rows with any missing value (assignment instead of inplace=True).
df = df.dropna()

# Build the technical-analysis features and the prediction target.
close_px = df['close']
btc_volume = df['Volume BTC']

# Trend: simple moving averages over minute bars (1440 bars per day).
df['SMA200'] = ta.trend.sma_indicator(close=close_px, window=200 * 1440)  # 200-day SMA
df['SMA20'] = ta.trend.sma_indicator(close=close_px, window=20 * 1440)  # 20-day SMA

# Target: the one-bar rate of change shifted back by one row, so each row
# carries the move into the following bar, plus a 1/0 flag for "went up".
next_return = ta.momentum.roc(close_px, window=1).shift(-1)
df['next_min_change'] = next_return
df['next_min_upordown'] = np.where(next_return > 0, 1, 0)

# Volume features.
df['change_in_volume'] = btc_volume.diff()  # absolute change versus the previous bar
# Note: this stores the PVO *signal* line (pvo_signal), not the raw oscillator.
df['PVO'] = ta.momentum.PercentageVolumeOscillator(btc_volume).pvo_signal()
df['ADI'] = ta.volume.AccDistIndexIndicator(
    df['high'], df['low'], close_px, btc_volume
).acc_dist_index()  # see ta docs
df['MFI'] = ta.volume.MFIIndicator(
    df['high'], df['low'], close_px, btc_volume, 30
).money_flow_index()  # see ta docs
df['OBV'] = ta.volume.OnBalanceVolumeIndicator(close_px, btc_volume).on_balance_volume()  # see ta docs

# Momentum indicators.
df['RSI'] = ta.momentum.RSIIndicator(close_px, 30).rsi()  # using last 30 minutes
df['TSI'] = ta.momentum.TSIIndicator(close_px, 30, 15).tsi()

# Crossover flags: the bar opens on one side of the 200-day SMA and closes on
# the other.
sma200 = df['SMA200']
opened_above = df['open'] > sma200
closed_below = close_px < sma200
df['crossed_SMA200_breakdown'] = np.where(opened_above & closed_below, 1, 0)  # above -> below

opened_below = df['open'] < sma200
closed_above = close_px > sma200
df['crossed_SMA200_breakout'] = np.where(opened_below & closed_above, 1, 0)  # below -> above




In [5]:
# Index the rows by timestamp and discard any row with a missing value (this
# includes the rows where the long moving averages are not yet defined).
dataset = df.set_index(df['date']).dropna()

# Raw prices/volumes, bookkeeping columns, the SMAs and the targets are not
# used as model inputs.
non_feature_cols = [
    'date', 'unix',
    'open', 'high', 'low', 'close',
    'Volume BTC', 'Volume USD',
    'dates', 'time',
    'next_min_change', 'next_min_upordown',
    'SMA200', 'SMA20',
]
X = dataset.drop(columns=non_feature_cols)

# Binary target: 1 when the next bar's change is positive, otherwise 0.
y = dataset['next_min_upordown']

print(X.shape, y.shape)


(2225700, 9) (2225700,)


In [6]:
#separate into train and validation using sklearn
from sklearn.model_selection import TimeSeriesSplit


# Four-fold time-series splitter, reused by every model evaluation below.
ts_cv = TimeSeriesSplit(n_splits=4)

# Materialise the (train indices, test indices) pairs for inspection.
all_splits = [*ts_cv.split(X, y)]

In [13]:
# Let's try gradient boosting, since it's an ensemble method that can handle both categorical and numerical values

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, GridSearchCV

# Baseline model: min-max scaling followed by a histogram-based
# gradient-boosting classifier with library-default hyperparameters.
scaler, gbclassifier = MinMaxScaler(), HistGradientBoostingClassifier()
baseline_steps = [scaler, gbclassifier]
pipeline = make_pipeline(*baseline_steps)

def evaluate(model, X, y, cv):
    """Cross-validate ``model`` on accuracy, print a summary, and return the results.

    Parameters
    ----------
    model : estimator or pipeline handed to ``cross_validate``.
    X, y : feature matrix and target passed through unchanged.
    cv : cross-validation splitter (or anything ``cross_validate`` accepts as ``cv``).

    Returns
    -------
    The dict returned by ``cross_validate``; per-fold accuracies are stored
    under ``"test_accuracy"``.
    """
    results = cross_validate(model, X, y, cv=cv, scoring=["accuracy"])

    fold_accuracy = results["test_accuracy"]
    mean_acc, std_acc = fold_accuracy.mean(), fold_accuracy.std()
    print(f"mean:     {mean_acc:.3f} +/- {std_acc:.3f}\n")

    return results


# Score the baseline pipeline with the time-series splitter; as the last
# expression in the cell, the returned results dict is displayed as well.
evaluate(pipeline, X, y, cv=ts_cv)

mean:     0.545 +/- 0.016



{'fit_time': array([ 5.95303988,  9.67890882, 16.37884808, 20.01375484]),
 'score_time': array([0.91013122, 0.8376503 , 1.09089804, 0.85246897]),
 'test_accuracy': array([0.56547603, 0.54208339, 0.55059083, 0.52043851])}

In [42]:
#trying to tune model parameters
import warnings
from sklearn.model_selection import GridSearchCV

# Silence all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

gbclassifier = HistGradientBoostingClassifier()

# Candidate hyperparameters; every combination is tried. The `scoring` entry
# is a parameter of the classifier itself, separate from the `scoring`
# argument given to GridSearchCV below.
parameters = dict(
    max_iter=[1200],
    learning_rate=[.05, 0.1],
    max_depth=[25, 50, 75],
    l2_regularization=[.5, 1.5, 2],
    scoring=['accuracy'],
)

# Exhaustive search scored on accuracy over the time-series folds; the best
# configuration is refit at the end.
# NOTE: takes approximately 20-25 minutes to run.
hgb_grid = GridSearchCV(
    estimator=gbclassifier,
    param_grid=parameters,
    n_jobs=5,
    cv=ts_cv,
    scoring='accuracy',
    refit=True,
)
hgb_grid.fit(X, y)

# Best parameter set, a blank line, then its mean cross-validated accuracy.
print(hgb_grid.best_params_, "", hgb_grid.best_score_, sep="\n")



{'l2_regularization': 2, 'learning_rate': 0.1, 'max_depth': 75, 'max_iter': 1200, 'scoring': 'accuracy'}

0.5452071258480478


In [35]:
from sklearn.model_selection import RandomizedSearchCV


gbclassifier = HistGradientBoostingClassifier()

# HistGradientBoostingClassifier requires a strictly positive learning_rate,
# so the 0.0 endpoint of the linspace is dropped: every candidate using it
# failed to fit and only burned search time. The remaining values are the
# same ones as before (0.025, 0.05, 0.075, 0.1).
positive_learning_rates = np.linspace(0,.1,5)[1:]

# Candidate hyperparameters. The `scoring` entry is a parameter of the
# classifier itself, separate from the `scoring` argument given to the search.
parameters = {
    'max_iter': [1200],
    'learning_rate': positive_learning_rates,
    'max_depth' : [25, 50, 75],
    'l2_regularization': [.5,1.5,2],
    'scoring': ['accuracy']
}

# Exhaustive search over the finer learning-rate grid, scored on accuracy
# across the time-series folds; the best configuration is refit at the end.
# NOTE(review): RandomizedSearchCV is imported for this cell but never used -
# swap it in here if sampling the grid was the intent.
hgb_grid = GridSearchCV(gbclassifier, parameters, n_jobs=5, cv=ts_cv, scoring='accuracy',refit=True)

hgb_grid.fit(X, y)

print(hgb_grid.best_params_)
# Print the best scores found
print()
print(hgb_grid.best_score_)


sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingClassifier

dict_keys(['categorical_features', 'early_stopping', 'l2_regularization', 'learning_rate', 'loss', 'max_bins', 'max_depth', 'max_iter', 'max_leaf_nodes', 'min_samples_leaf', 'monotonic_cst', 'n_iter_no_change', 'random_state', 'scoring', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [8]:
# Display the feature matrix to sanity-check its columns and date index.
X


Unnamed: 0_level_0,change_in_volume,PVO,ADI,MFI,OBV,RSI,TSI,crossed_SMA200_breakdown,crossed_SMA200_breakout
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-07-20 00:00:00,1.694373,0.181176,88201.924832,64.163608,100673.053895,53.148599,7.235874,0,0
2017-07-20 00:01:00,-1.325690,-3.138730,88201.924832,63.406532,100673.422579,53.148599,7.974957,0,0
2017-07-20 00:02:00,0.899670,-6.786668,88203.193185,64.848825,100674.690932,53.215311,8.634439,0,0
2017-07-20 00:03:00,-0.668278,-10.824877,88203.193185,64.806681,100675.291008,56.514148,10.814191,0,0
2017-07-20 00:04:00,4.031428,-13.418827,88207.824689,65.889557,100679.922511,59.523902,14.216984,0,0
...,...,...,...,...,...,...,...,...,...
2021-10-13 08:14:00,0.312500,22.139374,486458.730778,52.973447,321457.078265,48.518396,-1.533382,0,0
2021-10-13 08:15:00,0.639219,17.696744,486459.782895,53.327325,321458.130383,49.128386,-0.634889,0,0
2021-10-13 08:16:00,2.132965,13.601141,486462.967978,54.447195,321461.315466,50.292905,0.623182,0,0
2021-10-13 08:17:00,-2.970556,8.736866,486463.159241,54.698170,321461.529993,50.699139,1.855500,0,0
