In [1]:
# imports 
import time
import dateparser
import pytz
import json
import talib
import pandas as pd 

from datetime import datetime
from binance.client import Client

from keras.models import Sequential
from keras.layers import Dense

In [2]:
def date_to_milliseconds(date_str): # date to milliseconds
    """Convert UTC date to milliseconds
    If using offset strings add "UTC" to date string e.g. "now UTC", "11 hours ago UTC"
    See dateparse docs for formats http://dateparser.readthedocs.io/en/latest/
    :param date_str: date in readable format, i.e. "January 01, 2018", "11 hours ago UTC", "now UTC"
    :type date_str: str
    """
    # get epoch value in UTC
    epoch = datetime.utcfromtimestamp(0).replace(tzinfo=pytz.utc)
    # parse our date string
    d = dateparser.parse(date_str)
    # if the date is not timezone aware apply UTC timezone
    if d.tzinfo is None or d.tzinfo.utcoffset(d) is None:
        d = d.replace(tzinfo=pytz.utc)

    # return the difference in time
    return int((d - epoch).total_seconds() * 1000.0)

In [3]:
def interval_to_milliseconds(interval): # interval to milliseconds
    """Convert a Binance interval string to milliseconds
    :param interval: Binance interval string 1m, 3m, 5m, 15m, 30m, 1h, 2h, 4h, 6h, 8h, 12h, 1d, 3d, 1w
    :type interval: str
    :return:
         None if unit not one of m, h, d or w
         None if string not in correct format
         int value of interval in milliseconds
    """
    ms = None
    seconds_per_unit = {
        "m": 60,
        "h": 60 * 60,
        "d": 24 * 60 * 60,
        "w": 7 * 24 * 60 * 60
    }

    unit = interval[-1]
    if unit in seconds_per_unit:
        try:
            ms = int(interval[:-1]) * seconds_per_unit[unit] * 1000
        except ValueError:
            pass
    return ms

In [4]:
def get_historical_klines(symbol, interval, start_str, end_str=None):
    """Get Historical Klines from Binance
    See dateparse docs for valid start and end string formats http://dateparser.readthedocs.io/en/latest/
    If using offset strings for dates add "UTC" to date string e.g. "now UTC", "11 hours ago UTC"
    :param symbol: Name of symbol pair e.g BNBBTC
    :type symbol: str
    :param interval: Biannce Kline interval
    :type interval: str
    :param start_str: Start date string in UTC format
    :type start_str: str
    :param end_str: optional - end date string in UTC format
    :type end_str: str
    :return: list of OHLCV values
    """
    # create the Binance client, no need for api key
    client = Client("", "")

    # init our list
    output_data = []

    # setup the max limit
    limit = 500

    # convert interval to useful value in seconds
    timeframe = interval_to_milliseconds(interval)

    # convert our date strings to milliseconds
    start_ts = date_to_milliseconds(start_str)

    # if an end time was passed convert it
    end_ts = None
    if end_str:
        end_ts = date_to_milliseconds(end_str)

    idx = 0
    # it can be difficult to know when a symbol was listed on Binance so allow start time to be before list date
    symbol_existed = False
    while True:
        # fetch the klines from start_ts up to max 500 entries or the end_ts if set
        temp_data = client.get_klines(
            symbol=symbol,
            interval=interval,
            limit=limit,
            startTime=start_ts,
            endTime=end_ts
        )

        # handle the case where our start date is before the symbol pair listed on Binance
        if not symbol_existed and len(temp_data):
            symbol_existed = True

        if symbol_existed:
            # append this loops data to our output data
            output_data += temp_data

            # update our start timestamp using the last value in the array and add the interval timeframe
            start_ts = temp_data[len(temp_data) - 1][0] + timeframe
        else:
            # it wasn't listed yet, increment our start date
            start_ts += timeframe

        idx += 1
        # check if we received less than the required limit and exit the loop
        if len(temp_data) < limit:
            # exit the while loop
            break

        # sleep after every 3rd call to be kind to the API
        if idx % 3 == 0:
            time.sleep(1)

    return output_data


# Load Dataset

In [5]:
# setting desired dataset characteristics
symbol = "ETHUSDT" 
start = "1 Nov, 2020"
end = "1 Nov, 2021"
interval = Client.KLINE_INTERVAL_12HOUR  # https://github.com/sammchardy/python-binance/blob/master/binance/client.py


**Binance's get_klines returns the following observations:**
* 'Open time'
* 'Open'
* 'High'
* 'Low'
* 'Close'
* 'Volume'
* 'Close time'
* 'Quote asset volume'
* 'Number of trades'
* 'Taker buy base asset volume'
* 'Taker buy quote asset volume'
* 'Ignore'


**The get_historical_klines returns a n-dimensional array. The oberservation's index of the n'th dimension structured:**
* [0]: Open time
* [1]: Open
* [2]: High
* [3]: Low
* [4]: Close
* [5]: Volume
* [6]: Close time
* [7]: Quote asset volume
* [8]: Number of trades
* [9]: Taker buy base asset volume
* [10]: Taker buy quote asset volume
* [11]: Ignore


In [6]:
# requesting dataset into pandas df 
columns = ['Open time', 'Open',
'High','Low','Close','Volume','Close time','Quote asset volume','Number of trades','Taker buy base asset volume','Taker buy quote asset volume','Ignore']
dataset = pd.DataFrame(get_historical_klines(symbol, interval, start, end), columns=columns)

  date_obj = stz.localize(date_obj)


# Exploring and Cleaning the Dataset

In [7]:
# shape of data
dataset.shape

(731, 12)

In [8]:
# look at head of data
dataset.head()

Unnamed: 0,Open time,Open,High,Low,Close,Volume,Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume,Ignore
0,1604188800000,386.44,390.65,384.61,386.74,163277.9985,1604231999999,63237715.4504805,73393,80950.74287,31360857.7630129,0
1,1604232000000,386.75,398.29,386.1,396.67,275674.95715,1604275199999,108000602.2498434,107976,147334.95573,57739971.5223248,0
2,1604275200000,396.68,404.5,381.62,384.12,474842.96215,1604318399999,187022268.6900098,170021,232481.18258,91675387.5588087,0
3,1604318400000,384.06,388.63,377.92,383.02,305125.61742,1604361599999,117201277.2020246,120349,150443.07821,57804403.5251973,0
4,1604361600000,383.01,385.59,370.23,379.61,426229.01686,1604404799999,160984297.0912375,154543,199910.31928,75535923.1014113,0


## Data Types

In [9]:
# Observation Data Types
dataset.dtypes

Open time                        int64
Open                            object
High                            object
Low                             object
Close                           object
Volume                          object
Close time                       int64
Quote asset volume              object
Number of trades                 int64
Taker buy base asset volume     object
Taker buy quote asset volume    object
Ignore                          object
dtype: object

In [10]:
# Convert all observations to floats
dataset= dataset.astype(float)

## Descriptive Statistics on Variables

In [11]:
# pd.set_option('display.max_rows', None)
print(dataset.describe())

          Open time         Open         High          Low        Close  \
count  7.310000e+02   731.000000   731.000000   731.000000   731.000000   
mean   1.619957e+12  2154.377674  2217.083174  2085.067004  2159.763981   
std    9.122363e+09  1062.030048  1088.230791  1033.010850  1063.038904   
min    1.604189e+12   379.570000   385.590000   370.230000   379.610000   
25%    1.612073e+12  1378.800000  1437.600000  1320.075000  1386.005000   
50%    1.619957e+12  2112.220000  2160.520000  2055.000000  2113.600000   
75%    1.627841e+12  3044.155000  3144.920000  2956.280000  3052.665000   
max    1.635725e+12  4418.890000  4460.470000  4296.640000  4418.890000   

             Volume    Close time  Quote asset volume  Number of trades  \
count  7.310000e+02  7.310000e+02        7.310000e+02      7.310000e+02   
mean   4.955231e+05  1.620000e+12        9.594586e+08      6.255232e+05   
std    3.226686e+05  9.122363e+09        7.019499e+08      3.643294e+05   
min    1.212453e+05  1.6

In [12]:
# remove the "Ignore" column | 731 data points and all are zero
dataset.pop('Ignore')
dataset.columns

Index(['Open time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Close time',
       'Quote asset volume', 'Number of trades', 'Taker buy base asset volume',
       'Taker buy quote asset volume'],
      dtype='object')

In [13]:
# correlation between all the columns of a dataframe
dataset.corr()

Unnamed: 0,Open time,Open,High,Low,Close,Volume,Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume
Open time,1.0,0.899955,0.897999,0.904648,0.899967,-0.410129,1.0,0.220271,0.248501,-0.41419,0.220866
Open,0.899955,1.0,0.998592,0.996428,0.996089,-0.305076,0.899955,0.417204,0.416113,-0.310382,0.416979
High,0.897999,0.998592,1.0,0.996162,0.998075,-0.288968,0.897999,0.434918,0.435092,-0.292754,0.436467
Low,0.904648,0.996428,0.996162,1.0,0.997848,-0.349886,0.904648,0.364436,0.367838,-0.352726,0.366779
Close,0.899967,0.996089,0.998075,0.997848,1.0,-0.315046,0.899967,0.404041,0.406456,-0.317141,0.407423
Volume,-0.410129,-0.305076,-0.288968,-0.349886,-0.315046,1.0,-0.410129,0.635413,0.626646,0.99817,0.634077
Close time,1.0,0.899955,0.897999,0.904648,0.899967,-0.410129,1.0,0.220271,0.248501,-0.41419,0.220866
Quote asset volume,0.220271,0.417204,0.434918,0.364436,0.404041,0.635413,0.220271,1.0,0.966832,0.629036,0.998388
Number of trades,0.248501,0.416113,0.435092,0.367838,0.406456,0.626646,0.248501,0.966832,1.0,0.622209,0.968058
Taker buy base asset volume,-0.41419,-0.310382,-0.292754,-0.352726,-0.317141,0.99817,-0.41419,0.629036,0.622209,1.0,0.630645


**timestamps (open time and close time):*** 
* both have high correlations with price action and negative/miniscul correlations with the volumn oberserations. 
* the correlation between open time and close time is 100%  because they occure are a set frequency apart. 
 
**price action:***
* open has highest correlation with high 
* high has highest correlation with close 
* low has highest correlation with open 
* close has highest correlation with high 
 
**volumne:***
* volumne has highest negative correlation with high 

**questions to answer:**
 * what role do timestamps play on the influence of t(x+1)'s close price

# Defining Our Question
we want to know if t(x + 1)'s close price will greater or less than t(x)'s close price. 
first lets look at the relationship between t(x + 1)'s close and all of t(x)'s obervations 

In [15]:
# create a new column from 'close' and more it down one period 
datasetCopy = dataset.copy()
datasetCopy["T One Close"] = datasetCopy["Close"].shift(periods=-1) # dataset["T Zero Close"] = dataset["Close"].shift(periods=1)
datasetCopy.head()

Unnamed: 0,Open time,Open,High,Low,Close,Volume,Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume,T One Close
0,1604189000000.0,386.44,390.65,384.61,386.74,163277.9985,1604232000000.0,63237720.0,73393.0,80950.74287,31360860.0,396.67
1,1604232000000.0,386.75,398.29,386.1,396.67,275674.95715,1604275000000.0,108000600.0,107976.0,147334.95573,57739970.0,384.12
2,1604275000000.0,396.68,404.5,381.62,384.12,474842.96215,1604318000000.0,187022300.0,170021.0,232481.18258,91675390.0,383.02
3,1604318000000.0,384.06,388.63,377.92,383.02,305125.61742,1604362000000.0,117201300.0,120349.0,150443.07821,57804400.0,379.61
4,1604362000000.0,383.01,385.59,370.23,379.61,426229.01686,1604405000000.0,160984300.0,154543.0,199910.31928,75535920.0,387.88


In [16]:
# correlation between all the columns of a dataframe
datasetCopy.corr()

Unnamed: 0,Open time,Open,High,Low,Close,Volume,Close time,Quote asset volume,Number of trades,Taker buy base asset volume,Taker buy quote asset volume,T One Close
Open time,1.0,0.899955,0.897999,0.904648,0.899967,-0.410129,1.0,0.220271,0.248501,-0.41419,0.220866,0.899572
Open,0.899955,1.0,0.998592,0.996428,0.996089,-0.305076,0.899955,0.417204,0.416113,-0.310382,0.416979,0.992362
High,0.897999,0.998592,1.0,0.996162,0.998075,-0.288968,0.897999,0.434918,0.435092,-0.292754,0.436467,0.994312
Low,0.904648,0.996428,0.996162,1.0,0.997848,-0.349886,0.904648,0.364436,0.367838,-0.352726,0.366779,0.994157
Close,0.899967,0.996089,0.998075,0.997848,1.0,-0.315046,0.899967,0.404041,0.406456,-0.317141,0.407423,0.996072
Volume,-0.410129,-0.305076,-0.288968,-0.349886,-0.315046,1.0,-0.410129,0.635413,0.626646,0.99817,0.634077,-0.311597
Close time,1.0,0.899955,0.897999,0.904648,0.899967,-0.410129,1.0,0.220271,0.248501,-0.41419,0.220866,0.899572
Quote asset volume,0.220271,0.417204,0.434918,0.364436,0.404041,0.635413,0.220271,1.0,0.966832,0.629036,0.998388,0.402924
Number of trades,0.248501,0.416113,0.435092,0.367838,0.406456,0.626646,0.248501,0.966832,1.0,0.622209,0.968058,0.408022
Taker buy base asset volume,-0.41419,-0.310382,-0.292754,-0.352726,-0.317141,0.99817,-0.41419,0.629036,0.622209,1.0,0.630645,-0.31375


In [17]:
# remove low negative correlations to T One Close
dataset.pop('Volume') # Volumne
dataset.pop('Taker buy base asset volume') # Taker buy base asset volume

# potentially remove mid correlations to T One Close
#dataset.pop('Quote asset volume') # Quote asset volume
#dataset.pop('Number of trades') # Number of trades
#dataset.pop('Taker buy quote asset volume') # Taker buy quote asset volume'
dataset.columns

Index(['Open time', 'Open', 'High', 'Low', 'Close', 'Close time',
       'Quote asset volume', 'Number of trades',
       'Taker buy quote asset volume'],
      dtype='object')

In [18]:
# add classification column and class
dataset["Class"] = 0 # dataset[(dataset.shape[1])] = 0
# dataset = dataset.iloc[:,0:6] # remove columns if added to many
dataset.head()

Unnamed: 0,Open time,Open,High,Low,Close,Close time,Quote asset volume,Number of trades,Taker buy quote asset volume,Class
0,1604189000000.0,386.44,390.65,384.61,386.74,1604232000000.0,63237720.0,73393.0,31360860.0,0
1,1604232000000.0,386.75,398.29,386.1,396.67,1604275000000.0,108000600.0,107976.0,57739970.0,0
2,1604275000000.0,396.68,404.5,381.62,384.12,1604318000000.0,187022300.0,170021.0,91675390.0,0
3,1604318000000.0,384.06,388.63,377.92,383.02,1604362000000.0,117201300.0,120349.0,57804400.0,0
4,1604362000000.0,383.01,385.59,370.23,379.61,1604405000000.0,160984300.0,154543.0,75535920.0,0


In [19]:
# convert dataframe to array
datasetArr = dataset.values

In [20]:
# add classification to class column
for i in range(1, datasetArr.shape[0]):
    if datasetArr[i, (dataset.columns.get_loc("Close"))] <= datasetArr[i - 1, (dataset.columns.get_loc("Close"))] : # t x + 1 is less than or equal to t
        datasetArr[i, (dataset.columns.get_loc("Class"))] = 0
    elif datasetArr[i, (dataset.columns.get_loc("Close"))] > datasetArr[i - 1, (dataset.columns.get_loc("Close"))]: # t is greater than
        datasetArr[i, (dataset.columns.get_loc("Class"))] = 1

# Modeling  - [WORKING]

In [23]:
from keras.models import Sequential
from keras.layers import Dense

In [37]:
# set dimension index 
startDim = 0
endDim = dataset.columns.get_loc("Class")

# input variables (X)
X = datasetArr[:,startDim:endDim] # select the first 8 columns from index 0 to index 7 via the slice 0:8

# output variables (y)
y = datasetArr[:,endDim]

In [38]:
# define the keras model
model = Sequential()
model.add(Dense(12, input_dim=endDim, activation='relu')) # input_dim=12 : number of obervations in x
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model

<keras.engine.sequential.Sequential at 0x135963580>

In [43]:
# Compile Keras Model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [46]:
# fit the keras model on the dataset
# model.fit(X, y, epochs=150, batch_size=2, verbose=0)
model.fit(X, y, epochs=10, batch_size=252) 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x135d42520>

In [29]:
# evaluate the keras model
_, accuracy = model.evaluate(X, y) # _, accuracy = model.evaluate(X, y, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 54.58


In [169]:
# make class predictions with the model - round predictions
predictions = (model.predict(X) > 0.5).astype(int)

In [None]:
# summarize the first 5 cases
for i in range(2):
	print('%s => %d (expected %d)' % (X[i].tolist(), predictions[i], y[i]))