In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Flatten, Dense, SimpleRNN
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.callbacks import EarlyStopping

# Import data

In [3]:
data = pd.read_csv('../../raw_data/bitstampUSD.csv')

# Clean data

In [4]:
data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s', origin='unix')

In [5]:
data = data[["Timestamp", "Open"]].fillna(method='ffill')

In [6]:
def open_diff_col(data):
    data['Open_diff'] = data["Open"].diff()
    clean_data = data[1:]
    return clean_data

In [7]:
cleaned_data = open_diff_col(data)

In [8]:
data_sample = cleaned_data[2798176:]
data_test = data_sample[1829602:]

In [9]:
def y_encoding(data):
    data['Coded'] = data['Open_diff'].map(lambda x: 0 if x <= 0 else 1)
    return data

In [10]:
y_encoded = y_encoding(data_test)
y_encoded.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Coded'] = data['Open_diff'].map(lambda x: 0 if x <= 0 else 1)


Unnamed: 0,Timestamp,Open,Open_diff,Coded
4627779,2020-10-22 13:23:00,12955.46,14.34,1
4627780,2020-10-22 13:24:00,12959.98,4.52,1
4627781,2020-10-22 13:25:00,12959.01,-0.97,0
4627782,2020-10-22 13:26:00,12949.05,-9.96,0
4627783,2020-10-22 13:27:00,12952.39,3.34,1


# Dumb baseline model

In [11]:
baseline_sample = data_sample[:1000000]
y_base = y_encoding(baseline_sample)
base = y_base[['Coded']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Coded'] = data['Open_diff'].map(lambda x: 0 if x <= 0 else 1)


In [12]:
train_size = 0.6
index = round(train_size*base.shape[0])
df_train = base.iloc[:index]
df_test = base.iloc[index+1:]

In [13]:
y_pred = df_test.shift(1).dropna()
y_true = df_test[1:]
print(f"Accuracy:{accuracy_score(y_true, y_pred)}")

Accuracy:0.49721748608743044


# Simple Ridge Classifier model

In [14]:
y_base.shape

(1000000, 4)

In [15]:
y_base.head()

Unnamed: 0,Timestamp,Open,Open_diff,Coded
2798177,2017-05-01 00:01:00,1352.41,3.53,1
2798178,2017-05-01 00:02:00,1349.49,-2.92,0
2798179,2017-05-01 00:03:00,1350.11,0.62,1
2798180,2017-05-01 00:04:00,1351.25,1.14,1
2798181,2017-05-01 00:05:00,1351.24,-0.01,0


In [16]:
def input_data(data, sample_size, shift_size, train_size):

    data_size = data.shape[0]
    sample = data.iloc[(data_size-sample_size):data_size]
    sample_pp = sample[['Open_diff', 'Timestamp']].set_index("Timestamp").fillna(method='ffill')


    for i in range(1, shift_size+1):
        sample_pp[f't - {i}'] = sample_pp['Open_diff'].shift(i)
    sample_shifted = sample_pp.dropna() 


    X = sample_shifted.drop(columns=['Open_diff'])
    y = sample_shifted['Open_diff']


    X_train = X.iloc[0:train_size]
    y_train = y.iloc[0:train_size]
    X_test = X.iloc[(train_size+1):(sample_size-shift_size)]
    y_test = y.iloc[(train_size+1):(sample_size-shift_size)]
    
    return X_train, X_test, y_train, y_test

In [17]:
X_train, X_test, y_train, y_test = input_data(y_base, 10000, 20, 6000)

In [18]:
y_test.head(20)

Timestamp
2019-03-23 16:22:00    1.17
2019-03-23 16:23:00    0.00
2019-03-23 16:24:00    0.00
2019-03-23 16:25:00    0.40
2019-03-23 16:26:00    0.00
2019-03-23 16:27:00   -0.40
2019-03-23 16:28:00   -2.87
2019-03-23 16:29:00    0.00
2019-03-23 16:30:00    0.40
2019-03-23 16:31:00   -0.92
2019-03-23 16:32:00   -0.49
2019-03-23 16:33:00    0.35
2019-03-23 16:34:00    0.00
2019-03-23 16:35:00   -0.35
2019-03-23 16:36:00    0.24
2019-03-23 16:37:00    0.04
2019-03-23 16:38:00    0.00
2019-03-23 16:39:00    0.71
2019-03-23 16:40:00    0.00
2019-03-23 16:41:00    0.00
Name: Open_diff, dtype: float64

In [19]:
y_train[y_train > 0] = 1
y_train[y_train <= 0] = 0
y_test[y_test > 0] =1
y_test[y_test <= 0] = 0

In [20]:
def ridge_classifier(X_train, X_test, y_train, y_test):
    log_reg = RidgeClassifier()
    log_reg = log_reg.fit(X_train, y_train)
    results = log_reg.predict(X_test)
    score = log_reg.score(X_test, y_test)
    return score

In [21]:
ridge_classifier(X_train, X_test, y_train, y_test)

0.6154812767026892

## Random Forest Classifier

In [22]:
rf = RandomForestClassifier()
rf = rf.fit(X_train, y_train)
result = rf.predict(X_test)
result[:20]

array([0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0.])

In [23]:
rf_score = rf.score(X_test, y_test)
rf_score

0.6054284996230208

### with cross-val

In [24]:
cv_results = cross_validate(RandomForestClassifier(), X_train, y_train, cv=10)

In [25]:
cv_array = cv_results['test_score']
cv_array

array([0.58833333, 0.545     , 0.54333333, 0.585     , 0.49166667,
       0.57833333, 0.58      , 0.51333333, 0.60833333, 0.52      ])

In [26]:
cv_accuracy = cv_array.mean()
cv_accuracy

0.5553333333333332

## Functions for modeling

In [27]:
def preprocessing_data(data, shift_size, h=1):
    data_pp = data[2798176:4727776]
    data_pp['Timestamp'] = pd.to_datetime(data_pp['Timestamp'], unit='s', origin='unix')
    data_pp = data_pp[['Open', 'Timestamp']].set_index("Timestamp").fillna(method='ffill')
    data_pp['diff_Open'] = data_pp['Open'].diff(h)
    data_pp['diff_Open'] = data_pp['diff_Open'].dropna()
    data_pp[f"t+{h}"] = data_pp['diff_Open'].shift(-h)
    for i in range(0, shift_size):
        data_pp[f't-{i}'] = data_pp['Open'].shift(i)
    data_shifted = data_pp.dropna()
    X = data_shifted.drop(columns=['Open', 'diff_Open', f"t+{h}"])
    y = data_shifted[f"t+{h}"].copy()
    y[y > 0] = 1
    y[y <= 0] = 0
    return X, y, data_shifted

def input_data(data, sample_size, shift_size, train_size, h=1, w=0):
    X, y, data_shifted = preprocessing_data(data, shift_size, h)
    data_size = data_shifted.shape[0]
    sample_X = X.iloc[(data_size-sample_size-w):data_size-w]
    sample_y = y.iloc[(data_size-sample_size-w):data_size-w]
    X_train = sample_X.iloc[0:train_size]
    y_train = sample_y.iloc[0:train_size]
    X_test = sample_X.iloc[(train_size+h-1):(sample_size-shift_size)]
    y_test = sample_y.iloc[(train_size+h-1):(sample_size-shift_size)]
    return X_train, X_test, y_train, y_test

In [73]:
X_train, X_test, y_train, y_test = input_data(data, 3000, 5, 2000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_pp['Timestamp'] = pd.to_datetime(data_pp['Timestamp'], unit='s', origin='unix')


In [74]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2000, 5), (2000,), (995, 5), (995,))

In [None]:
def deep_reshape(X_train, X_test, y_train, y_test):
    

In [75]:
X_retrain, y_retrain = np.array(X_train), np.array(y_train)
X_retrain = np.reshape(X_retrain, (X_retrain.shape[0], X_retrain.shape[1], 1))

In [76]:
X_retest, y_retest = np.array(X_test), np.array(y_test)
X_retest = np.reshape(X_retest, (X_retest.shape[0], X_retest.shape[1], 1))

In [None]:
def initialize_model():
    model = Sequential()
    model.add(layers.SimpleRNN(units=10, activation='tanh'))
    model.add(layers.Dense(1, activation='sigmoid'))
    return model

In [None]:
def compile_model(model):
    model.compile(loss='binary_crossentropy', 
              optimizer='rmsprop',
              metrics=['accuracy'])
    return model

In [None]:
model = initialize_model()
model = compile_model(model)

In [None]:
es = EarlyStopping(patience=2, restore_best_weights=True)
history = model.fit(X_retrain, y_retrain,
#                     validation_split=0.3,
                    epochs=50,
                    batch_size=32,
                    callbacks=[es])

In [None]:
model.evaluate(X_retest, y_retest, verbose=2)

In [77]:
def initialize_model():
    model = Sequential()
    model.add(layers.SimpleRNN(units=10, activation='tanh'))
    model.add(layers.Dense(1, activation='sigmoid'))
    return model

In [78]:
def compile_model(model):
    model.compile(loss='binary_crossentropy', 
              optimizer='rmsprop',
              metrics=['accuracy'])
    return model

In [79]:
model = initialize_model()
model = compile_model(model)

In [80]:
es = EarlyStopping(patience=2, restore_best_weights=True)
history = model.fit(X_retrain, y_retrain,
#                     validation_split=0.3,
                    epochs=50,
                    batch_size=32,
                    callbacks=[es])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50


Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [81]:
model.evaluate(X_retest, y_retest, verbose=2)

32/32 - 0s - loss: 0.6930 - accuracy: 0.5226


[0.6930075883865356, 0.5226130485534668]

## basic RNN results

In [63]:
# (data, 12000, 10, 8000, h=1, w=0)
# units=10, Dense 5, Dense 1
# loss: 0.6931475400924683, accuracy: 0.49724310636520386]

# More advanced model

In [None]:
# model = Sequential()
# model.add(layers.SimpleRNN(units=1, activation='tanh'))
# model.add(layers.Dense(1, activation="relu"))

# # The compilation
# model.compile(loss='binary_crossentropy', 
#               optimizer='rmsprop',
#               metrics=['accuracy'])

# # The fit
# model.fit(X_train, y_train,
#             validation_split=0.2,
#          batch_size=16,
#          epochs=5, verbose=0)

# # The prediction
# model.predict(X) 

In [None]:
# lstm_model = Sequential()
# lstm_model.add(layers.Masking(mask_value=-1000))
# lstm_model.add(layers.LSTM(units=10, activation='tanh')) 
# lstm_model.add(layers.Dense(20, activation="tanh"))
# lstm_model.add(layers.Dense(5, activation='softmax'))

# lstm_model.compile(loss='categorical_crossentropy', 
#               optimizer='rmsprop',
#               metrics = 'accuracy')

# es = EarlyStopping(patience=2, restore_best_weights=True)

# lstm_history = lstm_model.fit(X_train, y_train,
#           validation_split = 0.2,
#           batch_size=16,
#           callbacks=[es],
#           epochs=50)

# lstm_model.evaluate(X_test, y_test, verbose=2)

In [None]:
# GRU_model = Sequential()
# GRU_model.add(layers.Masking(mask_value=-1000))
# GRU_model.add(layers.GRU(units=10, activation='tanh')) 
# GRU_model.add(layers.Dense(20, activation="tanh"))
# GRU_model.add(layers.Dense(5, activation='softmax'))

# GRU_model.compile(loss='categorical_crossentropy', 
#               optimizer='rmsprop',
#               metrics = 'accuracy')

# es = EarlyStopping(patience=2, restore_best_weights=True)

# GRU_history = lstm_model.fit(X_train, y_train,
#           validation_split = 0.2,
#           batch_size=16,
#           callbacks=[es],
#           epochs=50)

# GRU_model.evaluate(X_test, y_test, verbose=2)