In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Flatten, Dense
from sklearn.ensemble import RandomForestClassifier

# Import data

In [3]:
data = pd.read_csv('../../raw_data/bitstampUSD.csv')

# Clean data

In [4]:
data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s', origin='unix')

In [5]:
data = data[["Timestamp", "Open"]].fillna(method='ffill')

In [6]:
def open_diff_col(data):
    data['Open_diff'] = data["Open"].diff()
    clean_data = data[1:]
    return clean_data

In [7]:
cleaned_data = open_diff_col(data)

In [8]:
data_sample = cleaned_data[2798176:]
data_test = data_sample[1829602:]

In [9]:
def y_encoding(data):
    data['Coded'] = data['Open_diff'].map(lambda x: 0 if x <= 0 else 1)
    return data

In [10]:
y_encoded = y_encoding(data_test)
y_encoded.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Coded'] = data['Open_diff'].map(lambda x: 0 if x <= 0 else 1)


Unnamed: 0,Timestamp,Open,Open_diff,Coded
4627779,2020-10-22 13:23:00,12955.46,14.34,1
4627780,2020-10-22 13:24:00,12959.98,4.52,1
4627781,2020-10-22 13:25:00,12959.01,-0.97,0
4627782,2020-10-22 13:26:00,12949.05,-9.96,0
4627783,2020-10-22 13:27:00,12952.39,3.34,1


# Dumb baseline model

In [11]:
baseline_sample = data_sample[:1000000]
y_base = y_encoding(baseline_sample)
base = y_base[['Coded']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Coded'] = data['Open_diff'].map(lambda x: 0 if x <= 0 else 1)


In [12]:
train_size = 0.6
index = round(train_size*base.shape[0])
df_train = base.iloc[:index]
df_test = base.iloc[index+1:]

In [13]:
y_pred = df_test.shift(1).dropna()
y_true = df_test[1:]
print(f"Accuracy:{accuracy_score(y_true, y_pred)}")

Accuracy:0.49721748608743044


# Simple Ridge Classifier model

In [14]:
y_base.shape

(1000000, 4)

In [21]:
y_base.head()

Unnamed: 0,Timestamp,Open,Open_diff,Coded
2798177,2017-05-01 00:01:00,1352.41,3.53,1
2798178,2017-05-01 00:02:00,1349.49,-2.92,0
2798179,2017-05-01 00:03:00,1350.11,0.62,1
2798180,2017-05-01 00:04:00,1351.25,1.14,1
2798181,2017-05-01 00:05:00,1351.24,-0.01,0


In [11]:
def input_data(data, sample_size, shift_size, train_size):

    data_size = data.shape[0]
    sample = data.iloc[(data_size-sample_size):data_size]
    sample_pp = sample[['Open_diff', 'Timestamp']].set_index("Timestamp").fillna(method='ffill')


    for i in range(1, shift_size+1):
        sample_pp[f't - {i}'] = sample_pp['Open_diff'].shift(i)
    sample_shifted = sample_pp.dropna() 


    X = sample_shifted.drop(columns=['Open_diff'])
    y = sample_shifted['Open_diff']


    X_train = X.iloc[0:train_size]
    y_train = y.iloc[0:train_size]
    X_test = X.iloc[(train_size+1):(sample_size-shift_size)]
    y_test = y.iloc[(train_size+1):(sample_size-shift_size)]
    
    return X_train, X_test, y_train, y_test

In [12]:
X_train, X_test, y_train, y_test = input_data(y_base, 10000, 20, 6000)

NameError: name 'y_base' is not defined

In [13]:
y_test.head(20)

NameError: name 'y_test' is not defined

In [14]:
y_train[y_train > 0] = 1
y_train[y_train <= 0] = 0
y_test[y_test > 0] =1
y_test[y_test <= 0] = 0

NameError: name 'y_train' is not defined

In [9]:
def ridge_classifier(X_train, X_test, y_train, y_test):
    log_reg = RidgeClassifier()
    log_reg = log_reg.fit(X_train, y_train)
    results = log_reg.predict(X_test)
    score = log_reg.score(X_test, y_test)
    return score

In [10]:
ridge_classifier(X_train, X_test, y_train, y_test)

NameError: name 'X_train' is not defined

In [45]:
score = log_reg.score(X_test, y_test)
score

0.6154812767026892

## Random Forest Classifier

In [69]:
rf = RandomForestClassifier()
rf = rf.fit(X_train, y_train)
result = rf.predict(X_test)
result[:20]

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [70]:
rf_score = rf.score(X_test, y_test)
rf_score

0.6084443327469213

### with cross-val

In [73]:
cv_results = cross_validate(RandomForestClassifier(), X_train, y_train, cv=10)

In [79]:
cv_array = cv_results['test_score']
cv_array

array([0.56166667, 0.56333333, 0.54166667, 0.57333333, 0.47666667,
       0.575     , 0.58166667, 0.53666667, 0.60833333, 0.55666667])

In [80]:
cv_accuracy = cv_array.mean()
cv_accuracy

0.5575

## Functions for modeling

In [7]:
def preprocessing_data(data, shift_size, h=1):
    data_pp = data[2798176:4727776]
    data_pp['Timestamp'] = pd.to_datetime(data_pp['Timestamp'], unit='s', origin='unix')
    data_pp = data_pp[['Open', 'Timestamp']].set_index("Timestamp").fillna(method='ffill')
    data_pp
    data_pp['diff_Open'] = data_pp['Open'].diff(h)
    data_pp['diff_Open'] = data_pp['diff_Open'].dropna()
    data_pp[f"t+{h}"] = data_pp['diff_Open'].shift(-h)
    for i in range(0, shift_size):
        data_pp[f't-{i}'] = data_pp['Open'].shift(i)
    data_shifted = data_pp.dropna()
    X = data_shifted.drop(columns=['Open', 'diff_Open', f"t+{h}"])
    y = data_shifted[f"t+{h}"]
    y[y > 0] = 1
    y[y <= 0] = 0
    return X, y, data_shifted

def input_data(data, sample_size, shift_size, train_size, h=1, w=0):
    X, y, data_shifted = preprocessing_data(data, shift_size, h)
    data_size = data_shifted.shape[0]
    sample_X = X.iloc[(data_size-sample_size-w):data_size-w]
    sample_y = y.iloc[(data_size-sample_size-w):data_size-w]
    X_train = sample_X.iloc[0:train_size]
    y_train = sample_y.iloc[0:train_size]
    X_test = sample_X.iloc[(train_size+h-1):(sample_size-shift_size)]
    y_test = sample_y.iloc[(train_size+h-1):(sample_size-shift_size)]
    return X_train, X_test, y_train, y_test

In [None]:
def initialize_model():
    model = 

In [None]:
def compile_model(model):
    model.compile(loss='binary_crossentropy', 
              optimizer='rmsprop',
              metrics=['accuracy'])
    return model

In [None]:
model = initialize_model()
model = compile_model(model)

In [None]:
es = EarlyStopping(patience=10, restore_best_weights=True)
history = model.fit(X_train[:1000], y_train[:1000],
                    validation_split=0.3,
                    epochs=200,
                    batch_size=32,
                    callbacks=[es], 
                    verbose=1)

In [None]:
model.evaluate(X_test, y_test, verbose=2)

# More advanced model

In [None]:
# from tensorflow.keras.utils import to_categorical
# y_train = to_categorical(labels_train)
# y_test = to_categorical(labels_test)
# y_test.shape

# def initialize_model():
#     # YOUR CODE HERE
#     model = models.Sequential()
    
#     model.add(layers.Conv2D(32, (3,3), activation='relu', padding='same', input_shape=(32, 32, 3)))
#     model.add(layers.MaxPool2D(pool_size=(2,2)))
    
#     model.add(layers.Conv2D(64, (3,3), activation='relu', padding='same'))
#     model.add(layers.MaxPool2D(pool_size=(2,2)))
    
#     model.add(layers.Conv2D(128, (3,3), activation='relu', padding='same'))
#     model.add(layers.MaxPool2D(pool_size=(3,3)))
              
#     model.add(layers.Flatten())
              
#     model.add(layers.Dense(120, activation='relu'))
#     model.add(layers.Dense(60, activation='relu'))
#     model.add(layers.Dropout(rate=0.5))
#     model.add(layers.Dense(10, activation='softmax'))
    
#     return model

# def compile_model(model):
#     # YOUR CODE HERE
#     model.compile(loss='categorical_crossentropy', 
#               optimizer='adam',
#               metrics=['accuracy'])
#     return model

# model = initialize_model()
# model = compile_model(model)

# es = EarlyStopping(patience=10, restore_best_weights=True)
# history = model.fit(X_train[:1000], y_train[:1000],
#                     validation_split=0.3,
#                     epochs=200,
#                     batch_size=32,
#                     callbacks=[es], 
#                     verbose=1)

# # YOUR CODE HERE
# model.evaluate(X_test, y_test, verbose=2)

In [81]:
# model = Sequential()
# model.add(layers.SimpleRNN(units=1, activation='tanh'))
# model.add(layers.Dense(1, activation="relu"))

# # The compilation
# model.compile(loss='binary_crossentropy', 
#               optimizer='rmsprop',
#               metrics=['accuracy'])

# # The fit
# model.fit(X_train, y_train,
#             validation_split=0.2,
#          batch_size=16,
#          epochs=5, verbose=0)

# # The prediction
# model.predict(X) 

In [2]:
# model_LSTM = Sequential()
# model_LSTM.add(LSTM(units=20, activation='tanh'))
# model_LSTM.add(Dense(10, activation='relu'))
# model_LSTM.add(Dense(1, activation='linear'))

# model_LSTM = compile_model(model_LSTM)

# es = EarlyStopping(patience=5)

# model_LSTM.fit(X_train, y_train,
#           epochs=100, 
#           batch_size=32, 
#           validation_split=0.2,
#           callbacks=[es])
# model_LSTM.evaluate(X_test, y_test)

In [3]:
# lstm_model = Sequential()
# lstm_model.add(layers.Masking(mask_value=-1000))
# lstm_model.add(layers.LSTM(units=10, activation='tanh')) 
# lstm_model.add(layers.Dense(20, activation="tanh"))
# lstm_model.add(layers.Dense(5, activation='softmax'))

# lstm_model.compile(loss='categorical_crossentropy', 
#               optimizer='rmsprop',
#               metrics = 'accuracy')

# es = EarlyStopping(patience=2, restore_best_weights=True)

# lstm_history = lstm_model.fit(X_train, y_train,
#           validation_split = 0.2,
#           batch_size=16,
#           callbacks=[es],
#           epochs=50)

# lstm_model.evaluate(X_test, y_test, verbose=2)

In [4]:
# GRU_model = Sequential()
# GRU_model.add(layers.Masking(mask_value=-1000))
# GRU_model.add(layers.GRU(units=10, activation='tanh')) 
# GRU_model.add(layers.Dense(20, activation="tanh"))
# GRU_model.add(layers.Dense(5, activation='softmax'))

# GRU_model.compile(loss='categorical_crossentropy', 
#               optimizer='rmsprop',
#               metrics = 'accuracy')

# es = EarlyStopping(patience=2, restore_best_weights=True)

# GRU_history = lstm_model.fit(X_train, y_train,
#           validation_split = 0.2,
#           batch_size=16,
#           callbacks=[es],
#           epochs=50)

# GRU_model.evaluate(X_test, y_test, verbose=2)