In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Import data

In [3]:
data = pd.read_csv('../../raw_data/bitstampUSD.csv')

# Clean data

In [4]:
data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s', origin='unix')

In [5]:
data = data[["Timestamp", "Open"]].fillna(method='ffill')

In [6]:
def open_diff_col(data):
    data['Open_diff'] = data["Open"].diff()
    clean_data = data[1:]
    return clean_data

In [7]:
cleaned_data = open_diff_col(data)

In [8]:
data_sample = cleaned_data[2798176:]
data_test = data_sample[1829602:]

In [9]:
def y_encoding(data):
    data['Coded'] = data['Open_diff'].map(lambda x: 0 if x <= 0 else 1)
    return data

In [10]:
y_encoded = y_encoding(data_test)
y_encoded.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Coded'] = data['Open_diff'].map(lambda x: 0 if x <= 0 else 1)


Unnamed: 0,Timestamp,Open,Open_diff,Coded
4627779,2020-10-22 13:23:00,12955.46,14.34,1
4627780,2020-10-22 13:24:00,12959.98,4.52,1
4627781,2020-10-22 13:25:00,12959.01,-0.97,0
4627782,2020-10-22 13:26:00,12949.05,-9.96,0
4627783,2020-10-22 13:27:00,12952.39,3.34,1


# Dumb baseline model

In [53]:
baseline_sample = data_sample[:1000000]
y_base = y_encoding(baseline_sample)
base = y_base[['Coded']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Coded'] = data['Open_diff'].map(lambda x: 0 if x <= 0 else 1)


In [54]:
train_size = 0.6
index = round(train_size*base.shape[0])
df_train = base.iloc[:index]
df_test = base.iloc[index+1:]

In [55]:
y_pred = df_test.shift(1).dropna()
y_true = df_test[1:]
print(f"Accuracy:{accuracy_score(y_true, y_pred)}")

Accuracy:0.49721748608743044


# Simple Logistic Regression model

In [56]:
y_base.shape

(1000000, 4)

In [57]:
def input_data(data, sample_size, shift_size, train_size):

    data_size = data.shape[0]
    sample = data.iloc[(data_size-sample_size):data_size]
    sample_pp = sample[['Coded', 'Timestamp']].set_index("Timestamp").fillna(method='ffill')


    for i in range(1, shift_size+1):
        sample_pp[f't - {i}'] = sample_pp['Coded'].shift(i)
    sample_shifted = sample_pp.dropna() 


    X = sample_shifted.drop(columns=['Coded'])
    y = sample_shifted['Coded']


    X_train = X.iloc[0:train_size]
    y_train = y.iloc[0:train_size]
    X_test = X.iloc[(train_size+1):(sample_size-shift_size)]
    y_test = y.iloc[(train_size+1):(sample_size-shift_size)]
    
    return X_train, X_test, y_train, y_test

In [58]:
X_train, X_test, y_train, y_test = input_data(y_base, 500, 10, 100)

In [59]:
y_test.head(20)

Timestamp
2019-03-26 04:12:00    0
2019-03-26 04:13:00    0
2019-03-26 04:14:00    0
2019-03-26 04:15:00    1
2019-03-26 04:16:00    1
2019-03-26 04:17:00    0
2019-03-26 04:18:00    0
2019-03-26 04:19:00    0
2019-03-26 04:20:00    1
2019-03-26 04:21:00    0
2019-03-26 04:22:00    1
2019-03-26 04:23:00    1
2019-03-26 04:24:00    0
2019-03-26 04:25:00    0
2019-03-26 04:26:00    1
2019-03-26 04:27:00    0
2019-03-26 04:28:00    0
2019-03-26 04:29:00    0
2019-03-26 04:30:00    0
2019-03-26 04:31:00    0
Name: Coded, dtype: int64

In [60]:
log_reg = LogisticRegression()
log_reg = log_reg.fit(X_train, y_train)
results = log_reg.predict(X_test)
results[:20]

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [61]:
score = log_reg.score(X_test, y_test)
score

0.6246786632390745

# More advanced model