In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score
from sklearn.preprocessing import LabelBinarizer

In [3]:
data = pd.read_csv('../../raw_data/bitstampUSD.csv')

In [4]:
data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s', origin='unix')

In [5]:
# data_sample = data[2701596:]
data_sample = data[2798175:]
data_sample.shape

(1929602, 8)

## NaN Padding

In [6]:
data_sample.shape, data_sample.isnull().sum()

((1929602, 8),
 Timestamp                0
 Open                 55145
 High                 55145
 Low                  55145
 Close                55145
 Volume_(BTC)         55145
 Volume_(Currency)    55145
 Weighted_Price       55145
 dtype: int64)

In [7]:
data_sample[500:550]

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
2798675,2017-05-01 08:19:00,1359.72,1361.77,1359.72,1360.7,15.509,21093.617542,1360.088822
2798676,2017-05-01 08:20:00,1360.7,1360.71,1356.09,1356.09,1.009,1372.91471,1360.668692
2798677,2017-05-01 08:21:00,1360.71,1360.71,1360.71,1360.71,0.134475,182.981219,1360.71
2798678,2017-05-01 08:22:00,1360.7,1360.7,1360.7,1360.7,0.073538,100.063714,1360.7
2798679,2017-05-01 08:23:00,1356.16,1360.71,1356.16,1360.68,4.384365,5962.744701,1360.001884
2798680,2017-05-01 08:24:00,1360.66,1360.66,1360.61,1360.61,0.060187,81.892016,1360.621572
2798681,2017-05-01 08:25:00,1356.17,1356.17,1356.17,1356.17,0.258207,350.172085,1356.17
2798682,2017-05-01 08:26:00,1356.17,1356.17,1356.17,1356.17,0.120754,163.762369,1356.17
2798683,2017-05-01 08:27:00,1356.03,1356.16,1355.5,1356.16,1.7255,2339.122108,1355.619883
2798684,2017-05-01 08:28:00,1355.5,1356.16,1355.5,1356.16,0.505324,685.234779,1356.029391


In [8]:
# data_sample['diff_open_close'] = data_sample['Open'] - data_sample['Close']

In [9]:
data_sample.head()

Unnamed: 0,Timestamp,Open,High,Low,Close,Volume_(BTC),Volume_(Currency),Weighted_Price
2798175,2017-04-30 23:59:00,1350.21,1350.21,1350.21,1350.21,0.100502,135.699048,1350.21
2798176,2017-05-01 00:00:00,1348.88,1354.8,1348.88,1354.8,3.087374,4173.722673,1351.868203
2798177,2017-05-01 00:01:00,1352.41,1352.41,1352.41,1352.41,0.261954,354.269412,1352.41
2798178,2017-05-01 00:02:00,1349.49,1354.86,1349.49,1354.86,0.096311,130.384815,1353.783259
2798179,2017-05-01 00:03:00,1350.11,1351.25,1350.11,1351.25,0.260284,351.553973,1350.655803


In [10]:
data_sample.isnull().sum(), data_sample.shape

(Timestamp                0
 Open                 55145
 High                 55145
 Low                  55145
 Close                55145
 Volume_(BTC)         55145
 Volume_(Currency)    55145
 Weighted_Price       55145
 dtype: int64,
 (1929602, 8))

In [11]:
# plt.figure(figsize=(22,5))
# sns.lineplot(data=data_sample, x='Timestamp', y='diff_open_close')

In [12]:
# data_sample['diff_high_low'] = data_sample['High'] - data_sample['Low']

In [13]:
# plt.figure(figsize=(22,5))
# sns.lineplot(data=data_sample, x='Timestamp', y='diff_high_low')

In [14]:
# data_size = data_sample[:200]
# data_size.isnull().sum()

In [15]:
data_test = data_sample[["Timestamp", "Close"]]
# data_test['new_close'] = data_size['Close'].values
data_test['Previous'] = data_test['Close'].shift(1)
data_test.insert(loc=2, column='New', value=['' for i in range(data_test.shape[0])])
data_test[550:600]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test['Previous'] = data_test['Close'].shift(1)


Unnamed: 0,Timestamp,Close,New,Previous
2798725,2017-05-01 09:09:00,1362.0,,1362.0
2798726,2017-05-01 09:10:00,1362.0,,1362.0
2798727,2017-05-01 09:11:00,1362.0,,1362.0
2798728,2017-05-01 09:12:00,1362.0,,1362.0
2798729,2017-05-01 09:13:00,1361.99,,1362.0
2798730,2017-05-01 09:14:00,1361.99,,1361.99
2798731,2017-05-01 09:15:00,,,1361.99
2798732,2017-05-01 09:16:00,,,
2798733,2017-05-01 09:17:00,1361.98,,
2798734,2017-05-01 09:18:00,1361.98,,1361.98


In [16]:
def fill_blanks(row):
    if (isinstance(row['Close'], float)):
        row['New'] = row['Close']
    elif (isinstance(row['Close'], str)):
        row['New'] = row['Previous']
    else:
        row['New'] = ''
    return row

In [17]:
data_test = data_test.fillna('')
data_test.head()

Unnamed: 0,Timestamp,Close,New,Previous
2798175,2017-04-30 23:59:00,1350.21,,
2798176,2017-05-01 00:00:00,1354.8,,1350.21
2798177,2017-05-01 00:01:00,1352.41,,1354.8
2798178,2017-05-01 00:02:00,1354.86,,1352.41
2798179,2017-05-01 00:03:00,1351.25,,1354.86


In [18]:
clean_data = data_test.apply(fill_blanks, axis=1)

In [19]:
clean_data[510:560]

Unnamed: 0,Timestamp,Close,New,Previous
2798685,2017-05-01 08:29:00,1360.35,1360.35,1356.16
2798686,2017-05-01 08:30:00,1361.87,1361.87,1360.35
2798687,2017-05-01 08:31:00,1361.78,1361.78,1361.87
2798688,2017-05-01 08:32:00,1356.79,1356.79,1361.78
2798689,2017-05-01 08:33:00,1355.8,1355.8,1356.79
2798690,2017-05-01 08:34:00,1355.5,1355.5,1355.8
2798691,2017-05-01 08:35:00,1355.68,1355.68,1355.5
2798692,2017-05-01 08:36:00,1355.02,1355.02,1355.68
2798693,2017-05-01 08:37:00,1355.66,1355.66,1355.02
2798694,2017-05-01 08:38:00,1355.66,1355.66,1355.66


In [20]:
(clean_data['New'].values == '').sum()

10181

## Dumb Baseline Model

In [21]:
baseline_sample = data_sample
baseline_sample.shape

(1929602, 8)

In [27]:
baseline = baseline_sample[['Close']]
baseline['Previous'] = baseline['Close'].shift(1)
baseline['outcome'] = baseline['Close']- baseline['Previous']
# baseline.insert(loc=3, column='Coded', value=['' for i in range(baseline.shape[0])])
baseline = baseline.fillna(method='ffill')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baseline['Previous'] = baseline['Close'].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baseline['outcome'] = baseline['Close']- baseline['Previous']


In [28]:
f = lambda x: 0 if x <= 0 else 1
baseline['outcome'] = baseline['outcome'].map(f)

In [29]:
base = baseline[['outcome']]
base.head()

Unnamed: 0,outcome
2798175,1
2798176,1
2798177,0
2798178,1
2798179,0


In [30]:
train_size = 0.6
index = round(train_size*base.shape[0])
df_train = base.iloc[:index]
df_test = base.iloc[index+1:]

In [31]:
y_pred = df_test.shift(1).dropna()
y_true = df_test[1:]
print(f"Accuracy:{accuracy_score(y_true, y_pred)}")

Accuracy:0.45584765734822935
