# Plan of action
- Load data
- Check for time gaps
- Remove NaNs
- Reindex data and fill time gaps

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import pickle

# Load the data

In [None]:
# Directory containing the CSV files. The trailing slash matters because the
# file names below are appended with plain string concatenation.
data_folder = "/Users/dominguez/Documents/Kaggle_G_Research_Crypto/data/"

# train.csv -> `train`, asset_details.csv -> `asset_details`
train, asset_details = (
    pd.read_csv(data_folder + csv_name)
    for csv_name in ("train.csv", "asset_details.csv")
)

In [None]:
# Preview the first rows of the `train` frame (displayed as the cell output).
train.head()

In [None]:
# Display the full `asset_details` frame read from asset_details.csv.
asset_details

In [None]:
# What is the time range for our dataset?
#
# `timestamp` holds Unix epoch seconds. Two deliberate choices here:
#   * Series.min()/.max() are vectorised; the builtin min()/max() would walk
#     the whole column element by element in Python.
#   * pd.to_datetime(..., unit='s') reads the epoch value as UTC, so the
#     printed range is the same on every machine. datetime.fromtimestamp()
#     without a tz argument converts to the *local* timezone instead.
train_start_date = pd.to_datetime(train.timestamp.min(), unit='s').strftime('%Y-%m-%d %H:%M:%S')
train_end_date = pd.to_datetime(train.timestamp.max(), unit='s').strftime('%Y-%m-%d %H:%M:%S')
print(train_start_date)
print(train_end_date)

# Clean the data

In [None]:
# Focus on BTC data, Asset_ID = 1

In [None]:
# Keep only the rows whose Asset_ID is 1 (BTC) and index them by timestamp.
btc = train.loc[lambda frame: frame["Asset_ID"] == 1].set_index("timestamp")

# Show the resulting frame as the cell output.
btc

In [None]:
# Count the missing (NaN) values in each column of `btc`.
btc.isna().sum()

In [None]:
# Print a summary of `btc` (columns, dtypes, memory usage);
# show_counts=True requests the per-column non-null counts as well.
btc.info(show_counts=True)

In [None]:
# Missing asset data, for a given minute, is not represented by NaNs, but instead by the absence of those rows.
# We can check the timestamp difference between consecutive rows to see if there is missing data.

In [None]:
# Spacing between each timestamp and the one before it. Tabulating the spacings
# shows how often each step size occurs (only the most frequent ones are shown).
step_sizes = btc.index[1:] - btc.index[:-1]
step_sizes.value_counts().head()

In [None]:
# Notice that there are many gaps in the data.
# To work with most time series models, we should preprocess our data into a format without time gaps.
# To fill the gaps, we can use the .reindex() method for forward filling,
# filling gaps with the previous valid value.

In [None]:
# Remove the NaNs: drop every row that has a NaN in at least one column.
# The dropped rows become gaps in the time series; the gaps are filled later.
btc.dropna(axis="index", how="any", inplace=True)

In [None]:
# First and last BTC timestamps as human-readable strings.
# The index holds Unix epoch seconds; pd.to_datetime(..., unit='s') reads them
# as UTC, so the printed range does not depend on the timezone of the machine
# running the notebook (datetime.fromtimestamp() without tz uses local time).
beg_btc = pd.to_datetime(btc.index[0], unit='s').strftime('%Y-%m-%d %H:%M:%S')
end_btc = pd.to_datetime(btc.index[-1], unit='s').strftime('%Y-%m-%d %H:%M:%S')

print('BTC data goes from ', beg_btc, 'to ', end_btc)

In [None]:
# Use .reindex() to fill the gaps
first_ts, last_ts = btc.index[0], btc.index[-1]

# One label every 60 seconds starting at first_ts. range() excludes its stop
# value, so 60 is added to keep last_ts when it lies on that 60-second grid.
minute_grid = range(first_ts, last_ts + 60, 60)

# method='pad' forward-fills: a label missing from the old index takes the
# values of the closest earlier row.
btc = btc.reindex(minute_grid, method='pad')

# Re-tabulate the spacing between consecutive timestamps after reindexing.
(btc.index[1:] - btc.index[:-1]).value_counts().head()

In [None]:
# Re-count the NaNs per column of `btc`, now that it has been reindexed.
btc.isna().sum()

In [2]:
# Pickling stuff.
# To (re)write the pickle from the in-memory `btc` frame, uncomment:
# with open("./data/btc.pickle", "wb") as f:
#     pickle.dump(btc, f)

# Restore `btc` from the pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open("./data/btc.pickle", mode="rb") as pickle_file:
    btc = pickle.load(pickle_file)

# Show the loaded frame as the cell output.
btc

Unnamed: 0_level_0,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1514764860,1,229.0,13835.19400,14013.80,13666.11,13850.176000,31.550062,13827.062093,-0.014643
1514764920,1,235.0,13835.03600,14052.30,13680.00,13828.102000,31.046432,13840.362591,-0.015037
1514764980,1,528.0,13823.90000,14000.40,13601.00,13801.314000,55.061820,13806.068014,-0.010309
1514765040,1,435.0,13802.51200,13999.00,13576.28,13768.040000,38.780529,13783.598101,-0.008999
1514765100,1,742.0,13766.00000,13955.90,13554.44,13724.914000,108.501637,13735.586842,-0.008079
...,...,...,...,...,...,...,...,...,...
1632181200,1,2643.0,42632.46500,42736.90,42607.50,42703.636250,100.797218,42653.031385,0.002084
1632181260,1,2281.0,42718.81500,42819.38,42690.84,42781.970571,76.339988,42755.785162,0.003246
1632181320,1,2642.0,42772.92125,42827.10,42690.75,42755.592500,117.429123,42749.075916,0.003108
1632181380,1,2134.0,42762.29000,42811.30,42694.37,42717.234286,78.049458,42749.024591,0.002770
