In [1]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import torch
import torchvision
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.utils import save_image
from torchvision.datasets import MNIST, CIFAR10

from sklearn import preprocessing, model_selection
from datetime import datetime, timedelta
import calendar

import os
import gc

CPU times: user 938 ms, sys: 318 ms, total: 1.26 s
Wall time: 1.87 s


In [2]:
# NOTE: machine-specific absolute path; point this at your local copy of the processed data.
save_path = '/Users/michaeldjaballah/Data/Crypto/processed/'

odf = pd.read_csv(os.path.join(save_path, 'ri2.csv'))

# Filter to about 10K observations and narrow down to one coin.
# .copy() makes df an independent frame, so the in-place drop and column
# assignments performed on df further down do not operate on a slice of odf
# (which is what triggers pandas' SettingWithCopyWarning).
df = odf[odf['Symbol'] == 'BTC'].copy()
del odf

In [3]:
# Discard the columns this analysis does not use, then renumber the rows from 0
df.drop(
    columns=[
        'Unnamed: 0', '#', 'sort', 'Name', 'Symbol',
        '% 1h', '% 24h', '% 7d', 'year', 'Circulating Supply',
    ],
    inplace=True,
)
df.reset_index(drop=True, inplace=True)

In [4]:
# Easily converts money strings to floats
def money_to_float(money):
    """Convert a money value such as '$7,731.42' to a float.

    Parameters
    ----------
    money : str or number
        A string that may contain '$' signs and ',' thousands separators,
        or a value that is already numeric.

    Returns
    -------
    float
        The numeric value.

    Raises
    ------
    ValueError
        If a string cannot be parsed as a number once '$' and ',' are removed.
    """
    if isinstance(money, str):
        return float(money.replace('$', '').replace(',', ''))
    # Already numeric (e.g. the column was converted by an earlier run of the
    # notebook): pass it through instead of failing on the missing .replace().
    return float(money)

In [5]:
# Quick way to convert our data to floats.
# Series.map applies money_to_float to every value of a column in one pass,
# instead of materialising a full row with df.iloc[i] for each single lookup.
new_price = df['Price'].map(money_to_float).tolist()
new_mkt_cap = df['Market Cap'].map(money_to_float).tolist()
new_vol = df['Volume (24h)'].map(money_to_float).tolist()

df['Price'] = new_price
df['Market Cap'] = new_mkt_cap
df['Volume (24h)'] = new_vol

In [6]:
# Use for memory of size 1
def create_series(df, col, out='Target', inplace=False):
    """Pair every row with the value of `col` found in the following row.

    The last row has no following row, so it is removed; the result therefore
    has one row fewer than the input.

    NOTE: a later cell of this notebook defines another `create_series` with a
    different signature; once that cell runs, it replaces this definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Index labels are assumed to be unique, because the last
        row is removed by its label.
    col : str
        Column whose next-row value becomes the target.
    out : str, default 'Target'
        Name of the column that receives the next-row values.
    inplace : bool, default False
        If True, modify `df` itself and return None; otherwise work on a copy.

    Returns
    -------
    pandas.DataFrame or None
        The new frame when `inplace` is False, otherwise None.
    """
    if not inplace:
        df = df.copy()

    # Value of `col` one row ahead, for every row except the last one.
    next_list = df[col].iloc[1:].tolist()

    # Remove the last row through its own index label rather than the label
    # len(df) - 1, which only exists when the frame has a default 0..n-1 index.
    # Slicing with [-1:] also keeps this a no-op on an empty frame.
    df.drop(df.index[-1:], inplace=True)
    df[out] = next_list

    if not inplace:
        return df
    return None

In [7]:
# s function described in the paper
def create_series(df, scol, mem, out='Target', inplace=False):
    """Build a lagged ("memory") frame plus a shifted target column.

    For each row t of the input, the result row holds:
      * the original columns taken at row t,
      * for k = 2..mem, a copy of every original column taken at row
        t + k - 1, stored under the name '<column> <k>',
      * `out`: the value of `scol` at row t + mem.

    The last `mem` rows have no complete window and are removed, so the result
    has len(df) - mem rows (zero rows if the frame is shorter than `mem`).
    With mem == 1 no lag columns are added and the target is simply the next
    row's value of `scol`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; column names must be strings because lag column names are
        built by string concatenation.
    scol : str
        Column whose future value becomes the target.
    mem : int
        Window length (number of consecutive rows packed into one result row).
    out : str, default 'Target'
        Name of the target column.
    inplace : bool, default False
        If True, modify `df` itself and return None; otherwise work on a copy.

    Returns
    -------
    pandas.DataFrame or None
        The new frame when `inplace` is False, otherwise None.

    Raises
    ------
    ValueError
        If `mem` is negative. Checked before anything is modified so an
        in-place call cannot leave `df` half-transformed.
    """
    if mem < 0:
        raise ValueError('mem must be non-negative, got {}'.format(mem))

    if not inplace:
        df = df.copy()

    # Number of rows that survive; clamped so that a frame shorter than `mem`
    # yields empty columns rather than a slice with a negative stop.
    kept = max(len(df) - mem, 0)
    base_cols = list(df.columns)

    # Lag blocks are collected step by step (all columns for step 2, then all
    # columns for step 3, ...), which fixes the column order of the result.
    lagged = {}
    for step in range(2, mem + 1):
        start = step - 1
        for col in base_cols:
            lagged[col + ' ' + str(step)] = df[col].iloc[start:start + kept].tolist()

    # Target: the value of `scol` exactly `mem` rows ahead.
    new_out = df[scol].iloc[mem:].tolist()

    # Rows without a complete window are dropped before the new columns,
    # which are `mem` entries shorter than the original frame, are attached.
    df.drop(df.tail(mem).index, inplace=True)

    for name, values in lagged.items():
        df[name] = values

    df[out] = new_out

    if not inplace:
        return df
    return None

In [18]:
# Alpha creation functions from paper

def linearly_decaying(mem, d):
    """Return `mem` weights that start at 1 and fall by `d` per step.

    The weight at position i is 1 - d*i, rounded to 6 decimal places.
    """
    return [round(1 - d * step, 6) for step in range(mem)]


def quadratic(mem, d):
    """Return `mem` weights whose step-to-step drop grows linearly.

    The first weight is 1; each following weight at position i equals the
    previous (already rounded) weight minus i*d, rounded to 6 decimal places.

    Returns an empty list when `mem` is not positive, matching
    `linearly_decaying`, so the result always has exactly `mem` entries.
    """
    if mem <= 0:
        return []

    alpha = [1]
    for i in range(1, mem):
        alpha.append(round(alpha[-1] - i * d, 6))
    return alpha
    

def exponential_decaying(mem, d):
    """Return `mem` weights that start at 1 and are multiplied by `d` per step.

    Each weight is the previous weight times `d`; no rounding is applied.

    Returns an empty list when `mem` is not positive, matching
    `linearly_decaying`, so the result always has exactly `mem` entries.
    """
    if mem <= 0:
        return []

    alpha = [1]
    for _ in range(1, mem):
        alpha.append(alpha[-1] * d)
    return alpha


# Autocorrelation provided by statsmodels
import statsmodels.api as sm


def auto_correlation(ser, mem):
    """Return the absolute autocorrelation values of `ser` as a list.

    The values come from statsmodels' `acf`, called with `nlags=mem`.

    NOTE(review): `acf` is documented to include lag 0, so this list
    presumably has mem + 1 entries, whereas the other alpha builders in this
    notebook return mem entries. Confirm the intended length before passing
    the result to `alpha_apply`, which derives `mem` from `len(alpha)`.
    """
    lag_corr = pd.Series(sm.tsa.acf(ser, nlags=mem))
    return list(lag_corr.abs())

In [13]:
# Applies weights to a dataframe
def alpha_apply(df, alpha):
    """Scale the columns of `df` in place, block by block, with `alpha`.

    The columns are treated as len(alpha) consecutive blocks of equal width
    p = n_columns // len(alpha); every column of block k is multiplied by
    alpha[k].

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; modified in place. Its column count must be a multiple
        of len(alpha), so any extra column (for example a target column) has
        to be removed before calling.
    alpha : sequence of numbers
        One weight per block.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `alpha` is empty or the column count is not a multiple of
        len(alpha). Checked before any column is touched, so a bad call never
        leaves `df` partially scaled.
    """
    mem = len(alpha)
    n_cols = len(df.columns)
    if mem == 0 or n_cols % mem != 0:
        raise ValueError(
            'number of columns ({}) must be a multiple of len(alpha) ({})'.format(n_cols, mem)
        )

    p = n_cols // mem
    # list() snapshots the column labels before the loop starts reassigning them.
    for j, col in enumerate(list(df.columns)):
        df[col] = df[col] * alpha[j // p]
    return None

In [14]:
# Build the windowed frame: a memory of 10 rows per observation, with the
# shifted 'Price' column as the prediction target
sdf = create_series(df, scol='Price', mem=10)

In [15]:
# Preview the first five rows of the windowed frame
sdf.head(5)

Unnamed: 0,Market Cap,Price,Volume (24h),month,day,weekday,hour,minute,Market Cap 2,Price 2,...,minute 9,Market Cap 10,Price 10,Volume (24h) 10,month 10,day 10,weekday 10,hour 10,minute 10,Target
0,141217600000.0,7731.42,36672170000.0,3,11,2,13,25,141444300000.0,7743.83,...,5,140161200000.0,7673.56,36727080000.0,3,11,2,14,10,7672.84
1,141444300000.0,7743.83,36714290000.0,3,11,2,13,30,141221500000.0,7731.63,...,10,140147900000.0,7672.84,36817840000.0,3,11,2,14,15,7654.14
2,141221500000.0,7731.63,36710360000.0,3,11,2,13,35,141107100000.0,7725.36,...,15,139806600000.0,7654.14,36809560000.0,3,11,2,14,20,7642.81
3,141107100000.0,7725.36,36700020000.0,3,11,2,13,40,141021200000.0,7720.66,...,20,139599700000.0,7642.81,36913270000.0,3,11,2,14,25,7647.2
4,141021200000.0,7720.66,36797600000.0,3,11,2,13,45,140610000000.0,7698.15,...,25,139679900000.0,7647.2,37029930000.0,3,11,2,14,30,7674.89


In [None]:
# Now we need to standardize our data:
