In [1]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import torch
import torchvision
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.utils import save_image
from torchvision.datasets import MNIST, CIFAR10

from sklearn import preprocessing, model_selection
from datetime import datetime, timedelta
import calendar

import os
import gc

CPU times: user 953 ms, sys: 411 ms, total: 1.36 s
Wall time: 3.3 s


In [2]:
save_path = '/Users/michaeldjaballah/Data/Crypto/processed/'

odf = pd.read_csv(save_path + 'ri2.csv')

# Filter to about 10K observations and narrow down to one coin
df = odf[odf['Symbol'] == 'BTC']
del odf

In [3]:
# Preprocessing for columns that we are not interested in
df.drop(['Unnamed: 0', '#', 'sort', 'Name', 'Symbol', '% 1h', '% 24h', '% 7d', 'year', 'Circulating Supply'], axis=1, inplace=True)
df.reset_index(inplace=True, drop=True)

In [4]:
# Easily converts money strings to floats
def money_to_float(money):
    return float(money.replace('$', '').replace(',', ''))

In [5]:
# Quick way to convert our data to floats
new_price = []
new_mkt_cap = []
new_vol = []

for i in range(len(df)):
    new_price.append(money_to_float(df.iloc[i]['Price']))
    new_mkt_cap.append(money_to_float(df.iloc[i]['Market Cap']))
    new_vol.append(money_to_float(df.iloc[i]['Volume (24h)']))

df['Price'] = new_price
df['Market Cap'] = new_mkt_cap
df['Volume (24h)'] = new_vol

In [6]:
# Use for memory of size 1
def create_series(df, col, out='Target', inplace=False):
    
    if not inplace:
        df = df.copy()
    
    next_list = []
    for i in range(1, len(df)):
        next_list.append(df.iloc[i][col])
        
    df.drop(len(df) - 1, inplace=True)
    df[out] = next_list
    
    if not inplace:
        return df
    return None

In [7]:
# s function described in the paper, does not work for memory less than 2
def create_series(df, scol, mem, out='Target', inplace=False):
    if not inplace:
        df = df.copy()
    
    new_cols = {}
    cols = list(df.columns)
    
    for i in range(2, mem + 1):
        for col in cols:
            new_cols[(col, i)] = []
    
    for col in new_cols:
        offset = col[1] - 1
        orig_col = col[0]
        for i in range(offset, len(df) - mem + offset):
            new_cols[col].append(df[orig_col].iloc[i])
            
    new_out = []
    for i in range(mem, len(df)):
        new_out.append(df[scol].iloc[i])
    
    df.drop(df.tail(mem).index, inplace=True)
    
    for col, num in new_cols:
        df[col + ' ' + str(num)] = new_cols[(col, num)]
        
    df[out] = new_out
        
    if not inplace:
        return df
    return None

In [8]:
# Alpha creation functions from paper

def linearly_decaying(mem, d):
    alpha = []
    for i in range(mem):
        alpha.append(round(1 - d*i, 6))
    return alpha


def quadratic(mem, d):
    alpha = [1]
    for i in range(1, mem):
        alpha.append(round(alpha[i-1] - i*d, 6))
    return alpha
    

def exponential_decaying(mem, d):
    alpha = [1]
    for i in range(1, mem):
        alpha.append(round(alpha[i-1] * d, 6))
    return alpha


# Autocorrelation provided by statsmodels
import statsmodels.api as sm


def auto_correlation(ser, mem):
    return list(pd.Series(sm.tsa.acf(ser, nlags=mem-1)).apply(abs))

In [9]:
# Applies weights to a dataframe
def alpha_apply(df, alpha):
    ndf = df.copy()
    mem = len(alpha)
    p = len(df.columns)//mem
    cols = list(df.columns)
    for j in range(len(cols)):
        offset = j//p
        col = cols[j]
        new_col = []
        for i in range(len(df)):
            new_col.append(alpha[offset] * df[col].iloc[i])
        ndf[col] = new_col
    
    return ndf

In [10]:
%%time
# Applies the s function described in the paper and shifts our y's to be the appropriate target
mem = 40
sdf = create_series(df, 'Price', mem)

CPU times: user 22.5 s, sys: 92.8 ms, total: 22.6 s
Wall time: 22.6 s


In [11]:
sdf

Unnamed: 0,Market Cap,Price,Volume (24h),month,day,weekday,hour,minute,Market Cap 2,Price 2,...,minute 39,Market Cap 40,Price 40,Volume (24h) 40,month 40,day 40,weekday 40,hour 40,minute 40,Target
0,1.412176e+11,7731.42,3.667217e+10,3,11,2,13,25,1.414443e+11,7743.83,...,35,1.434762e+11,7854.99,3.854349e+10,3,11,2,16,40,7857.39
1,1.414443e+11,7743.83,3.671429e+10,3,11,2,13,30,1.412215e+11,7731.63,...,40,1.435202e+11,7857.39,3.853940e+10,3,11,2,16,45,7859.55
2,1.412215e+11,7731.63,3.671036e+10,3,11,2,13,35,1.411071e+11,7725.36,...,45,1.435595e+11,7859.55,3.853683e+10,3,11,2,16,50,7860.57
3,1.411071e+11,7725.36,3.670002e+10,3,11,2,13,40,1.410212e+11,7720.66,...,50,1.435782e+11,7860.57,3.852199e+10,3,11,2,16,55,7854.65
4,1.410212e+11,7720.66,3.679760e+10,3,11,2,13,45,1.406100e+11,7698.15,...,55,1.434703e+11,7854.65,3.850995e+10,3,11,2,17,0,7864.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9793,1.272171e+11,6942.42,3.526667e+10,4,14,1,13,45,1.271520e+11,6938.87,...,55,1.259915e+11,6875.43,3.533022e+10,4,14,1,17,0,6874.44
9794,1.271520e+11,6938.87,3.524068e+10,4,14,1,13,50,1.271551e+11,6939.04,...,0,1.259733e+11,6874.44,3.482723e+10,4,14,1,17,5,6857.49
9795,1.271551e+11,6939.04,3.495124e+10,4,14,1,13,55,1.270224e+11,6931.78,...,5,1.256628e+11,6857.49,3.490556e+10,4,14,1,17,10,6866.27
9796,1.270224e+11,6931.78,3.527270e+10,4,14,1,14,0,1.269756e+11,6929.23,...,10,1.258237e+11,6866.27,3.499034e+10,4,14,1,17,15,6856.76


In [12]:
# Now we need to standardize our data:
from sklearn.preprocessing import StandardScaler

In [13]:
# Use Sklearn to scale and center
scaler = StandardScaler()
snp = scaler.fit_transform(sdf)
sdf = pd.DataFrame(snp, columns=sdf.columns)

In [14]:
sdf.head()

Unnamed: 0,Market Cap,Price,Volume (24h),month,day,weekday,hour,minute,Market Cap 2,Price 2,...,minute 39,Market Cap 40,Price 40,Volume (24h) 40,month 40,day 40,weekday 40,hour 40,minute 40,Target
0,1.939912,1.968895,-0.486427,-0.815421,-0.546152,-0.517354,0.216412,-0.14512,1.958463,1.987599,...,0.43432,2.13978,2.170917,-0.271512,-0.822203,-0.547825,-0.514597,0.64939,0.724035,2.175092
1,1.958034,1.987148,-0.481617,-0.815421,-0.546152,-0.517354,0.216412,0.144528,1.940651,1.969652,...,0.723946,2.143318,2.174471,-0.271979,-0.822203,-0.547825,-0.514597,0.64939,1.01366,2.178293
2,1.940225,1.969204,-0.482066,-0.815421,-0.546152,-0.517354,0.216412,0.434176,1.931505,1.960428,...,1.013572,2.146486,2.177671,-0.272272,-0.822203,-0.547825,-0.514597,0.64939,1.303286,2.179804
3,1.931081,1.959982,-0.483247,-0.815421,-0.546152,-0.517354,0.216412,0.723824,1.924633,1.953514,...,1.303197,2.147986,2.179182,-0.273965,-0.822203,-0.547825,-0.514597,0.64939,1.592912,2.171033
4,1.92421,1.953069,-0.472106,-0.815421,-0.546152,-0.517354,0.216412,1.013472,1.891757,1.9204,...,1.592823,2.139305,2.170413,-0.27534,-0.822203,-0.547825,-0.514597,0.79388,-1.592971,2.184915


In [15]:
# We have our weighted and standardized data in series form
# Now we split with forward chaining
# Using the first 80% to train on
split = round(len(sdf) * .8)

X = sdf.values[:,:-1]
y = sdf.values[:, -1]

X_train = X[:split]
y_train = y[:split]

X_test = X[split:]
y_test = y[split:]

In [16]:
# Ridge Regression from Sklearn
# variable alpha for this model is our lambda, the penalty parameter
from sklearn.linear_model import Ridge

In [17]:
# Create our ridge model, alpha = 1.0 is the default
ridge = Ridge(alpha=1.0)

# Train our model
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [18]:
# MSE to be used to check how well our models do
def mse(fit, actual):
    return sum((fit - actual)**2)/len(fit)


def split(ndf, per):
    split = round(len(ndf) * per)

    X = ndf.values[:,:-1]
    y = ndf.values[:, -1]

    X_train = X[:split]
    y_train = y[:split]

    X_test = X[split:]
    y_test = y[split:]
    return X_train, y_train, X_test, y_test

In [19]:
# Pretty good MSE on unseen data and without a weighting
mse(ridge.predict(X_test), y_test)

0.00017223305545011522

In [20]:
# our data in sdf, standardized but not weighted

In [24]:
# Now we check all of our methods
# methods = {
#     'none' : None,
#     'linear' : linearly_decaying,
#     'quadratic' : quadratic,
#     'exponential' : exponential_decaying, 
#     'autocorrelation' : auto_correlation
# }

# lambda_ = 1
# per = .8

# # None 
# ridge_n = Ridge(alpha=lambda_)
# nsdf = sdf.copy()
# Xtr_n, ytr_n, Xte_n, yte_n = split(nsdf, per)
# ridge_n.fit(Xtr_n, ytr_n)
# print('None:', mse(ridge_n.predict(Xte_n), yte_n))

# # Linear
# ridge_l = Ridge(alpha=lambda_)
# alpha_l = linearly_decaying(mem, .02)
# sdf_l = alpha_apply(sdf.drop('Target', 1), alpha_l)
# Xtr_l, ytr_l, Xte_l, yte_l = split(sdf_l, per)
# ridge_l.fit(Xtr_l, ytr_l)
# print('Linear:', mse(ridge_l.predict(Xte_l), yte_l))

# # Quadratic
# ridge_q = Ridge(alpha=lambda_)
# alpha_q = quadratic(mem, .001)
# sdf_q = alpha_apply(sdf.drop('Target', 1), alpha_q)
# Xtr_q, ytr_q, Xte_q, yte_q = split(sdf_q, per)
# ridge_q.fit(Xtr_q, ytr_q)
# print('Quadratic:', mse(ridge_q.predict(Xte_q), yte_q))

# # Exponential
# ridge_e = Ridge(alpha=lambda_)
# alpha_e = exponential_decaying(mem, .95)
# sdf_e = alpha_apply(sdf.drop('Target', 1), alpha_e)
# Xtr_e, ytr_e, Xte_e, yte_e = split(sdf_e, per)
# ridge_e.fit(Xtr_e, ytr_e)
# print('Exponential:', mse(ridge_e.predict(Xte_e), yte_e))

# # Autocorrelation
# ridge_a = Ridge(alpha=lambda_)
# alpha_a = auto_correlation(sdf['Price'], mem)
# sdf_a = alpha_apply(sdf.drop('Target', 1), alpha_a)
# Xtr_a, ytr_a, Xte_a, yte_a = split(sdf_a, per)
# ridge_a.fit(Xtr_a, ytr_a)
# print('Autocorrelation:', mse(ridge_a.predict(Xte_a), yte_a))

None: 0.00017223305545011522
Linear: 3.870536978243927e-06
Quadratic: 3.8812384191501e-06
Exponential: 1.5885691140954652e-06




Autocorrelation: 7.666823318537951e-05


In [None]:
# None: 0.00017223305545011522
# Linear: 3.870536978243927e-06
# Quadratic: 3.8812384191501e-06
# Exponential: 1.5885691140954652e-06
# /Users/michaeldjaballah/opt/anaconda3/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:572: FutureWarning: fft=True will become the default in a future version of statsmodels. To suppress this warning, explicitly set fft=False.
#   FutureWarning
# Autocorrelation: 7.666823318537951e-05

In [21]:
# # Now we check all of our methods
# methods = {
#     'none' : None,
#     'linear' : linearly_decaying,
#     'quadratic' : quadratic,
#     'exponential' : exponential_decaying, 
#     'autocorrelation' : auto_correlation
# }

# lambda_ = 1
# per = .8

# # None 
# ridge_n = Ridge(alpha=lambda_)
# nsdf = sdf.copy()
# Xtr_n, ytr_n, Xte_n, yte_n = split(nsdf, per)
# ridge_n.fit(Xtr_n, ytr_n)
# print('None:', mse(ridge_n.predict(Xte_n), yte_n))

# # Linear
# ridge_l = Ridge(alpha=lambda_)
# alpha_l = linearly_decaying(mem, .03)
# sdf_l = alpha_apply(sdf.drop('Target', 1), alpha_l)
# Xtr_l, ytr_l, Xte_l, yte_l = split(sdf_l, per)
# ridge_l.fit(Xtr_l, ytr_l)
# print('Linear:', mse(ridge_l.predict(Xte_l), yte_l))

# # Quadratic
# ridge_q = Ridge(alpha=lambda_)
# alpha_q = quadratic(mem, .005)
# sdf_q = alpha_apply(sdf.drop('Target', 1), alpha_q)
# Xtr_q, ytr_q, Xte_q, yte_q = split(sdf_q, per)
# ridge_q.fit(Xtr_q, ytr_q)
# print('Quadratic:', mse(ridge_q.predict(Xte_q), yte_q))

# # Exponential
# ridge_e = Ridge(alpha=lambda_)
# alpha_e = exponential_decaying(mem, .95)
# sdf_e = alpha_apply(sdf.drop('Target', 1), alpha_e)
# Xtr_e, ytr_e, Xte_e, yte_e = split(sdf_e, per)
# ridge_e.fit(Xtr_e, ytr_e)
# print('Exponential:', mse(ridge_e.predict(Xte_e), yte_e))

# # Autocorrelation
# ridge_a = Ridge(alpha=lambda_)
# alpha_a = auto_correlation(sdf['Price'], mem)
# sdf_a = alpha_apply(sdf.drop('Target', 1), alpha_a)
# Xtr_a, ytr_a, Xte_a, yte_a = split(sdf_a, per)
# ridge_a.fit(Xtr_a, ytr_a)
# print('Autocorrelation:', mse(ridge_a.predict(Xte_a), yte_a))

None: 0.00016971427526463664
Linear: 4.921204691230779e-07
Quadratic: 5.788214609632041e-05
Exponential: 1.5396241684320618e-06




Autocorrelation: 2.720310845558427e-05


In [None]:
# None: 0.00016971427526463664
# Linear: 4.921204691230779e-07
# Quadratic: 5.788214609632041e-05
# Exponential: 1.5396241684320618e-06
# /Users/michaeldjaballah/opt/anaconda3/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:572: FutureWarning: fft=True will become the default in a future version of statsmodels. To suppress this warning, explicitly set fft=False.
#   FutureWarning
# Autocorrelation: 2.720310845558427e-05

In [26]:
# # Now we check all of our methods
# methods = {
#     'none' : None,
#     'linear' : linearly_decaying,
#     'quadratic' : quadratic,
#     'exponential' : exponential_decaying, 
#     'autocorrelation' : auto_correlation
# }

# lambda_ = 1
# per = .8

# # None 
# ridge_n = Ridge(alpha=lambda_)
# nsdf = sdf.copy()
# Xtr_n, ytr_n, Xte_n, yte_n = split(nsdf, per)
# ridge_n.fit(Xtr_n, ytr_n)
# print('None:', mse(ridge_n.predict(Xte_n), yte_n))

# # Linear
# ridge_l = Ridge(alpha=lambda_)
# alpha_l = linearly_decaying(mem, .03)
# sdf_l = alpha_apply(sdf.drop('Target', 1), alpha_l)
# Xtr_l, ytr_l, Xte_l, yte_l = split(sdf_l, per)
# ridge_l.fit(Xtr_l, ytr_l)
# print('Linear:', mse(ridge_l.predict(Xte_l), yte_l))

# # Quadratic
# ridge_q = Ridge(alpha=lambda_)
# alpha_q = quadratic(mem, .005)
# sdf_q = alpha_apply(sdf.drop('Target', 1), alpha_q)
# Xtr_q, ytr_q, Xte_q, yte_q = split(sdf_q, per)
# ridge_q.fit(Xtr_q, ytr_q)
# print('Quadratic:', mse(ridge_q.predict(Xte_q), yte_q))

# # Exponential
# ridge_e = Ridge(alpha=lambda_)
# alpha_e = exponential_decaying(mem, .95)
# sdf_e = alpha_apply(sdf.drop('Target', 1), alpha_e)
# Xtr_e, ytr_e, Xte_e, yte_e = split(sdf_e, per)
# ridge_e.fit(Xtr_e, ytr_e)
# print('Exponential:', mse(ridge_e.predict(Xte_e), yte_e))

# # Autocorrelation
# ridge_a = Ridge(alpha=lambda_)
# alpha_a = auto_correlation(sdf['Price'], mem)
# sdf_a = alpha_apply(sdf.drop('Target', 1), alpha_a)
# Xtr_a, ytr_a, Xte_a, yte_a = split(sdf_a, per)
# ridge_a.fit(Xtr_a, ytr_a)
# print('Autocorrelation:', mse(ridge_a.predict(Xte_a), yte_a))

None: 0.0001672854783610389
Linear: 4.081481725547894e-06
Quadratic: 5.495839867975665e-08
Exponential: 3.1937175320333008e-06




Autocorrelation: 2.1366055497306093e-05


In [None]:
# None: 0.0001672854783610389
# Linear: 4.081481725547894e-06
# Quadratic: 5.495839867975665e-08
# Exponential: 3.1937175320333008e-06
# /Users/michaeldjaballah/opt/anaconda3/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:572: FutureWarning: fft=True will become the default in a future version of statsmodels. To suppress this warning, explicitly set fft=False.
#   FutureWarning
# Autocorrelation: 2.1366055497306093e-05

In [42]:
# # Now we check all of our methods
# methods = {
#     'none' : None,
#     'linear' : linearly_decaying,
#     'quadratic' : quadratic,
#     'exponential' : exponential_decaying, 
#     'autocorrelation' : auto_correlation
# }

# lambda_ = 1
# per = .8

# # None 
# ridge_n = Ridge(alpha=lambda_)
# nsdf = sdf.copy()
# Xtr_n, ytr_n, Xte_n, yte_n = split(nsdf, per)
# ridge_n.fit(Xtr_n, ytr_n)
# print('None:', mse(ridge_n.predict(Xte_n), yte_n))

# # Linear
# ridge_l = Ridge(alpha=lambda_)
# alpha_l = linearly_decaying(mem, .05)
# sdf_l = alpha_apply(sdf.drop('Target', 1), alpha_l)
# Xtr_l, ytr_l, Xte_l, yte_l = split(sdf_l, per)
# ridge_l.fit(Xtr_l, ytr_l)
# print('Linear:', mse(ridge_l.predict(Xte_l), yte_l))

# # Quadratic
# ridge_q = Ridge(alpha=lambda_)
# alpha_q = quadratic(mem, .01)
# sdf_q = alpha_apply(sdf.drop('Target', 1), alpha_q)
# Xtr_q, ytr_q, Xte_q, yte_q = split(sdf_q, per)
# ridge_q.fit(Xtr_q, ytr_q)
# print('Quadratic:', mse(ridge_q.predict(Xte_q), yte_q))

# # Exponential
# ridge_e = Ridge(alpha=lambda_)
# alpha_e = exponential_decaying(mem, .95)
# sdf_e = alpha_apply(sdf.drop('Target', 1), alpha_e)
# Xtr_e, ytr_e, Xte_e, yte_e = split(sdf_e, per)
# ridge_e.fit(Xtr_e, ytr_e)
# print('Exponential:', mse(ridge_e.predict(Xte_e), yte_e))

# # Autocorrelation
# ridge_a = Ridge(alpha=lambda_)
# alpha_a = auto_correlation(sdf['Price'], mem)
# sdf_a = alpha_apply(sdf.drop('Target', 1), alpha_a)
# Xtr_a, ytr_a, Xte_a, yte_a = split(sdf_a, per)
# ridge_a.fit(Xtr_a, ytr_a)
# print('Autocorrelation:', mse(ridge_a.predict(Xte_a), yte_a))

None: 0.00016190140186996926
Linear: 0.09146543348125405
Quadratic: 0.09147211269138547
Exponential: 0.30200493649623006




Autocorrelation: 0.2972062183629557


In [None]:
# None: 0.00016190140186996926
# Linear: 0.09146543348125405
# Quadratic: 0.09147211269138547
# Exponential: 0.30200493649623006
# /Users/michaeldjaballah/opt/anaconda3/lib/python3.7/site-packages/statsmodels/tsa/stattools.py:572: FutureWarning: fft=True will become the default in a future version of statsmodels. To suppress this warning, explicitly set fft=False.
#   FutureWarning
# Autocorrelation: 0.2972062183629557