In [1]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import torch
import torchvision
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.utils import save_image
from torchvision.datasets import MNIST, CIFAR10

from sklearn import preprocessing, model_selection
from datetime import datetime, timedelta
import calendar

import os
import gc

CPU times: user 931 ms, sys: 312 ms, total: 1.24 s
Wall time: 1.87 s


In [2]:
save_path = '/Users/michaeldjaballah/Data/Crypto/processed/'

odf = pd.read_csv(save_path + 'ri2.csv')

# Filter to about 10K observations and narrow down to one coin
df = odf[odf['Symbol'] == 'BTC']
del odf

In [3]:
# Preprocessing for columns that we are not interested in
df.drop(['Unnamed: 0', '#', 'sort', 'Name', 'Symbol', '% 1h', '% 24h', '% 7d', 'year', 'Circulating Supply'], axis=1, inplace=True)
df.reset_index(inplace=True, drop=True)

In [4]:
# Easily converts money strings to floats
def money_to_float(money):
    return float(money.replace('$', '').replace(',', ''))

In [5]:
# Quick way to convert our data to floats
new_price = []
new_mkt_cap = []
new_vol = []

for i in range(len(df)):
    new_price.append(money_to_float(df.iloc[i]['Price']))
    new_mkt_cap.append(money_to_float(df.iloc[i]['Market Cap']))
    new_vol.append(money_to_float(df.iloc[i]['Volume (24h)']))

df['Price'] = new_price
df['Market Cap'] = new_mkt_cap
df['Volume (24h)'] = new_vol

In [6]:
# Use for memory of size 1
def create_series(df, col, out='Target', inplace=False):
    
    if not inplace:
        df = df.copy()
    
    next_list = []
    for i in range(1, len(df)):
        next_list.append(df.iloc[i][col])
        
    df.drop(len(df) - 1, inplace=True)
    df[out] = next_list
    
    if not inplace:
        return df
    return None

In [7]:
# s function described in the paper, does not work for memory less than 2
def create_series(df, scol, mem, out='Target', inplace=False):
    if not inplace:
        df = df.copy()
    
    new_cols = {}
    cols = list(df.columns)
    
    for i in range(2, mem + 1):
        for col in cols:
            new_cols[(col, i)] = []
    
    for col in new_cols:
        offset = col[1] - 1
        orig_col = col[0]
        for i in range(offset, len(df) - mem + offset):
            new_cols[col].append(df[orig_col].iloc[i])
            
    new_out = []
    for i in range(mem, len(df)):
        new_out.append(df[scol].iloc[i])
    
    df.drop(df.tail(mem).index, inplace=True)
    
    for col, num in new_cols:
        df[col + ' ' + str(num)] = new_cols[(col, num)]
        
    df[out] = new_out
        
    if not inplace:
        return df
    return None

In [8]:
# Alpha creation functions from paper

def linearly_decaying(mem, d):
    alpha = []
    for i in range(mem):
        alpha.append(round(1 - d*i, 6))
    return alpha


def quadratic(mem, d):
    alpha = [1]
    for i in range(1, mem):
        alpha.append(round(alpha[i-1] - i*d, 6))
    return alpha
    

def exponential_decaying(mem, d):
    alpha = [1]
    for i in range(1, mem):
        alpha.append(round(alpha[i-1] * d, 6))
    return alpha


# Autocorrelation provided by statsmodels
import statsmodels.api as sm


def auto_correlation(ser, mem):
    return list(pd.Series(sm.tsa.acf(ser, nlags=mem-1)).apply(abs))

In [9]:
# Applies weights to a dataframe
def alpha_apply(df, alpha):
    ndf = df.copy()
    mem = len(alpha)
    p = len(df.columns)//mem
    cols = list(df.columns)
    for j in range(len(cols)):
        offset = j//p
        col = cols[j]
        new_col = []
        for i in range(len(df)):
            new_col.append(alpha[offset] * df[col].iloc[i])
        ndf[col] = new_col
    
    return ndf

In [10]:
# Applies the s function described in the paper and shifts our y's to be the appropriate target
mem = 20
sdf = create_series(df, 'Price', mem)

In [11]:
sdf.head()

Unnamed: 0,Market Cap,Price,Volume (24h),month,day,weekday,hour,minute,Market Cap 2,Price 2,...,minute 19,Market Cap 20,Price 20,Volume (24h) 20,month 20,day 20,weekday 20,hour 20,minute 20,Target
0,141217600000.0,7731.42,36672170000.0,3,11,2,13,25,141444300000.0,7743.83,...,55,141738300000.0,7759.88,37944440000.0,3,11,2,15,0,7767.41
1,141444300000.0,7743.83,36714290000.0,3,11,2,13,30,141221500000.0,7731.63,...,0,141875800000.0,7767.41,37868210000.0,3,11,2,15,5,7766.42
2,141221500000.0,7731.63,36710360000.0,3,11,2,13,35,141107100000.0,7725.36,...,5,141857700000.0,7766.42,37920230000.0,3,11,2,15,10,7796.45
3,141107100000.0,7725.36,36700020000.0,3,11,2,13,40,141021200000.0,7720.66,...,10,142406300000.0,7796.45,38152530000.0,3,11,2,15,15,7806.39
4,141021200000.0,7720.66,36797600000.0,3,11,2,13,45,140610000000.0,7698.15,...,15,142588000000.0,7806.39,38230690000.0,3,11,2,15,20,7798.34


In [12]:
# Now we need to standardize our data:
from sklearn.preprocessing import StandardScaler

In [13]:
# Use Sklearn to scale and center
scaler = StandardScaler()
snp = scaler.fit_transform(sdf)
sdf = pd.DataFrame(snp, columns=sdf.columns)

In [14]:
sdf.head()

Unnamed: 0,Market Cap,Price,Volume (24h),month,day,weekday,hour,minute,Market Cap 2,Price 2,...,minute 19,Market Cap 20,Price 20,Volume (24h) 20,month 20,day 20,weekday 20,hour 20,minute 20,Target
0,1.939074,1.968132,-0.4854,-0.817502,-0.546285,-0.515273,0.215707,-0.145152,1.957635,1.986846,...,1.593133,1.988798,2.018509,-0.339592,-0.8208,-0.547099,-0.513934,0.504104,-1.592645,2.030131
1,1.957203,1.986393,-0.480587,-0.817502,-0.546285,-0.515273,0.215707,0.144562,1.939816,1.968892,...,-1.592661,1.999831,2.029623,-0.348298,-0.8208,-0.547099,-0.513934,0.504104,-1.302992,2.02867
2,1.939387,1.968441,-0.481036,-0.817502,-0.546285,-0.515273,0.215707,0.434276,1.930666,1.959665,...,-1.303043,1.998377,2.028162,-0.342356,-0.8208,-0.547099,-0.513934,0.504104,-1.01334,2.073001
3,1.930239,1.959215,-0.482218,-0.817502,-0.546285,-0.515273,0.215707,0.72399,1.923792,1.952748,...,-1.013425,2.042383,2.072485,-0.315826,-0.8208,-0.547099,-0.513934,0.504104,-0.723688,2.087675
4,1.923366,1.9523,-0.471071,-0.817502,-0.546285,-0.515273,0.215707,1.013704,1.890902,1.91962,...,-0.723808,2.056954,2.087156,-0.306899,-0.8208,-0.547099,-0.513934,0.504104,-0.434036,2.075791


In [15]:
# We have our weighted and standardized data in series form
# Now we split with forward chaining
# Using the first 80% to train on
split = round(len(sdf) * .8)

X = sdf.values[:,:-1]
y = sdf.values[:, -1]

X_train = X[:split]
y_train = y[:split]

X_test = X[split:]
y_test = y[split:]

In [16]:
# Ridge Regression from Sklearn
# variable alpha for this model is our lambda, the penalty parameter
from sklearn.linear_model import Ridge

In [17]:
# Create our ridge model, alpha = 1.0 is the default
ridge = Ridge(alpha=1.0)

# Train our model
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [18]:
# MSE to be used to check how well our models do
def mse(fit, actual):
    return sum((fit - actual)**2)/len(fit)


def split(ndf, per):
    split = round(len(ndf) * per)

    X = ndf.values[:,:-1]
    y = ndf.values[:, -1]

    X_train = X[:split]
    y_train = y[:split]

    X_test = X[split:]
    y_test = y[split:]
    return X_train, y_train, X_test, y_test

In [19]:
# Pretty good MSE on unseen data and without a weighting
mse(ridge.predict(X_test), y_test)

0.0001672854783610389

In [20]:
# our data in sdf, standardized but not weighted

In [26]:
# Now we check all of our methods
methods = {
    'none' : None,
    'linear' : linearly_decaying,
    'quadratic' : quadratic,
    'exponential' : exponential_decaying, 
    'autocorrelation' : auto_correlation
}

lambda_ = 1
per = .8

# None 
ridge_n = Ridge(alpha=lambda_)
nsdf = sdf.copy()
Xtr_n, ytr_n, Xte_n, yte_n = split(nsdf, per)
ridge_n.fit(Xtr_n, ytr_n)
print('None:', mse(ridge_n.predict(Xte_n), yte_n))

# Linear
ridge_l = Ridge(alpha=lambda_)
alpha_l = linearly_decaying(mem, .03)
sdf_l = alpha_apply(sdf.drop('Target', 1), alpha_l)
Xtr_l, ytr_l, Xte_l, yte_l = split(sdf_l, per)
ridge_l.fit(Xtr_l, ytr_l)
print('Linear:', mse(ridge_l.predict(Xte_l), yte_l))

# Quadratic
ridge_q = Ridge(alpha=lambda_)
alpha_q = quadratic(mem, .005)
sdf_q = alpha_apply(sdf.drop('Target', 1), alpha_q)
Xtr_q, ytr_q, Xte_q, yte_q = split(sdf_q, per)
ridge_q.fit(Xtr_q, ytr_q)
print('Quadratic:', mse(ridge_q.predict(Xte_q), yte_q))

# Exponential
ridge_e = Ridge(alpha=lambda_)
alpha_e = exponential_decaying(mem, .95)
sdf_e = alpha_apply(sdf.drop('Target', 1), alpha_e)
Xtr_e, ytr_e, Xte_e, yte_e = split(sdf_e, per)
ridge_e.fit(Xtr_e, ytr_e)
print('Exponential:', mse(ridge_e.predict(Xte_e), yte_e))

# Autocorrelation
ridge_a = Ridge(alpha=lambda_)
alpha_a = auto_correlation(sdf['Price'], mem)
sdf_a = alpha_apply(sdf.drop('Target', 1), alpha_a)
Xtr_a, ytr_a, Xte_a, yte_a = split(sdf_a, per)
ridge_a.fit(Xtr_a, ytr_a)
print('Autocorrelation:', mse(ridge_a.predict(Xte_a), yte_a))

None: 0.0001672854783610389
Linear: 4.081481725547894e-06
Quadratic: 5.495839867975665e-08
Exponential: 3.1937175320333008e-06




Autocorrelation: 2.1366055497306093e-05


In [42]:
# # Now we check all of our methods
# methods = {
#     'none' : None,
#     'linear' : linearly_decaying,
#     'quadratic' : quadratic,
#     'exponential' : exponential_decaying, 
#     'autocorrelation' : auto_correlation
# }

# lambda_ = 1
# per = .8

# # None 
# ridge_n = Ridge(alpha=lambda_)
# nsdf = sdf.copy()
# Xtr_n, ytr_n, Xte_n, yte_n = split(nsdf, per)
# ridge_n.fit(Xtr_n, ytr_n)
# print('None:', mse(ridge_n.predict(Xte_n), yte_n))

# # Linear
# ridge_l = Ridge(alpha=lambda_)
# alpha_l = linearly_decaying(mem, .05)
# sdf_l = alpha_apply(sdf.drop('Target', 1), alpha_l)
# Xtr_l, ytr_l, Xte_l, yte_l = split(sdf_l, per)
# ridge_l.fit(Xtr_l, ytr_l)
# print('Linear:', mse(ridge_l.predict(Xte_l), yte_l))

# # Quadratic
# ridge_q = Ridge(alpha=lambda_)
# alpha_q = quadratic(mem, .01)
# sdf_q = alpha_apply(sdf.drop('Target', 1), alpha_q)
# Xtr_q, ytr_q, Xte_q, yte_q = split(sdf_q, per)
# ridge_q.fit(Xtr_q, ytr_q)
# print('Quadratic:', mse(ridge_q.predict(Xte_q), yte_q))

# # Exponential
# ridge_e = Ridge(alpha=lambda_)
# alpha_e = exponential_decaying(mem, .95)
# sdf_e = alpha_apply(sdf.drop('Target', 1), alpha_e)
# Xtr_e, ytr_e, Xte_e, yte_e = split(sdf_e, per)
# ridge_e.fit(Xtr_e, ytr_e)
# print('Exponential:', mse(ridge_e.predict(Xte_e), yte_e))

# # Autocorrelation
# ridge_a = Ridge(alpha=lambda_)
# alpha_a = auto_correlation(sdf['Price'], mem)
# sdf_a = alpha_apply(sdf.drop('Target', 1), alpha_a)
# Xtr_a, ytr_a, Xte_a, yte_a = split(sdf_a, per)
# ridge_a.fit(Xtr_a, ytr_a)
# print('Autocorrelation:', mse(ridge_a.predict(Xte_a), yte_a))

None: 0.00016190140186996926
Linear: 0.09146543348125405
Quadratic: 0.09147211269138547
Exponential: 0.30200493649623006




Autocorrelation: 0.2972062183629557
