In [53]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import os
from misc import *
from sklearn.feature_selection import mutual_info_regression

In [21]:
# Read one pickled object per coin from the current working directory
# (the files are named after the coin, with no extension).
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
cwd = os.getcwd() + '/'  # string with trailing slash; another cell builds a path as cwd + filename
coin_names = ['btc', 'eth']
coins = {}

for coin in coin_names:
    coins[coin] = pd.read_pickle(cwd + coin)

In [12]:
# Preview the first rows of the BTC frame to sanity-check what was loaded.
coins['btc'].head()

Unnamed: 0,open,close,high,low,volume,log_rtrn_1m
2018-01-01 00:02:00,13774.0,13742.0,13777.0,13741.0,16.958926,-0.002181
2018-01-01 00:03:00,13741.0,13722.0,13742.0,13721.0,25.783042,-0.001456
2018-01-01 00:04:00,13722.0,13709.0,13722.0,13709.0,11.431,-0.000948
2018-01-01 00:05:00,13710.0,13628.0,13710.0,13559.0,169.517133,-0.005926
2018-01-01 00:06:00,13628.0,13625.0,13637.0,13621.0,21.351554,-0.00022


In [60]:
def features_candle_stick(df):
    """Derive candlestick-shape features from the OHLC columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``open``, ``close``, ``high`` and ``low`` columns.

    Returns
    -------
    pandas.DataFrame
        ``average_price`` - mean of close, high and low;
        ``up_shdw`` - high minus the larger of close/open;
        ``dwn_shdw`` - the smaller of close/open minus low.
    """
    body_top = np.maximum(df.close, df.open)
    body_bottom = np.minimum(df.close, df.open)
    return pd.DataFrame().assign(
        average_price=(df.close + df.high + df.low) / 3,
        up_shdw=df.high - body_top,
        dwn_shdw=body_bottom - df.low,
    )

def features_volatility(df):
    """Compute rolling and per-row volatility features from OHLC data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``open``, ``close``, ``high`` and ``low`` columns. The
        index must be datetime-like and monotonic, because ``vol_2h`` uses a
        time-based rolling window.

    Returns
    -------
    pandas.DataFrame
        ``vol_15m`` - rolling std of close over the last 15 *rows*
        (this equals 15 minutes only if the rows are gap-free 1-minute bars);
        ``vol_2h`` - rolling std of close over the trailing 2 hours;
        ``vol_gs`` - per-row Garman-Klass term
        ``0.5*ln(H/L)^2 - (2*ln2 - 1)*ln(C/O)^2``;
        ``vol_rs`` - per-row Rogers-Satchell-style term
        ``ln(H/C)*ln(H/O) + ln(L/C)*ln(L/O)``.
        No square root is taken for the last two, so they are variance-like.
    """
    X = pd.DataFrame()
    # min_periods=1 emits a value from the first row on; the std of a single
    # observation is NaN, so the first entry of both rolling columns is NaN.
    X['vol_15m'] = df.close.rolling(15, min_periods=1).std()
    # A Timedelta window avoids the deprecated upper-case 'H' offset alias
    # while selecting exactly the same 2-hour trailing window.
    X['vol_2h'] = df.close.rolling(pd.Timedelta(hours=2), min_periods=1).std()

    # Garman-Klass term (the column name 'vol_gs' is kept for compatibility).
    log_hl = np.log(df.high / df.low)
    log_co = np.log(df.close / df.open)
    X['vol_gs'] = (1 / 2 * log_hl ** 2 - (2 * np.log(2) - 1) * log_co ** 2)

    X['vol_rs'] = (np.log(df.high / df.close) * np.log(df.high / df.open)
                   + np.log(df.low / df.close) * np.log(df.low / df.open))
    return X

def features_ewm(df, spans=(21, 35, 80, 250), min_periods=5):
    """Exponentially weighted moving averages of the close price.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``close`` column.
    spans : iterable of int, optional
        EWM spans to compute, one output column per span. The default
        reproduces the original four columns.
    min_periods : int, optional
        Observations required before a value is emitted; earlier rows are NaN.

    Returns
    -------
    pandas.DataFrame
        One column ``ewm_<span>`` per requested span, in the order given.
    """
    X = pd.DataFrame()
    for span in spans:
        # adjust=False selects the recursive form
        # y_t = (1 - a) * y_{t-1} + a * x_t with a = 2 / (span + 1).
        X['ewm_{}'.format(span)] = df.close.ewm(
            span=span, min_periods=min_periods, adjust=False
        ).mean()
    return X

In [81]:
def make_target(df):
    """Build the prediction target: the ``close`` value of the following row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``close`` column.

    Returns
    -------
    pandas.DataFrame
        Single column ``target``. Rows whose shifted close is NaN (at least
        the last row, which has no successor) are left out.
    """
    # Disabled alternative kept for reference (``log_return`` is not defined in
    # this notebook - presumably it comes from ``misc``; verify):
    #     log_return(df.close, periods=-16) - log_return(df.close, periods=-1)
    next_close = df.close.shift(-1)
    y = pd.DataFrame()
    y['target'] = next_close[next_close.notna()]
    return y


In [82]:
# Select the coin once and derive BOTH the features and the target from it,
# so changing `coin` cannot pair one coin's features with another's target.
coin = 'btc'
# Raw columns plus EWM features; dropna() removes the EWM warm-up rows
# (and any other row containing a NaN).
X = pd.concat([coins[coin], features_ewm(coins[coin])], axis=1).dropna()

y = make_target(coins[coin])


In [83]:
# Keep only the index labels present in both frames so rows of X and y line up.
X, y = X.align(y, join='inner', axis=0)

In [63]:
def make_mi_scores(X, y, random_state=0):
    """Rank the columns of ``X`` by estimated mutual information with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels become the index of the result.
    y : array-like
        Target values. A single-column 2-D input (e.g. a one-column
        DataFrame) is flattened to 1-D before being passed on.
    random_state : int or None, optional
        Seed forwarded to ``mutual_info_regression`` so repeated calls give
        the same scores. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    pandas.Series
        Scores named ``"MI Scores"``, sorted from highest to lowest.
    """
    target = np.asarray(y)
    if target.ndim == 2 and target.shape[1] == 1:
        # scikit-learn expects a 1-D target and warns on a column vector.
        target = target.ravel()
    scores = mutual_info_regression(X, target, random_state=random_state)
    ranked = pd.Series(scores, name="MI Scores", index=X.columns)
    return ranked.sort_values(ascending=False)

In [65]:
# Score every column of X against the target and show the ranking.
temp2 = make_mi_scores(X, y)
display(temp2)

close          0.872753
high           0.829890
low            0.826953
open           0.791258
ewm_21         0.654786
ewm_35         0.609358
ewm_80         0.542442
ewm_250        0.459580
log_rtrn_1m    0.121671
log_rtrn       0.121658
volume         0.058766
Name: MI Scores, dtype: float64

In [55]:
# Raw MI estimates, one per column of X in X.columns order.
# The estimator is randomised, so a fixed seed keeps reruns comparable; the
# single-column target is flattened to the 1-D shape scikit-learn expects.
# NOTE(review): the saved output of this cell has 15 values - it appears to come
# from a run where X held more feature columns than the cell that builds X
# produces now; confirm before relying on it.
mutual_info_regression(X, np.ravel(y), random_state=0)

array([0.79127577, 0.87275275, 0.82985689, 0.82676975, 0.0587861 ,
       0.12167156, 0.12176176, 0.85733117, 0.03729711, 0.03669953,
       0.1111585 , 0.12173588, 0.115492  , 0.07991364, 0.65476636])

In [57]:
# Snapshot of MI scores pasted from an earlier run with 15 feature columns.
# The labels are pinned explicitly instead of being read from the live
# X.columns, so this cell works whatever X currently contains. The label order
# was recovered by matching each value to the sorted listing recorded as the
# output of the next cell.
temp = pd.Series(
    np.array([0.79127577, 0.87275275, 0.82985689, 0.82676975, 0.0587861,
              0.12167156, 0.12176176, 0.85733117, 0.03729711, 0.03669953,
              0.1111585, 0.12173588, 0.115492, 0.07991364, 0.65476636]),
    index=['open', 'close', 'high', 'low', 'volume',
           'log_rtrn', 'log_rtrn_1m', 'average_price', 'up_shdw', 'dwn_shdw',
           'vol_15m', 'vol_2h', 'vol_gs', 'vol_rs', 'ewm_21'],
)

In [59]:
# Show the scores held in `temp`, highest first.
temp.sort_values(ascending=False)

close            0.872753
average_price    0.857331
high             0.829857
low              0.826770
open             0.791276
ewm_21           0.654766
log_rtrn_1m      0.121762
vol_2h           0.121736
log_rtrn         0.121672
vol_gs           0.115492
vol_15m          0.111158
vol_rs           0.079914
volume           0.058786
up_shdw          0.037297
dwn_shdw         0.036700
dtype: float64

In [84]:
# Put features and target side by side in one frame (column-wise concat;
# rows are matched on the index).
total = pd.concat([X, y], axis=1)

In [85]:
# Persist the combined frame next to the inputs. The file name follows the
# `coin` selector set where X is built ('btc' there, i.e. 'btc_total.pkl'),
# so switching coins does not overwrite another coin's file.
total.to_pickle(cwd + '{}_total.pkl'.format(coin))