In [None]:
import numpy as np
import os
import pandas as pd
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
import shap
from sklearn.metrics import mean_squared_error
import datetime

import warnings
warnings.filterwarnings('ignore')

shap.initjs()

In [None]:
def date_string_to_datetime(date):
    """Parse a dot-separated ``DD.MM.YYYY`` string into a ``datetime.datetime``.

    The first three dot-separated fields are reordered to ``YYYY-MM-DD`` and
    parsed with that format, so the result has no time-of-day component.
    """
    fields = date.split('.')
    day, month, year = fields[0], fields[1], fields[2]
    iso_date = '-'.join((year, month, day))
    return datetime.datetime.strptime(iso_date, '%Y-%m-%d')
def string_to_float(value):
    """Convert a number written with '.' thousands separators and a ',' decimal mark.

    Every '.' is removed and every ',' becomes '.', e.g. ``'1.234,56'`` -> ``1234.56``.
    """
    # One pass: drop '.' and turn ',' into '.'.
    normalised = value.translate(str.maketrans({'.': None, ',': '.'}))
    return float(normalised)

In [None]:
# Load every stock-market CSV found under MARKET_DATA_DIR into `datasets`, keyed by
# file name: rename the Portuguese column headers, parse the dates and the price
# columns, and index each frame by date in ascending order.
datasets = dict()
market_datasets_names = []
# NOTE(review): `conversion` is not used by any visible cell; kept in case a later cell reads it.
conversion = {'open' : 'first', 'high' : 'max', 'low' : 'min', 'close' : 'last', 'logreturn': 'mean', 'variance': 'sum'}

MARKET_DATA_DIR = '/kaggle/input/market-stocks-historical-data/'
# Remember the full path reported by os.walk so files in sub-directories still load.
market_dataset_paths = dict()

for dirname, _, filenames in os.walk(MARKET_DATA_DIR):
    for filename in filenames:
        market_datasets_names.append(filename)
        market_dataset_paths[filename] = os.path.join(dirname, filename)
        print(market_dataset_paths[filename])

for dataset in market_datasets_names:
    market_df = pd.read_csv(market_dataset_paths[dataset])
    market_df = market_df.rename(columns={"Data": "open_time", "Último": "close", "Abertura": "open", "Máxima": "high", "Mínima": "low", "Vol.": "volume", "Var%": "var"})
    # Column-wise map instead of a row-wise apply(axis=1): same values, far less overhead.
    market_df['open_time'] = market_df['open_time'].map(date_string_to_datetime)
    for price_column in ('close', 'open', 'high', 'low'):
        market_df[price_column] = market_df[price_column].map(string_to_float)
    market_df = market_df.set_index("open_time").sort_index(ascending=True)
    # Every row receives the values of the following row.
    # NOTE(review): after shift(-1) the last row is all-NaN and a backward fill has
    # nothing after it to copy from, so that row stays NaN -- confirm this is intended.
    datasets[dataset] = market_df.shift(periods=-1).bfill()
    print(datasets[dataset].shape)

datasets['Ibovespa Dados Histricos.csv'].head()

In [None]:
# Crypto pairs to work with. Each name is used both as the key in `datasets`
# and as the stem of the '<pair>.parquet' file that gets loaded.
USED_DATASETS = ['ETH-USDT', 'BTC-USDT', 'XRP-USDT', 'IOTA-USDT', 'LTC-USDT', 'ETH-BTC', 'XRP-BTC', 'IOTA-BTC', 'LTC-BTC']

In [None]:
# Load each Binance pair into `datasets`: keep only rows with at least one trade,
# resample to one row per calendar day, keep open/close and add the log return of
# the close.
BINANCE_DATA_DIR = '/kaggle/input/binance-full-history/'
for dataset in USED_DATASETS:
    pair_df = pd.read_parquet(BINANCE_DATA_DIR + dataset + '.parquet')
    pair_df = pair_df[pair_df.number_of_trades > 0]
    # NOTE(review): resample("1D").bfill() picks one source row per day rather than
    # aggregating open/close over the day -- confirm that is the intended daily view.
    # .copy() makes the frame independent so log_return is not written onto a slice.
    pair_df = pair_df.resample("1D").bfill()[['open', 'close']].copy()
    pair_df['log_return'] = np.log(pair_df.close) - np.log(pair_df.close.shift(1))
    datasets[dataset] = pair_df
    print(pair_df.shape)

datasets['ETH-USDT'].head()

In [None]:
datasets.keys()

In [None]:
def set_min_date(df_dict):
    """Trim every frame in ``df_dict`` so that all of them start on the same date.

    The common start is the latest first-index value among the frames. Each
    entry of ``df_dict`` is replaced in place by the rows at or after that
    date; nothing is returned. The chosen date and each resulting shape and
    first date are printed.

    Args:
        df_dict: mapping of name -> DataFrame with a sorted, comparable index.
    """
    if not df_dict:
        # Nothing to align.
        return
    # Seed from the argument itself rather than from a hard-coded global frame.
    min_date = max(df.index[0] for df in df_dict.values())
    print('Initial date: ', min_date)
    for name, df in df_dict.items():
        df_dict[name] = df[df.index >= min_date]
        print(name, '-- Shape:', df_dict[name].shape, '  First date:', df_dict[name].index[0])

In [None]:
set_min_date(datasets)

In [None]:
def plot_time_series(df_name, column, ylabel = 'Price'):
    """Line-plot one column of a loaded dataset and return the matplotlib Axes.

    Args:
        df_name: key of the frame in the module-level ``datasets`` dict.
        column: column of that frame to plot.
        ylabel: label for the y axis.
    """
    series = datasets[df_name][column]
    ax = series.plot()
    ax.set(title=column + ' price', ylabel=ylabel, xlabel='Time')
    return ax

In [None]:
plot_time_series('XRP-USDT', 'close')

In [None]:
plot_time_series('ETH-USDT', 'log_return', 'Return')

## Determinação de sazonalidade

In [None]:
def plot_lagged_autocorrelation(df, shift_value, corr):
    """Plot ``close`` together with ``close`` shifted by ``shift_value`` rows.

    Args:
        df: frame with a ``close`` column.
        shift_value: number of rows to shift by.
        corr: correlation value shown (rounded to 5 decimals) in the title.

    Returns:
        The matplotlib Axes of the plot.
    """
    # assign() builds a new frame, so nothing is written onto a slice of the caller's df.
    overlay = df[['close']].assign(close_shifted=df['close'].shift(shift_value))
    plot = overlay.plot()
    plot.set_title('Shift Value: ' + str(shift_value) + ' -- Corr: ' + str(round(corr,5)))
    plot.set_ylabel('Price')
    plot.set_xlabel('Time')
    return plot


def lagged_autocorrelation(df, shift_list, plot_lagged = False):
    """Pearson autocorrelation of ``df.close`` at each lag in ``shift_list``.

    Args:
        df: frame with a ``close`` column.
        shift_list: iterable of integer lags, expressed in rows.
        plot_lagged: when True, also plot the series against each lagged copy.

    Returns:
        dict mapping ``str(lag)`` to the correlation. The value is NaN when the
        lag leaves fewer than two overlapping observations.
    """
    results = dict()
    for shift_value in shift_list:
        # Series.autocorr correlates the series with its shifted copy, ignoring pairs
        # that contain NaN. Unlike manual iloc slicing it also works for lag 0 and is
        # not turned into NaN by a single missing close.
        corr = df.close.autocorr(lag=shift_value)
        if plot_lagged:
            plot_lagged_autocorrelation(df, shift_value, corr)
        results[str(shift_value)] = corr
    return results

In [None]:
# Candidate seasonal lags, expressed in rows of the series. The frames hold one
# row per day (they are resampled with "1D" when loaded), so:
# Daily: 1
# Weekly: 7
# Monthly: 30

lagged_autocorrelation(datasets['ETH-USDT'], [1, 7, 30], plot_lagged=True)

In [None]:
# For every dataset, record which candidate lag (in days) has the highest autocorrelation.
corr_best_returns = dict()
for df_name, df in datasets.items():
    corrs = lagged_autocorrelation(df, [1, 7, 30])
    # Comparisons against NaN are always False, so NaN entries would make max()
    # depend on dict order; drop them first.
    valid_corrs = {lag: value for lag, value in corrs.items() if not np.isnan(value)}
    corr_best_returns[df_name] = max(valid_corrs, key = valid_corrs.get) if valid_corrs else None
corr_best_returns

## Correlação entre as séries temporais

In [None]:
def corr_matrix(df_list_name):
    """Correlation matrix of the ``close`` columns of the named datasets.

    The closes are aligned on the index of the first named dataset; frames are
    looked up in the module-level ``datasets`` dict.
    """
    base_index = datasets[df_list_name[0]].index
    close_prices = pd.DataFrame(index=base_index)
    for name in df_list_name:
        close_prices[name] = datasets[name].close
    return close_prices.corr()

In [None]:
sns.heatmap(corr_matrix(['ETH-BTC', 'LTC-BTC', 'IOTA-BTC', 'XRP-BTC']), annot=True, cmap='inferno_r')

In [None]:
sns.heatmap(corr_matrix(['ETH-USDT', 'BTC-USDT', 'XRP-USDT', 'IOTA-USDT', 'LTC-USDT']), annot=True, cmap='inferno_r')

## Treino de modelos

In [None]:
def plot_time_series_forecast(model, X_train, X_test, y_train, y_test):
    """Plot train, test and predicted series, then print RMSE-based error figures.

    Args:
        model: fitted estimator exposing ``predict``.
        X_train, X_test: feature frames; their index is used as the x axis.
        y_train, y_test: target Series (``mean`` and ``median`` are called on y_test).
    """
    # Predict once and reuse the result for the plot and for every metric.
    y_pred = model.predict(X_test)

    plt.figure(figsize=(16,9))
    plt.plot(X_test.index, y_pred, color='red', label = 'predicted')
    plt.plot(X_test.index, y_test, color = 'blue', label = 'real')
    plt.plot(X_train.index, y_train, color = 'green', label = 'treino')
    plt.xlabel('Time')
    plt.ylabel('Price')
    plt.legend()
    plt.show()

    # Only print tuning information when the model actually carries it.
    # NOTE(review): scikit-learn search estimators name this attribute `best_params_`;
    # `bestparams` may be a typo -- confirm against the models passed in.
    missing = object()
    best_params = getattr(model, 'bestparams', missing)
    if best_params is not missing:
        print("Model info:\n", best_params)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("Model RMSE:\n {}".format(rmse))
    print("Model RMSE/mean:\n {}".format(rmse/y_test.mean()))
    print("Model RMSE/median:\n {}".format(rmse/y_test.median()))

### Média móvel

In [None]:
# Training parameters

# Moving-average window: number of previous closes that are averaged.
LAGGED_FEATURES = 4
# Pair whose close price is modelled.
COIN = 'BTC-USDT'

In [None]:
# create_lagged_features builds lags 0..LAGGED_FEATURES-1 and lag 0 is the close
# itself, so a 4-lag moving average needs 5 here. The value is assigned rather
# than incremented so that re-running this cell cannot keep growing it.
LAGGED_FEATURES = 5

In [None]:
def create_lagged_features(df, lagged_features):
    """Return a copy of ``df`` with lagged ``close`` columns, plus the feature names.

    Columns ``lagged_0`` .. ``lagged_{lagged_features - 1}`` are added, where
    ``lagged_i`` is ``close`` shifted down by ``i`` rows. ``lagged_0`` equals
    ``close`` itself, so it is left out of the returned name list.

    Returns:
        (frame, names): the augmented copy and ``['lagged_1', ...]``.
    """
    lagged_df = df.copy()
    for lag in range(lagged_features):
        lagged_df['lagged_{}'.format(lag)] = lagged_df.close.shift(lag)
    feature_names = ['lagged_{}'.format(lag) for lag in range(1, lagged_features)]
    return lagged_df, feature_names

In [None]:
# Moving-average baseline: predict each close as the mean of the previous
# LAGGED_FEATURES-1 closes.
df, lagged_features_name = create_lagged_features(datasets[COIN], LAGGED_FEATURES)
# Drop the leading rows whose largest lag is undefined. .copy() makes the frame
# independent, so the new columns below are not written onto a slice.
df = df.iloc[LAGGED_FEATURES-1:].copy()
df['lagged_sum'] = df[lagged_features_name].sum(axis=1)
df['y_pred'] = df['lagged_sum']/(LAGGED_FEATURES-1)
y_pred = list(df['y_pred'])
# RMSE of the baseline over the whole series (there is no train/test split here).
np.sqrt(mean_squared_error(df.close, y_pred))

In [None]:
# Overlay the moving-average prediction and the real close, indexed by row position.
plt.plot(range(len(y_pred)), y_pred, color='blue', label = 'predicted')
plt.plot(range(len(y_pred)), df.close, color = 'red', label = 'real')
plt.legend()
plt.show()

### Regressão linear usando variáveis atrasadas

Aqui, vamos realizar uma regressão linear e para isso usaremos variáveis atrasadas. Além disso, criamos funções para gerar variáveis atrasadas e também para separar nossa série em treino e teste.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built and lag 0 is excluded from the feature
# list, so the model sees LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 5
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is modelled.
COIN = 'BTC-USDT'

In [None]:
def create_lagged_features(df, lagged_features):
    """Return a copy of ``df`` with lagged ``close`` columns, plus the feature names.

    Columns ``lagged_0`` .. ``lagged_{lagged_features - 1}`` are added, where
    ``lagged_i`` is ``close`` shifted down by ``i`` rows. ``lagged_0`` equals
    ``close`` itself, so it is left out of the returned name list.

    Returns:
        (frame, names): the augmented copy and ``['lagged_1', ...]``.
    """
    lagged_df = df.copy()
    for lag in range(lagged_features):
        lagged_df['lagged_{}'.format(lag)] = lagged_df.close.shift(lag)
    feature_names = ['lagged_{}'.format(lag) for lag in range(1, lagged_features)]
    return lagged_df, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged-feature frame chronologically into train and test parts.

    The last ``int(len(df) * test_ratio)`` rows are the test set. The training
    set is every earlier row except the first ``lagged_features`` rows, which
    include the rows left incomplete by shifting.

    Returns:
        X_train, y_train, X_test, y_test -- the X parts hold the columns in
        ``lagged_features_name``; the y parts are the ``close`` Series.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_ratio)
    # Split on a non-negative position. Slicing with -n_test breaks when n_test
    # is 0, because -0 == 0 turns the training slice empty and the test slice
    # into the whole frame.
    split_at = max(n_rows - n_test, 0)
    train_rows = df.iloc[lagged_features:split_at]
    test_rows = df.iloc[split_at:]
    return (train_rows[lagged_features_name], train_rows['close'],
            test_rows[lagged_features_name], test_rows['close'])

In [None]:
# Build the lagged features for COIN, split chronologically, fit a linear
# regression on the training part and show the test-set RMSE.
df, lagged_features_name = create_lagged_features(datasets[COIN], LAGGED_FEATURES)
X_train, y_train, X_test, y_test = train_test_split(df, TEST_RATIO, LAGGED_FEATURES, lagged_features_name)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Overlay the predicted and real test-set closes, indexed by position in the test set.
plt.plot(range(len(y_pred)), y_pred, color='red', label = 'predicted')
plt.plot(range(len(y_pred)), y_test, color = 'blue', label = 'real')
plt.xlabel('Time')
plt.ylabel('Price')
plt.legend()
plt.show()

### Explicabilidade da Regressão Linear

In [None]:
# Build a SHAP explainer for the fitted linear model from the training features
# and compute SHAP values for every test row.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)
# No conversion happens here: X_test_array is just another name for X_test,
# kept because the plotting cells below refer to it.
X_test_array = X_test

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=X_test.columns)

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=X_test.columns, plot_type='bar')

## KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built and lag 0 is excluded from the feature
# list, so the model sees LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 4
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is modelled.
COIN = 'XRP-USDT'

In [None]:
def create_lagged_features(df, lagged_features):
    """Return a copy of ``df`` with lagged ``close`` columns, plus the feature names.

    Columns ``lagged_0`` .. ``lagged_{lagged_features - 1}`` are added, where
    ``lagged_i`` is ``close`` shifted down by ``i`` rows. ``lagged_0`` equals
    ``close`` itself, so it is left out of the returned name list.

    Returns:
        (frame, names): the augmented copy and ``['lagged_1', ...]``.
    """
    lagged_df = df.copy()
    for lag in range(lagged_features):
        lagged_df['lagged_{}'.format(lag)] = lagged_df.close.shift(lag)
    feature_names = ['lagged_{}'.format(lag) for lag in range(1, lagged_features)]
    return lagged_df, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged-feature frame chronologically and standardise the features.

    The last ``int(len(df) * test_ratio)`` rows are the test set. The training
    set is every earlier row except the first ``lagged_features`` rows, which
    include the rows left incomplete by shifting.

    Returns:
        X_train, y_train, X_test, y_test -- the X parts are the arrays produced
        by ``StandardScaler``; the y parts are the ``close`` Series.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_ratio)
    # Split on a non-negative position. Slicing with -n_test breaks when n_test
    # is 0, because -0 == 0 turns the training slice empty and the test slice
    # into the whole frame.
    split_at = max(n_rows - n_test, 0)
    train_rows = df.iloc[lagged_features:split_at]
    test_rows = df.iloc[split_at:]

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    X_test = test_rows[lagged_features_name]
    # An empty test set is returned as an unscaled empty array instead of being
    # handed to the scaler.
    X_test = scaler.transform(X_test) if len(X_test) else X_test.to_numpy()
    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build the lagged features for COIN, split chronologically (features are
# standardised inside train_test_split), fit a 5-nearest-neighbours regressor
# and show the test-set RMSE.
df, lagged_features_name = create_lagged_features(datasets[COIN], LAGGED_FEATURES)
X_train, y_train, X_test, y_test = train_test_split(df, TEST_RATIO, LAGGED_FEATURES, lagged_features_name)
reg = KNeighborsRegressor(n_neighbors=5, weights='uniform', p=2).fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Overlay the predicted and real test-set closes, indexed by position in the test set.
plt.plot(range(len(y_pred)), y_pred, color='red', label = 'predicted')
plt.plot(range(len(y_pred)), y_test, color = 'blue', label = 'real')
plt.xlabel('Time')
plt.ylabel('Price')
plt.legend()
plt.show()

## SVR

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built and lag 0 is excluded from the feature
# list, so the model sees LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 4
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is modelled.
COIN = 'XRP-USDT'

In [None]:
def create_lagged_features(df, lagged_features):
    """Return a copy of ``df`` with lagged ``close`` columns, plus the feature names.

    Columns ``lagged_0`` .. ``lagged_{lagged_features - 1}`` are added, where
    ``lagged_i`` is ``close`` shifted down by ``i`` rows. ``lagged_0`` equals
    ``close`` itself, so it is left out of the returned name list.

    Returns:
        (frame, names): the augmented copy and ``['lagged_1', ...]``.
    """
    lagged_df = df.copy()
    for lag in range(lagged_features):
        lagged_df['lagged_{}'.format(lag)] = lagged_df.close.shift(lag)
    feature_names = ['lagged_{}'.format(lag) for lag in range(1, lagged_features)]
    return lagged_df, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged-feature frame chronologically and standardise the features.

    The last ``int(len(df) * test_ratio)`` rows are the test set. The training
    set is every earlier row except the first ``lagged_features`` rows, which
    include the rows left incomplete by shifting.

    Returns:
        X_train, y_train, X_test, y_test -- the X parts are the arrays produced
        by ``StandardScaler``; the y parts are the ``close`` Series.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_ratio)
    # Split on a non-negative position. Slicing with -n_test breaks when n_test
    # is 0, because -0 == 0 turns the training slice empty and the test slice
    # into the whole frame.
    split_at = max(n_rows - n_test, 0)
    train_rows = df.iloc[lagged_features:split_at]
    test_rows = df.iloc[split_at:]

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    X_test = test_rows[lagged_features_name]
    # An empty test set is returned as an unscaled empty array instead of being
    # handed to the scaler.
    X_test = scaler.transform(X_test) if len(X_test) else X_test.to_numpy()
    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build the lagged features for COIN, split chronologically (features are
# standardised inside train_test_split), fit a linear-kernel SVR and show the
# test-set RMSE.
df, lagged_features_name = create_lagged_features(datasets[COIN], LAGGED_FEATURES)
X_train, y_train, X_test, y_test = train_test_split(df, TEST_RATIO, LAGGED_FEATURES, lagged_features_name)
reg = SVR(kernel='linear', C=1, epsilon = 0.0004).fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Overlay the predicted and real test-set closes, indexed by position in the test set.
plt.plot(range(len(y_pred)), y_pred, color='red', label = 'predicted')
plt.plot(range(len(y_pred)), y_test, color = 'blue', label = 'real')
plt.xlabel('Time')
plt.ylabel('Price')
plt.legend()
plt.show()

## Regressão linear usando múltiplos datasets

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built per coin and lag 0 is excluded from the
# feature list, so each coin contributes LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 4
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is predicted.
COIN = 'LTC-USDT'
# Pairs whose lagged closes are used as features.
COINS_TO_USE = [COIN, 'BTC-USDT']

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Add lagged ``close`` columns of several coins to a copy of ``coin``'s frame.

    For every name in ``related_coins`` and every lag in
    ``range(lagged_features)`` a column ``'lagged_<lag><name>'`` is added,
    holding that coin's ``close`` shifted down by ``lag`` rows. Lag 0 is the
    unshifted close, so it is left out of the returned feature-name list.

    Returns:
        (frame, names): the augmented copy of ``df_dict[coin]`` and the names
        of the columns with lag >= 1, in creation order.
    """
    features = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = 'lagged_{}'.format(lag) + other
            features[column] = df_dict[other].close.shift(lag)
            if lag > 0:
                feature_names.append(column)
    return features, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged-feature frame chronologically and standardise the features.

    The last ``int(len(df) * test_ratio)`` rows are the test set. The training
    set is every earlier row except the first ``lagged_features`` rows, which
    include the rows left incomplete by shifting.

    Returns:
        X_train, y_train, X_test, y_test -- the X parts are the arrays produced
        by ``StandardScaler``; the y parts are the ``close`` Series.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_ratio)
    # Split on a non-negative position. Slicing with -n_test breaks when n_test
    # is 0, because -0 == 0 turns the training slice empty and the test slice
    # into the whole frame.
    split_at = max(n_rows - n_test, 0)
    train_rows = df.iloc[lagged_features:split_at]
    test_rows = df.iloc[split_at:]

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    X_test = test_rows[lagged_features_name]
    # An empty test set is returned as an unscaled empty array instead of being
    # handed to the scaler.
    X_test = scaler.transform(X_test) if len(X_test) else X_test.to_numpy()
    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build lagged features of every pair in COINS_TO_USE on COIN's frame, split
# chronologically, fit a linear regression and show the test-set RMSE.
df, lagged_features_name = create_multiple_lagged_features(datasets, LAGGED_FEATURES, COIN, COINS_TO_USE)
X_train, y_train, X_test, y_test = train_test_split(df, TEST_RATIO, LAGGED_FEATURES, lagged_features_name)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Overlay the predicted and real test-set closes, indexed by position in the test set.
plt.plot(range(len(y_pred)), y_pred, color='blue', label = 'predicted')
plt.plot(range(len(y_pred)), y_test, color = 'red', label = 'real')
plt.legend()
plt.show()

In [None]:
# Build a SHAP explainer for the fitted linear model from the training features
# and compute SHAP values for every test row.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)
# No conversion happens here: X_test_array is just another name for X_test,
# kept because the plotting cells below refer to it.
X_test_array = X_test

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name)

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name, plot_type='bar')

#### Análise do efeito das outras moedas no Bitcoin e vice-versa

1. Bitcoin e Litecoin

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built per coin and lag 0 is excluded from the
# feature list, so each coin contributes LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 8
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is predicted.
COIN = 'BTC-USDT'
# Pairs whose lagged closes are used as features.
COINS_TO_USE = [COIN, 'LTC-USDT']

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Add lagged ``close`` columns of several coins to a copy of ``coin``'s frame.

    For every name in ``related_coins`` and every lag in
    ``range(lagged_features)`` a column ``'lagged_<lag><name>'`` is added,
    holding that coin's ``close`` shifted down by ``lag`` rows. Lag 0 is the
    unshifted close, so it is left out of the returned feature-name list.

    Returns:
        (frame, names): the augmented copy of ``df_dict[coin]`` and the names
        of the columns with lag >= 1, in creation order.
    """
    features = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = 'lagged_{}'.format(lag) + other
            features[column] = df_dict[other].close.shift(lag)
            if lag > 0:
                feature_names.append(column)
    return features, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged-feature frame chronologically and standardise the features.

    The last ``int(len(df) * test_ratio)`` rows are the test set. The training
    set is every earlier row except the first ``lagged_features`` rows, which
    include the rows left incomplete by shifting.

    Returns:
        X_train, y_train, X_test, y_test -- the X parts are the arrays produced
        by ``StandardScaler``; the y parts are the ``close`` Series.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_ratio)
    # Split on a non-negative position. Slicing with -n_test breaks when n_test
    # is 0, because -0 == 0 turns the training slice empty and the test slice
    # into the whole frame.
    split_at = max(n_rows - n_test, 0)
    train_rows = df.iloc[lagged_features:split_at]
    test_rows = df.iloc[split_at:]

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    X_test = test_rows[lagged_features_name]
    # An empty test set is returned as an unscaled empty array instead of being
    # handed to the scaler.
    X_test = scaler.transform(X_test) if len(X_test) else X_test.to_numpy()
    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build lagged features of every pair in COINS_TO_USE on COIN's frame, split
# chronologically, fit a linear regression and show the test-set RMSE.
df, lagged_features_name = create_multiple_lagged_features(datasets, LAGGED_FEATURES, COIN, COINS_TO_USE)
X_train, y_train, X_test, y_test = train_test_split(df, TEST_RATIO, LAGGED_FEATURES, lagged_features_name)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Overlay the predicted and real test-set closes, indexed by position in the test set.
plt.plot(range(len(y_pred)), y_pred, color='blue', label = 'predicted')
plt.plot(range(len(y_pred)), y_test, color = 'red', label = 'real')
plt.legend()
plt.show()

In [None]:
# Build a SHAP explainer for the fitted linear model from the training features
# and compute SHAP values for every test row.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)
# No conversion happens here: X_test_array is just another name for X_test,
# kept because the plotting cells below refer to it.
X_test_array = X_test

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name)

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name, plot_type='bar')

Podemos verificar que a variável mais importante nesse caso é o valor do dia anterior do Bitcoin, enquanto as outras variáveis têm baixíssima influência no valor atual dele.

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built per coin and lag 0 is excluded from the
# feature list, so each coin contributes LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 8
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is predicted.
COIN = 'LTC-USDT'
# Pairs whose lagged closes are used as features.
COINS_TO_USE = [COIN, 'BTC-USDT']

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Add lagged ``close`` columns of several coins to a copy of ``coin``'s frame.

    For every name in ``related_coins`` and every lag in
    ``range(lagged_features)`` a column ``'lagged_<lag><name>'`` is added,
    holding that coin's ``close`` shifted down by ``lag`` rows. Lag 0 is the
    unshifted close, so it is left out of the returned feature-name list.

    Returns:
        (frame, names): the augmented copy of ``df_dict[coin]`` and the names
        of the columns with lag >= 1, in creation order.
    """
    features = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = 'lagged_{}'.format(lag) + other
            features[column] = df_dict[other].close.shift(lag)
            if lag > 0:
                feature_names.append(column)
    return features, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged-feature frame chronologically and standardise the features.

    The last ``int(len(df) * test_ratio)`` rows are the test set. The training
    set is every earlier row except the first ``lagged_features`` rows, which
    include the rows left incomplete by shifting.

    Returns:
        X_train, y_train, X_test, y_test -- the X parts are the arrays produced
        by ``StandardScaler``; the y parts are the ``close`` Series.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_ratio)
    # Split on a non-negative position. Slicing with -n_test breaks when n_test
    # is 0, because -0 == 0 turns the training slice empty and the test slice
    # into the whole frame.
    split_at = max(n_rows - n_test, 0)
    train_rows = df.iloc[lagged_features:split_at]
    test_rows = df.iloc[split_at:]

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    X_test = test_rows[lagged_features_name]
    # An empty test set is returned as an unscaled empty array instead of being
    # handed to the scaler.
    X_test = scaler.transform(X_test) if len(X_test) else X_test.to_numpy()
    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build lagged features of every pair in COINS_TO_USE on COIN's frame, split
# chronologically, fit a linear regression and show the test-set RMSE.
df, lagged_features_name = create_multiple_lagged_features(datasets, LAGGED_FEATURES, COIN, COINS_TO_USE)
X_train, y_train, X_test, y_test = train_test_split(df, TEST_RATIO, LAGGED_FEATURES, lagged_features_name)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Overlay the predicted and real test-set closes, indexed by position in the test set.
plt.plot(range(len(y_pred)), y_pred, color='blue', label = 'predicted')
plt.plot(range(len(y_pred)), y_test, color = 'red', label = 'real')
plt.legend()
plt.show()

In [None]:
# Build a SHAP explainer for the fitted linear model from the training features
# and compute SHAP values for every test row.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)
# No conversion happens here: X_test_array is just another name for X_test,
# kept because the plotting cells below refer to it.
X_test_array = X_test

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name)

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name, plot_type='bar')

Nesse caso, podemos notar que, como era de se esperar, a variável mais importante é o valor do dia anterior do Litecoin. Contudo, algo interessante aparece nas segunda e terceira variáveis mais importantes. Como podemos notar, com base em ambos os gráficos, os valores de 3 e 4 dias atrás do Bitcoin impactam o valor atual do Litecoin. É possível notar que valores positivos do Bitcoin de 4 dias atrás impactam mais negativamente o valor do Litecoin atual do que valores negativos. Já para o caso do valor de 3 dias atrás do Bitcoin, vemos que valores positivos impactam positivamente o preço do Litecoin, enquanto valores negativos impactam negativamente.


**Dessa forma, podemos concluir que o Bitcoin influencia muito mais o preço do Litecoin do que o contrário.**

2. Bitcoin e Ethereum

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built per coin and lag 0 is excluded from the
# feature list, so each coin contributes LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 8
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is predicted.
COIN = 'BTC-USDT'
# Pairs whose lagged closes are used as features.
COINS_TO_USE = [COIN, 'ETH-USDT']

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Add lagged ``close`` columns of several coins to a copy of ``coin``'s frame.

    For every name in ``related_coins`` and every lag in
    ``range(lagged_features)`` a column ``'lagged_<lag><name>'`` is added,
    holding that coin's ``close`` shifted down by ``lag`` rows. Lag 0 is the
    unshifted close, so it is left out of the returned feature-name list.

    Returns:
        (frame, names): the augmented copy of ``df_dict[coin]`` and the names
        of the columns with lag >= 1, in creation order.
    """
    features = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = 'lagged_{}'.format(lag) + other
            features[column] = df_dict[other].close.shift(lag)
            if lag > 0:
                feature_names.append(column)
    return features, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged-feature frame chronologically and standardise the features.

    The last ``int(len(df) * test_ratio)`` rows are the test set. The training
    set is every earlier row except the first ``lagged_features`` rows, which
    include the rows left incomplete by shifting.

    Returns:
        X_train, y_train, X_test, y_test -- the X parts are the arrays produced
        by ``StandardScaler``; the y parts are the ``close`` Series.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_ratio)
    # Split on a non-negative position. Slicing with -n_test breaks when n_test
    # is 0, because -0 == 0 turns the training slice empty and the test slice
    # into the whole frame.
    split_at = max(n_rows - n_test, 0)
    train_rows = df.iloc[lagged_features:split_at]
    test_rows = df.iloc[split_at:]

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    X_test = test_rows[lagged_features_name]
    # An empty test set is returned as an unscaled empty array instead of being
    # handed to the scaler.
    X_test = scaler.transform(X_test) if len(X_test) else X_test.to_numpy()
    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build lagged features of every pair in COINS_TO_USE on COIN's frame, split
# chronologically, fit a linear regression and show the test-set RMSE.
df, lagged_features_name = create_multiple_lagged_features(datasets, LAGGED_FEATURES, COIN, COINS_TO_USE)
X_train, y_train, X_test, y_test = train_test_split(df, TEST_RATIO, LAGGED_FEATURES, lagged_features_name)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Overlay the predicted and real test-set closes, indexed by position in the test set.
plt.plot(range(len(y_pred)), y_pred, color='blue', label = 'predicted')
plt.plot(range(len(y_pred)), y_test, color = 'red', label = 'real')
plt.legend()
plt.show()

In [None]:
# Build a SHAP explainer for the fitted linear model from the training features
# and compute SHAP values for every test row.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)
# No conversion happens here: X_test_array is just another name for X_test,
# kept because the plotting cells below refer to it.
X_test_array = X_test

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name)

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name, plot_type='bar')

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built per coin and lag 0 is excluded from the
# feature list, so each coin contributes LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 8
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is predicted.
COIN = 'ETH-USDT'
# Pairs whose lagged closes are used as features.
COINS_TO_USE = [COIN, 'BTC-USDT']

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Add lagged ``close`` columns of several coins to a copy of ``coin``'s frame.

    For every name in ``related_coins`` and every lag in
    ``range(lagged_features)`` a column ``'lagged_<lag><name>'`` is added,
    holding that coin's ``close`` shifted down by ``lag`` rows. Lag 0 is the
    unshifted close, so it is left out of the returned feature-name list.

    Returns:
        (frame, names): the augmented copy of ``df_dict[coin]`` and the names
        of the columns with lag >= 1, in creation order.
    """
    features = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = 'lagged_{}'.format(lag) + other
            features[column] = df_dict[other].close.shift(lag)
            if lag > 0:
                feature_names.append(column)
    return features, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged-feature frame chronologically and standardise the features.

    The last ``int(len(df) * test_ratio)`` rows are the test set. The training
    set is every earlier row except the first ``lagged_features`` rows, which
    include the rows left incomplete by shifting.

    Returns:
        X_train, y_train, X_test, y_test -- the X parts are the arrays produced
        by ``StandardScaler``; the y parts are the ``close`` Series.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_ratio)
    # Split on a non-negative position. Slicing with -n_test breaks when n_test
    # is 0, because -0 == 0 turns the training slice empty and the test slice
    # into the whole frame.
    split_at = max(n_rows - n_test, 0)
    train_rows = df.iloc[lagged_features:split_at]
    test_rows = df.iloc[split_at:]

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    X_test = test_rows[lagged_features_name]
    # An empty test set is returned as an unscaled empty array instead of being
    # handed to the scaler.
    X_test = scaler.transform(X_test) if len(X_test) else X_test.to_numpy()
    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build lagged features of every pair in COINS_TO_USE on COIN's frame, split
# chronologically, fit a linear regression and show the test-set RMSE.
df, lagged_features_name = create_multiple_lagged_features(datasets, LAGGED_FEATURES, COIN, COINS_TO_USE)
X_train, y_train, X_test, y_test = train_test_split(df, TEST_RATIO, LAGGED_FEATURES, lagged_features_name)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Overlay the predicted and real test-set closes, indexed by position in the test set.
plt.plot(range(len(y_pred)), y_pred, color='blue', label = 'predicted')
plt.plot(range(len(y_pred)), y_test, color = 'red', label = 'real')
plt.legend()
plt.show()

In [None]:
# Build a SHAP explainer for the fitted linear model from the training features
# and compute SHAP values for every test row.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)
# No conversion happens here: X_test_array is just another name for X_test,
# kept because the plotting cells below refer to it.
X_test_array = X_test

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name)

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name, plot_type='bar')

3. Ripple e Bitcoin

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built per coin and lag 0 is excluded from the
# feature list, so each coin contributes LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 8
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is predicted.
COIN = 'BTC-USDT'
# Pairs whose lagged closes are used as features.
COINS_TO_USE = [COIN, 'XRP-USDT']

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Add lagged ``close`` columns of several coins to a copy of ``coin``'s frame.

    For every name in ``related_coins`` and every lag in
    ``range(lagged_features)`` a column ``'lagged_<lag><name>'`` is added,
    holding that coin's ``close`` shifted down by ``lag`` rows. Lag 0 is the
    unshifted close, so it is left out of the returned feature-name list.

    Returns:
        (frame, names): the augmented copy of ``df_dict[coin]`` and the names
        of the columns with lag >= 1, in creation order.
    """
    features = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = 'lagged_{}'.format(lag) + other
            features[column] = df_dict[other].close.shift(lag)
            if lag > 0:
                feature_names.append(column)
    return features, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged-feature frame chronologically and standardise the features.

    The last ``int(len(df) * test_ratio)`` rows are the test set. The training
    set is every earlier row except the first ``lagged_features`` rows, which
    include the rows left incomplete by shifting.

    Returns:
        X_train, y_train, X_test, y_test -- the X parts are the arrays produced
        by ``StandardScaler``; the y parts are the ``close`` Series.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_ratio)
    # Split on a non-negative position. Slicing with -n_test breaks when n_test
    # is 0, because -0 == 0 turns the training slice empty and the test slice
    # into the whole frame.
    split_at = max(n_rows - n_test, 0)
    train_rows = df.iloc[lagged_features:split_at]
    test_rows = df.iloc[split_at:]

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    X_test = test_rows[lagged_features_name]
    # An empty test set is returned as an unscaled empty array instead of being
    # handed to the scaler.
    X_test = scaler.transform(X_test) if len(X_test) else X_test.to_numpy()
    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build lagged features of every pair in COINS_TO_USE on COIN's frame, split
# chronologically, fit a linear regression and show the test-set RMSE.
df, lagged_features_name = create_multiple_lagged_features(datasets, LAGGED_FEATURES, COIN, COINS_TO_USE)
X_train, y_train, X_test, y_test = train_test_split(df, TEST_RATIO, LAGGED_FEATURES, lagged_features_name)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Overlay the predicted and real test-set closes, indexed by position in the test set.
plt.plot(range(len(y_pred)), y_pred, color='blue', label = 'predicted')
plt.plot(range(len(y_pred)), y_test, color = 'red', label = 'real')
plt.legend()
plt.show()

In [None]:
# Build a SHAP explainer for the fitted linear model from the training features
# and compute SHAP values for every test row.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)
# No conversion happens here: X_test_array is just another name for X_test,
# kept because the plotting cells below refer to it.
X_test_array = X_test

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name)

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name, plot_type='bar')

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built per coin and lag 0 is excluded from the
# feature list, so each coin contributes LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 4
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is predicted.
COIN = 'XRP-USDT'
# Pairs whose lagged closes are used as features.
COINS_TO_USE = [COIN, 'BTC-USDT']

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Add lagged ``close`` columns of several coins to a copy of ``coin``'s frame.

    For every name in ``related_coins`` and every lag in
    ``range(lagged_features)`` a column ``'lagged_<lag><name>'`` is added,
    holding that coin's ``close`` shifted down by ``lag`` rows. Lag 0 is the
    unshifted close, so it is left out of the returned feature-name list.

    Returns:
        (frame, names): the augmented copy of ``df_dict[coin]`` and the names
        of the columns with lag >= 1, in creation order.
    """
    features = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = 'lagged_{}'.format(lag) + other
            features[column] = df_dict[other].close.shift(lag)
            if lag > 0:
                feature_names.append(column)
    return features, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged-feature frame chronologically and standardise the features.

    The last ``int(len(df) * test_ratio)`` rows are the test set. The training
    set is every earlier row except the first ``lagged_features`` rows, which
    include the rows left incomplete by shifting.

    Returns:
        X_train, y_train, X_test, y_test -- the X parts are the arrays produced
        by ``StandardScaler``; the y parts are the ``close`` Series.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_ratio)
    # Split on a non-negative position. Slicing with -n_test breaks when n_test
    # is 0, because -0 == 0 turns the training slice empty and the test slice
    # into the whole frame.
    split_at = max(n_rows - n_test, 0)
    train_rows = df.iloc[lagged_features:split_at]
    test_rows = df.iloc[split_at:]

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    X_test = test_rows[lagged_features_name]
    # An empty test set is returned as an unscaled empty array instead of being
    # handed to the scaler.
    X_test = scaler.transform(X_test) if len(X_test) else X_test.to_numpy()
    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build lagged features of every pair in COINS_TO_USE on COIN's frame, split
# chronologically, fit a linear regression and show the test-set RMSE.
df, lagged_features_name = create_multiple_lagged_features(datasets, LAGGED_FEATURES, COIN, COINS_TO_USE)
X_train, y_train, X_test, y_test = train_test_split(df, TEST_RATIO, LAGGED_FEATURES, lagged_features_name)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Overlay the predicted and real test-set closes, indexed by position in the test set.
plt.plot(range(len(y_pred)), y_pred, color='blue', label = 'predicted')
plt.plot(range(len(y_pred)), y_test, color = 'red', label = 'real')
plt.legend()
plt.show()

In [None]:
# Build a SHAP explainer for the fitted linear model from the training features
# and compute SHAP values for every test row.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)
# No conversion happens here: X_test_array is just another name for X_test,
# kept because the plotting cells below refer to it.
X_test_array = X_test

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name)

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name, plot_type='bar')

4. BTC e IOTA

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built per coin and lag 0 is excluded from the
# feature list, so each coin contributes LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 8
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is predicted.
COIN = 'BTC-USDT'
# Pairs whose lagged closes are used as features.
COINS_TO_USE = [COIN, 'IOTA-USDT']

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Add lagged ``close`` columns of several coins to a copy of ``coin``'s frame.

    For every name in ``related_coins`` and every lag in
    ``range(lagged_features)`` a column ``'lagged_<lag><name>'`` is added,
    holding that coin's ``close`` shifted down by ``lag`` rows. Lag 0 is the
    unshifted close, so it is left out of the returned feature-name list.

    Returns:
        (frame, names): the augmented copy of ``df_dict[coin]`` and the names
        of the columns with lag >= 1, in creation order.
    """
    features = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = 'lagged_{}'.format(lag) + other
            features[column] = df_dict[other].close.shift(lag)
            if lag > 0:
                feature_names.append(column)
    return features, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged-feature frame chronologically and standardise the features.

    The last ``int(len(df) * test_ratio)`` rows are the test set. The training
    set is every earlier row except the first ``lagged_features`` rows, which
    include the rows left incomplete by shifting.

    Returns:
        X_train, y_train, X_test, y_test -- the X parts are the arrays produced
        by ``StandardScaler``; the y parts are the ``close`` Series.
    """
    n_rows = df.shape[0]
    n_test = int(n_rows * test_ratio)
    # Split on a non-negative position. Slicing with -n_test breaks when n_test
    # is 0, because -0 == 0 turns the training slice empty and the test slice
    # into the whole frame.
    split_at = max(n_rows - n_test, 0)
    train_rows = df.iloc[lagged_features:split_at]
    test_rows = df.iloc[split_at:]

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    X_test = test_rows[lagged_features_name]
    # An empty test set is returned as an unscaled empty array instead of being
    # handed to the scaler.
    X_test = scaler.transform(X_test) if len(X_test) else X_test.to_numpy()
    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build lagged features of every pair in COINS_TO_USE on COIN's frame, split
# chronologically, fit a linear regression and show the test-set RMSE.
df, lagged_features_name = create_multiple_lagged_features(datasets, LAGGED_FEATURES, COIN, COINS_TO_USE)
X_train, y_train, X_test, y_test = train_test_split(df, TEST_RATIO, LAGGED_FEATURES, lagged_features_name)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Overlay the predicted and real test-set closes, indexed by position in the test set.
plt.plot(range(len(y_pred)), y_pred, color='blue', label = 'predicted')
plt.plot(range(len(y_pred)), y_test, color = 'red', label = 'real')
plt.legend()
plt.show()

In [None]:
# Build a SHAP explainer for the fitted linear model from the training features
# and compute SHAP values for every test row.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)
# No conversion happens here: X_test_array is just another name for X_test,
# kept because the plotting cells below refer to it.
X_test_array = X_test

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name)

In [None]:
shap.summary_plot(shap_values, X_test_array, feature_names=lagged_features_name, plot_type='bar')

In [None]:
# Training parameters

# Lags 0..LAGGED_FEATURES-1 are built per coin and lag 0 is excluded from the
# feature list, so each coin contributes LAGGED_FEATURES-1 lagged closes.
LAGGED_FEATURES = 8
# Fraction of rows, taken from the end of the series, held out for testing.
TEST_RATIO = 0.3
# Pair whose close price is predicted.
COIN = 'IOTA-USDT'
# Pairs whose lagged closes are used as features.
COINS_TO_USE = [COIN, 'BTC-USDT']

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Add lagged ``close`` columns of several coins to a copy of ``coin``'s frame.

    For every name in ``related_coins`` and every lag in
    ``range(lagged_features)`` a column ``'lagged_<lag><name>'`` is added,
    holding that coin's ``close`` shifted down by ``lag`` rows. Lag 0 is the
    unshifted close, so it is left out of the returned feature-name list.

    Returns:
        (frame, names): the augmented copy of ``df_dict[coin]`` and the names
        of the columns with lag >= 1, in creation order.
    """
    features = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = 'lagged_{}'.format(lag) + other
            features[column] = df_dict[other].close.shift(lag)
            if lag > 0:
                feature_names.append(column)
    return features, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged frame chronologically and standardise the feature columns.

    The last ``int(df.shape[0] * test_ratio)`` rows become the test set. The
    training set is everything before them except the first ``lagged_features``
    rows (presumably dropped because the shifted columns are incomplete there).
    A ``StandardScaler`` is fitted on the training features only and then
    applied to the test features; the targets are left unscaled.

    Args:
        df: Frame holding a ``close`` column and the columns in ``lagged_features_name``.
        test_ratio: Fraction of the rows, taken from the end, used as test set.
        lagged_features: Number of leading rows excluded from the training set.
        lagged_features_name: Names of the feature columns.

    Returns:
        ``(X_train, y_train, X_test, y_test)``: the scaler outputs for the two
        feature blocks and the matching ``close`` slices of ``df``.
    """
    n_test = int(df.shape[0] * test_ratio)
    # NOTE(review): with n_test == 0 the negative bounds below become 0, so the
    # training slice is empty and the "test" slice is the whole frame.
    train_rows = df.iloc[lagged_features:-n_test]
    test_rows = df.iloc[-n_test:]

    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(train_rows[lagged_features_name])
    X_test = feature_scaler.transform(test_rows[lagged_features_name])

    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build the lagged frame for COIN, split it chronologically and fit the linear model.
df, lagged_features_name = create_multiple_lagged_features(
    df_dict=datasets,
    lagged_features=LAGGED_FEATURES,
    coin=COIN,
    related_coins=COINS_TO_USE,
)
X_train, y_train, X_test, y_test = train_test_split(
    df=df,
    test_ratio=TEST_RATIO,
    lagged_features=LAGGED_FEATURES,
    lagged_features_name=lagged_features_name,
)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Test-set RMSE; kept as the last expression so the notebook displays it.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Predicted vs. real close on the test set, drawn against the test-sample position.
fig, ax = plt.subplots()
sample_index = range(len(y_pred))
ax.plot(sample_index, y_pred, color='blue', label='predicted')
ax.plot(sample_index, y_test, color='red', label='real')
# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('test sample')
ax.set_ylabel('close')
ax.set_title(f'{COIN}: predicted vs. real close (test set)')
ax.legend()
plt.show()

In [None]:
# `X_test` is already the array produced by the scaler; it is passed to the
# SHAP plotting calls unchanged under this second name.
X_test_array = X_test
# Explain the fitted regression on the test features, with the training features as reference data.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)

In [None]:
# SHAP summary plot of the test-set SHAP values, one row per lag feature.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
)

In [None]:
# Same SHAP values, drawn with plot_type='bar'.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
    plot_type='bar',
)

5. Ripple e Litecoin

In [None]:
# Training parameters

COIN = 'XRP-USDT'                  # coin whose frame supplies the `close` target
COINS_TO_USE = [COIN, 'LTC-USDT']  # coins whose lagged closes become the features
LAGGED_FEATURES = 8                # shifts generated per coin (0 .. LAGGED_FEATURES - 1)
TEST_RATIO = 0.3                   # fraction of rows, taken from the end, held out for testing

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Attach lagged ``close`` columns of several coins to a copy of one coin's frame.

    Args:
        df_dict: Mapping from coin name to a DataFrame exposing a ``close`` column.
        lagged_features: Number of shifts generated per coin (0 .. lagged_features - 1).
        coin: Key of the frame that is copied and extended.
        related_coins: Coin names whose ``close`` series are shifted and added.

    Returns:
        ``(frame, names)``: the extended copy and the added column names with
        shift >= 1. The shift-0 column is added to the frame but is not listed
        in ``names``.
    """
    frame = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = f"lagged_{lag}{other}"
            # pandas aligns the assigned series on the row index of `frame`.
            frame[column] = df_dict[other].close.shift(lag)
            if lag:
                feature_names.append(column)
    return frame, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged frame chronologically and standardise the feature columns.

    The last ``int(df.shape[0] * test_ratio)`` rows become the test set. The
    training set is everything before them except the first ``lagged_features``
    rows (presumably dropped because the shifted columns are incomplete there).
    A ``StandardScaler`` is fitted on the training features only and then
    applied to the test features; the targets are left unscaled.

    Args:
        df: Frame holding a ``close`` column and the columns in ``lagged_features_name``.
        test_ratio: Fraction of the rows, taken from the end, used as test set.
        lagged_features: Number of leading rows excluded from the training set.
        lagged_features_name: Names of the feature columns.

    Returns:
        ``(X_train, y_train, X_test, y_test)``: the scaler outputs for the two
        feature blocks and the matching ``close`` slices of ``df``.
    """
    n_test = int(df.shape[0] * test_ratio)
    # NOTE(review): with n_test == 0 the negative bounds below become 0, so the
    # training slice is empty and the "test" slice is the whole frame.
    train_rows = df.iloc[lagged_features:-n_test]
    test_rows = df.iloc[-n_test:]

    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(train_rows[lagged_features_name])
    X_test = feature_scaler.transform(test_rows[lagged_features_name])

    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build the lagged frame for COIN, split it chronologically and fit the linear model.
df, lagged_features_name = create_multiple_lagged_features(
    df_dict=datasets,
    lagged_features=LAGGED_FEATURES,
    coin=COIN,
    related_coins=COINS_TO_USE,
)
X_train, y_train, X_test, y_test = train_test_split(
    df=df,
    test_ratio=TEST_RATIO,
    lagged_features=LAGGED_FEATURES,
    lagged_features_name=lagged_features_name,
)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Test-set RMSE; kept as the last expression so the notebook displays it.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Predicted vs. real close on the test set, drawn against the test-sample position.
fig, ax = plt.subplots()
sample_index = range(len(y_pred))
ax.plot(sample_index, y_pred, color='blue', label='predicted')
ax.plot(sample_index, y_test, color='red', label='real')
# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('test sample')
ax.set_ylabel('close')
ax.set_title(f'{COIN}: predicted vs. real close (test set)')
ax.legend()
plt.show()

In [None]:
# `X_test` is already the array produced by the scaler; it is passed to the
# SHAP plotting calls unchanged under this second name.
X_test_array = X_test
# Explain the fitted regression on the test features, with the training features as reference data.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)

In [None]:
# SHAP summary plot of the test-set SHAP values, one row per lag feature.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
)

In [None]:
# Same SHAP values, drawn with plot_type='bar'.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
    plot_type='bar',
)

In [None]:
# Training parameters

COIN = 'LTC-USDT'                  # coin whose frame supplies the `close` target
COINS_TO_USE = [COIN, 'XRP-USDT']  # coins whose lagged closes become the features
LAGGED_FEATURES = 8                # shifts generated per coin (0 .. LAGGED_FEATURES - 1)
TEST_RATIO = 0.3                   # fraction of rows, taken from the end, held out for testing

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Attach lagged ``close`` columns of several coins to a copy of one coin's frame.

    Args:
        df_dict: Mapping from coin name to a DataFrame exposing a ``close`` column.
        lagged_features: Number of shifts generated per coin (0 .. lagged_features - 1).
        coin: Key of the frame that is copied and extended.
        related_coins: Coin names whose ``close`` series are shifted and added.

    Returns:
        ``(frame, names)``: the extended copy and the added column names with
        shift >= 1. The shift-0 column is added to the frame but is not listed
        in ``names``.
    """
    frame = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = f"lagged_{lag}{other}"
            # pandas aligns the assigned series on the row index of `frame`.
            frame[column] = df_dict[other].close.shift(lag)
            if lag:
                feature_names.append(column)
    return frame, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged frame chronologically and standardise the feature columns.

    The last ``int(df.shape[0] * test_ratio)`` rows become the test set. The
    training set is everything before them except the first ``lagged_features``
    rows (presumably dropped because the shifted columns are incomplete there).
    A ``StandardScaler`` is fitted on the training features only and then
    applied to the test features; the targets are left unscaled.

    Args:
        df: Frame holding a ``close`` column and the columns in ``lagged_features_name``.
        test_ratio: Fraction of the rows, taken from the end, used as test set.
        lagged_features: Number of leading rows excluded from the training set.
        lagged_features_name: Names of the feature columns.

    Returns:
        ``(X_train, y_train, X_test, y_test)``: the scaler outputs for the two
        feature blocks and the matching ``close`` slices of ``df``.
    """
    n_test = int(df.shape[0] * test_ratio)
    # NOTE(review): with n_test == 0 the negative bounds below become 0, so the
    # training slice is empty and the "test" slice is the whole frame.
    train_rows = df.iloc[lagged_features:-n_test]
    test_rows = df.iloc[-n_test:]

    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(train_rows[lagged_features_name])
    X_test = feature_scaler.transform(test_rows[lagged_features_name])

    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build the lagged frame for COIN, split it chronologically and fit the linear model.
df, lagged_features_name = create_multiple_lagged_features(
    df_dict=datasets,
    lagged_features=LAGGED_FEATURES,
    coin=COIN,
    related_coins=COINS_TO_USE,
)
X_train, y_train, X_test, y_test = train_test_split(
    df=df,
    test_ratio=TEST_RATIO,
    lagged_features=LAGGED_FEATURES,
    lagged_features_name=lagged_features_name,
)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Test-set RMSE; kept as the last expression so the notebook displays it.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Predicted vs. real close on the test set, drawn against the test-sample position.
fig, ax = plt.subplots()
sample_index = range(len(y_pred))
ax.plot(sample_index, y_pred, color='blue', label='predicted')
ax.plot(sample_index, y_test, color='red', label='real')
# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('test sample')
ax.set_ylabel('close')
ax.set_title(f'{COIN}: predicted vs. real close (test set)')
ax.legend()
plt.show()

In [None]:
# `X_test` is already the array produced by the scaler; it is passed to the
# SHAP plotting calls unchanged under this second name.
X_test_array = X_test
# Explain the fitted regression on the test features, with the training features as reference data.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)

In [None]:
# SHAP summary plot of the test-set SHAP values, one row per lag feature.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
)

In [None]:
# Same SHAP values, drawn with plot_type='bar'.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
    plot_type='bar',
)

6. Ripple e IOTA

In [None]:
# Training parameters

COIN = 'XRP-USDT'                   # coin whose frame supplies the `close` target
COINS_TO_USE = [COIN, 'IOTA-USDT']  # coins whose lagged closes become the features
LAGGED_FEATURES = 8                 # shifts generated per coin (0 .. LAGGED_FEATURES - 1)
TEST_RATIO = 0.3                    # fraction of rows, taken from the end, held out for testing

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Attach lagged ``close`` columns of several coins to a copy of one coin's frame.

    Args:
        df_dict: Mapping from coin name to a DataFrame exposing a ``close`` column.
        lagged_features: Number of shifts generated per coin (0 .. lagged_features - 1).
        coin: Key of the frame that is copied and extended.
        related_coins: Coin names whose ``close`` series are shifted and added.

    Returns:
        ``(frame, names)``: the extended copy and the added column names with
        shift >= 1. The shift-0 column is added to the frame but is not listed
        in ``names``.
    """
    frame = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = f"lagged_{lag}{other}"
            # pandas aligns the assigned series on the row index of `frame`.
            frame[column] = df_dict[other].close.shift(lag)
            if lag:
                feature_names.append(column)
    return frame, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged frame chronologically and standardise the feature columns.

    The last ``int(df.shape[0] * test_ratio)`` rows become the test set. The
    training set is everything before them except the first ``lagged_features``
    rows (presumably dropped because the shifted columns are incomplete there).
    A ``StandardScaler`` is fitted on the training features only and then
    applied to the test features; the targets are left unscaled.

    Args:
        df: Frame holding a ``close`` column and the columns in ``lagged_features_name``.
        test_ratio: Fraction of the rows, taken from the end, used as test set.
        lagged_features: Number of leading rows excluded from the training set.
        lagged_features_name: Names of the feature columns.

    Returns:
        ``(X_train, y_train, X_test, y_test)``: the scaler outputs for the two
        feature blocks and the matching ``close`` slices of ``df``.
    """
    n_test = int(df.shape[0] * test_ratio)
    # NOTE(review): with n_test == 0 the negative bounds below become 0, so the
    # training slice is empty and the "test" slice is the whole frame.
    train_rows = df.iloc[lagged_features:-n_test]
    test_rows = df.iloc[-n_test:]

    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(train_rows[lagged_features_name])
    X_test = feature_scaler.transform(test_rows[lagged_features_name])

    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build the lagged frame for COIN, split it chronologically and fit the linear model.
df, lagged_features_name = create_multiple_lagged_features(
    df_dict=datasets,
    lagged_features=LAGGED_FEATURES,
    coin=COIN,
    related_coins=COINS_TO_USE,
)
X_train, y_train, X_test, y_test = train_test_split(
    df=df,
    test_ratio=TEST_RATIO,
    lagged_features=LAGGED_FEATURES,
    lagged_features_name=lagged_features_name,
)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Test-set RMSE; kept as the last expression so the notebook displays it.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Predicted vs. real close on the test set, drawn against the test-sample position.
fig, ax = plt.subplots()
sample_index = range(len(y_pred))
ax.plot(sample_index, y_pred, color='blue', label='predicted')
ax.plot(sample_index, y_test, color='red', label='real')
# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('test sample')
ax.set_ylabel('close')
ax.set_title(f'{COIN}: predicted vs. real close (test set)')
ax.legend()
plt.show()

In [None]:
# `X_test` is already the array produced by the scaler; it is passed to the
# SHAP plotting calls unchanged under this second name.
X_test_array = X_test
# Explain the fitted regression on the test features, with the training features as reference data.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)

In [None]:
# SHAP summary plot of the test-set SHAP values, one row per lag feature.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
)

In [None]:
# Same SHAP values, drawn with plot_type='bar'.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
    plot_type='bar',
)

In [None]:
# Training parameters

COIN = 'IOTA-USDT'                 # coin whose frame supplies the `close` target
COINS_TO_USE = [COIN, 'XRP-USDT']  # coins whose lagged closes become the features
LAGGED_FEATURES = 8                # shifts generated per coin (0 .. LAGGED_FEATURES - 1)
TEST_RATIO = 0.3                   # fraction of rows, taken from the end, held out for testing

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Attach lagged ``close`` columns of several coins to a copy of one coin's frame.

    Args:
        df_dict: Mapping from coin name to a DataFrame exposing a ``close`` column.
        lagged_features: Number of shifts generated per coin (0 .. lagged_features - 1).
        coin: Key of the frame that is copied and extended.
        related_coins: Coin names whose ``close`` series are shifted and added.

    Returns:
        ``(frame, names)``: the extended copy and the added column names with
        shift >= 1. The shift-0 column is added to the frame but is not listed
        in ``names``.
    """
    frame = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = f"lagged_{lag}{other}"
            # pandas aligns the assigned series on the row index of `frame`.
            frame[column] = df_dict[other].close.shift(lag)
            if lag:
                feature_names.append(column)
    return frame, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged frame chronologically and standardise the feature columns.

    The last ``int(df.shape[0] * test_ratio)`` rows become the test set. The
    training set is everything before them except the first ``lagged_features``
    rows (presumably dropped because the shifted columns are incomplete there).
    A ``StandardScaler`` is fitted on the training features only and then
    applied to the test features; the targets are left unscaled.

    Args:
        df: Frame holding a ``close`` column and the columns in ``lagged_features_name``.
        test_ratio: Fraction of the rows, taken from the end, used as test set.
        lagged_features: Number of leading rows excluded from the training set.
        lagged_features_name: Names of the feature columns.

    Returns:
        ``(X_train, y_train, X_test, y_test)``: the scaler outputs for the two
        feature blocks and the matching ``close`` slices of ``df``.
    """
    n_test = int(df.shape[0] * test_ratio)
    # NOTE(review): with n_test == 0 the negative bounds below become 0, so the
    # training slice is empty and the "test" slice is the whole frame.
    train_rows = df.iloc[lagged_features:-n_test]
    test_rows = df.iloc[-n_test:]

    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(train_rows[lagged_features_name])
    X_test = feature_scaler.transform(test_rows[lagged_features_name])

    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build the lagged frame for COIN, split it chronologically and fit the linear model.
df, lagged_features_name = create_multiple_lagged_features(
    df_dict=datasets,
    lagged_features=LAGGED_FEATURES,
    coin=COIN,
    related_coins=COINS_TO_USE,
)
X_train, y_train, X_test, y_test = train_test_split(
    df=df,
    test_ratio=TEST_RATIO,
    lagged_features=LAGGED_FEATURES,
    lagged_features_name=lagged_features_name,
)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Test-set RMSE; kept as the last expression so the notebook displays it.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Predicted vs. real close on the test set, drawn against the test-sample position.
fig, ax = plt.subplots()
sample_index = range(len(y_pred))
ax.plot(sample_index, y_pred, color='blue', label='predicted')
ax.plot(sample_index, y_test, color='red', label='real')
# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('test sample')
ax.set_ylabel('close')
ax.set_title(f'{COIN}: predicted vs. real close (test set)')
ax.legend()
plt.show()

In [None]:
# `X_test` is already the array produced by the scaler; it is passed to the
# SHAP plotting calls unchanged under this second name.
X_test_array = X_test
# Explain the fitted regression on the test features, with the training features as reference data.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)

In [None]:
# SHAP summary plot of the test-set SHAP values, one row per lag feature.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
)

In [None]:
# Same SHAP values, drawn with plot_type='bar'.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
    plot_type='bar',
)

8. ETH e XRP

In [None]:
# Training parameters

COIN = 'XRP-USDT'                  # coin whose frame supplies the `close` target
COINS_TO_USE = [COIN, 'ETH-USDT']  # coins whose lagged closes become the features
LAGGED_FEATURES = 8                # shifts generated per coin (0 .. LAGGED_FEATURES - 1)
TEST_RATIO = 0.3                   # fraction of rows, taken from the end, held out for testing

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Attach lagged ``close`` columns of several coins to a copy of one coin's frame.

    Args:
        df_dict: Mapping from coin name to a DataFrame exposing a ``close`` column.
        lagged_features: Number of shifts generated per coin (0 .. lagged_features - 1).
        coin: Key of the frame that is copied and extended.
        related_coins: Coin names whose ``close`` series are shifted and added.

    Returns:
        ``(frame, names)``: the extended copy and the added column names with
        shift >= 1. The shift-0 column is added to the frame but is not listed
        in ``names``.
    """
    frame = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = f"lagged_{lag}{other}"
            # pandas aligns the assigned series on the row index of `frame`.
            frame[column] = df_dict[other].close.shift(lag)
            if lag:
                feature_names.append(column)
    return frame, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged frame chronologically and standardise the feature columns.

    The last ``int(df.shape[0] * test_ratio)`` rows become the test set. The
    training set is everything before them except the first ``lagged_features``
    rows (presumably dropped because the shifted columns are incomplete there).
    A ``StandardScaler`` is fitted on the training features only and then
    applied to the test features; the targets are left unscaled.

    Args:
        df: Frame holding a ``close`` column and the columns in ``lagged_features_name``.
        test_ratio: Fraction of the rows, taken from the end, used as test set.
        lagged_features: Number of leading rows excluded from the training set.
        lagged_features_name: Names of the feature columns.

    Returns:
        ``(X_train, y_train, X_test, y_test)``: the scaler outputs for the two
        feature blocks and the matching ``close`` slices of ``df``.
    """
    n_test = int(df.shape[0] * test_ratio)
    # NOTE(review): with n_test == 0 the negative bounds below become 0, so the
    # training slice is empty and the "test" slice is the whole frame.
    train_rows = df.iloc[lagged_features:-n_test]
    test_rows = df.iloc[-n_test:]

    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(train_rows[lagged_features_name])
    X_test = feature_scaler.transform(test_rows[lagged_features_name])

    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build the lagged frame for COIN, split it chronologically and fit the linear model.
df, lagged_features_name = create_multiple_lagged_features(
    df_dict=datasets,
    lagged_features=LAGGED_FEATURES,
    coin=COIN,
    related_coins=COINS_TO_USE,
)
X_train, y_train, X_test, y_test = train_test_split(
    df=df,
    test_ratio=TEST_RATIO,
    lagged_features=LAGGED_FEATURES,
    lagged_features_name=lagged_features_name,
)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Test-set RMSE; kept as the last expression so the notebook displays it.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Predicted vs. real close on the test set, drawn against the test-sample position.
fig, ax = plt.subplots()
sample_index = range(len(y_pred))
ax.plot(sample_index, y_pred, color='blue', label='predicted')
ax.plot(sample_index, y_test, color='red', label='real')
# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('test sample')
ax.set_ylabel('close')
ax.set_title(f'{COIN}: predicted vs. real close (test set)')
ax.legend()
plt.show()

In [None]:
# `X_test` is already the array produced by the scaler; it is passed to the
# SHAP plotting calls unchanged under this second name.
X_test_array = X_test
# Explain the fitted regression on the test features, with the training features as reference data.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)

In [None]:
# SHAP summary plot of the test-set SHAP values, one row per lag feature.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
)

In [None]:
# Same SHAP values, drawn with plot_type='bar'.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
    plot_type='bar',
)

In [None]:
# Training parameters

COIN = 'ETH-USDT'                  # coin whose frame supplies the `close` target
COINS_TO_USE = [COIN, 'XRP-USDT']  # coins whose lagged closes become the features
LAGGED_FEATURES = 8                # shifts generated per coin (0 .. LAGGED_FEATURES - 1)
TEST_RATIO = 0.3                   # fraction of rows, taken from the end, held out for testing

In [None]:
def create_multiple_lagged_features(df_dict, lagged_features, coin, related_coins):
    """Attach lagged ``close`` columns of several coins to a copy of one coin's frame.

    Args:
        df_dict: Mapping from coin name to a DataFrame exposing a ``close`` column.
        lagged_features: Number of shifts generated per coin (0 .. lagged_features - 1).
        coin: Key of the frame that is copied and extended.
        related_coins: Coin names whose ``close`` series are shifted and added.

    Returns:
        ``(frame, names)``: the extended copy and the added column names with
        shift >= 1. The shift-0 column is added to the frame but is not listed
        in ``names``.
    """
    frame = df_dict[coin].copy()
    feature_names = []
    for other in related_coins:
        for lag in range(lagged_features):
            column = f"lagged_{lag}{other}"
            # pandas aligns the assigned series on the row index of `frame`.
            frame[column] = df_dict[other].close.shift(lag)
            if lag:
                feature_names.append(column)
    return frame, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged frame chronologically and standardise the feature columns.

    The last ``int(df.shape[0] * test_ratio)`` rows become the test set. The
    training set is everything before them except the first ``lagged_features``
    rows (presumably dropped because the shifted columns are incomplete there).
    A ``StandardScaler`` is fitted on the training features only and then
    applied to the test features; the targets are left unscaled.

    Args:
        df: Frame holding a ``close`` column and the columns in ``lagged_features_name``.
        test_ratio: Fraction of the rows, taken from the end, used as test set.
        lagged_features: Number of leading rows excluded from the training set.
        lagged_features_name: Names of the feature columns.

    Returns:
        ``(X_train, y_train, X_test, y_test)``: the scaler outputs for the two
        feature blocks and the matching ``close`` slices of ``df``.
    """
    n_test = int(df.shape[0] * test_ratio)
    # NOTE(review): with n_test == 0 the negative bounds below become 0, so the
    # training slice is empty and the "test" slice is the whole frame.
    train_rows = df.iloc[lagged_features:-n_test]
    test_rows = df.iloc[-n_test:]

    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(train_rows[lagged_features_name])
    X_test = feature_scaler.transform(test_rows[lagged_features_name])

    return X_train, train_rows['close'], X_test, test_rows['close']

In [None]:
# Build the lagged frame for COIN, split it chronologically and fit the linear model.
df, lagged_features_name = create_multiple_lagged_features(
    df_dict=datasets,
    lagged_features=LAGGED_FEATURES,
    coin=COIN,
    related_coins=COINS_TO_USE,
)
X_train, y_train, X_test, y_test = train_test_split(
    df=df,
    test_ratio=TEST_RATIO,
    lagged_features=LAGGED_FEATURES,
    lagged_features_name=lagged_features_name,
)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
# Test-set RMSE; kept as the last expression so the notebook displays it.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Predicted vs. real close on the test set, drawn against the test-sample position.
fig, ax = plt.subplots()
sample_index = range(len(y_pred))
ax.plot(sample_index, y_pred, color='blue', label='predicted')
ax.plot(sample_index, y_test, color='red', label='real')
# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('test sample')
ax.set_ylabel('close')
ax.set_title(f'{COIN}: predicted vs. real close (test set)')
ax.legend()
plt.show()

In [None]:
# `X_test` is already the array produced by the scaler; it is passed to the
# SHAP plotting calls unchanged under this second name.
X_test_array = X_test
# Explain the fitted regression on the test features, with the training features as reference data.
explainer = shap.LinearExplainer(reg, X_train)
shap_values = explainer.shap_values(X_test)

In [None]:
# SHAP summary plot of the test-set SHAP values, one row per lag feature.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
)

In [None]:
# Same SHAP values, drawn with plot_type='bar'.
shap.summary_plot(
    shap_values,
    X_test_array,
    feature_names=lagged_features_name,
    plot_type='bar',
)

### MLP

Aqui vamos realizar a predição usando redes neurais com ativações não-lineares. Testaremos diferentes números de camadas e arquiteturas, com o objetivo de obter o melhor resultado (menor RMSE no conjunto de teste).

O código abaixo foi feito com base em: https://github.com/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%204%20-%20S%2BP/S%2BP%20Week%202%20Lesson%203.ipynb

In [None]:
import tensorflow as tf

In [None]:
# Training parameters

COIN = 'BTC-USDT'        # coin whose `close` series is modelled
LAGGED_FEATURES = 4      # number of lagged closes used as inputs
TEST_RATIO = 0.3         # fraction of rows, taken from the end, held out for testing
NEURONS = 50             # units per hidden Dense layer
BATCH_SIZE = 32          # intended mini-batch size for training
SHUFFLE_BUFFER = 10_000  # NOTE(review): not referenced by the MLP cells below

In [None]:
# create_lagged_features generates shifts 0 .. LAGGED_FEATURES - 1 and only keeps
# shifts >= 1 as features, so one extra shift is needed to get the intended count.
# NOTE(review): this cell is not idempotent; every re-run adds another lag.
LAGGED_FEATURES = LAGGED_FEATURES + 1

In [None]:
def create_lagged_features(df, lagged_features):
    """Return a copy of ``df`` extended with shifted versions of its ``close`` column.

    Args:
        df: Frame exposing a ``close`` column.
        lagged_features: Number of shifts to generate (0 .. lagged_features - 1).

    Returns:
        ``(frame, names)``: the extended copy and the names of the columns with
        shift >= 1. The shift-0 column is added to the frame but is not listed
        in ``names``.
    """
    frame = df.copy()
    for lag in range(lagged_features):
        frame[f"lagged_{lag}"] = frame.close.shift(lag)
    # Shift 0 is the current close itself, so it is left out of the feature names.
    feature_names = [f"lagged_{lag}" for lag in range(1, lagged_features)]
    return frame, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged frame chronologically and standardise features and target.

    The last ``int(df.shape[0] * test_ratio)`` rows become the test set. The
    training set is everything before them except the first ``lagged_features``
    rows (presumably dropped because the shifted columns are incomplete there).
    Two ``StandardScaler`` objects are fitted on the training data only, one for
    the feature columns and one for ``close``, and then applied to the test data.

    Args:
        df: Frame holding a ``close`` column and the columns in ``lagged_features_name``.
        test_ratio: Fraction of the rows, taken from the end, used as test set.
        lagged_features: Number of leading rows excluded from the training set.
        lagged_features_name: Names of the feature columns.

    Returns:
        ``(X_train, y_train, X_test, y_test, y_scaler)``: the scaled arrays (the
        targets as single-column 2-D arrays) and the fitted target scaler, which
        the caller needs to map predictions back to the original scale.
    """
    n_test = int(df.shape[0] * test_ratio)
    # NOTE(review): with n_test == 0 the negative bounds below become 0, so the
    # training slice is empty and the "test" slice is the whole frame.
    train_rows = df.iloc[lagged_features:-n_test]
    test_rows = df.iloc[-n_test:]

    def _as_column(series):
        # The scaler is fed a 2-D array with a single column.
        return np.array(series).reshape(-1, 1)

    scaler = StandardScaler()
    y_scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    y_train = y_scaler.fit_transform(_as_column(train_rows['close']))
    X_test = scaler.transform(test_rows[lagged_features_name])
    y_test = y_scaler.transform(_as_column(test_rows['close']))
    return X_train, y_train, X_test, y_test, y_scaler

In [None]:
# Number of trailing rows that form the test set (same formula as in train_test_split);
# used later to slice the unscaled closes for evaluation and plotting.
final_train_index = int(len(datasets[COIN]) * TEST_RATIO)

In [None]:
# Lag the COIN closes, then split chronologically and standardise features and target.
df, lagged_features_name = create_lagged_features(
    df=datasets[COIN],
    lagged_features=LAGGED_FEATURES,
)
X_train, y_train, X_test, y_test, y_scaler = train_test_split(
    df=df,
    test_ratio=TEST_RATIO,
    lagged_features=LAGGED_FEATURES,
    lagged_features_name=lagged_features_name,
)

In [None]:
# MLP regressor: three hidden Dense layers of NEURONS units and one linear output unit.
# The hidden layers use ReLU: with "linear" on every layer the stack collapses to a
# single linear map, which contradicts the non-linear network this section sets out
# to test.
model = tf.keras.models.Sequential([
    # One input per lag feature (LAGGED_FEATURES also counts the unused shift-0 column).
    tf.keras.layers.Dense(NEURONS, input_shape=[LAGGED_FEATURES-1], activation="relu"),
    tf.keras.layers.Dense(NEURONS, activation="relu"),
    tf.keras.layers.Dense(NEURONS, activation="relu"),
    # Linear output for the standardised regression target.
    tf.keras.layers.Dense(1, activation = 'linear'),
])

In [None]:
# MSE loss with Adam. `learning_rate` is the supported keyword; the old `lr`
# alias is deprecated and rejected by newer Keras releases.
model.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4))
# The batch size comes from the parameter cell instead of a repeated literal.
model.fit(X_train, y_train, epochs=1000, verbose=1, batch_size=BATCH_SIZE)

In [None]:
# Predict the whole test set in one batched call instead of one `predict` call per row.
forecast = model.predict(X_test)

# Map the predictions back to the original close scale; the scaler is given an
# explicit single-column 2-D array and the result is flattened again.
results = y_scaler.inverse_transform(np.asarray(forecast).reshape(-1, 1)).ravel()
# Test-set RMSE against the unscaled closes; last expression so the notebook displays it.
np.sqrt(mean_squared_error(datasets[COIN].iloc[-final_train_index:].close, results))

In [None]:
# Predicted vs. real close on the test set, drawn against the test-sample position.
fig, ax = plt.subplots()
sample_index = range(len(results))
ax.plot(sample_index, results, color='blue', label='predicted')
ax.plot(sample_index, datasets[COIN].iloc[-final_train_index:].close, color='red', label='real')
# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('test sample')
ax.set_ylabel('close')
ax.set_title(f'{COIN}: predicted vs. real close (test set)')
ax.legend()
plt.show()

In [None]:
# Explain the trained network on the test features, with the training features
# as reference data for the explainer.
explainer = shap.DeepExplainer(
    model,
    X_train,
)
shap_values = explainer.shap_values(
    X_test,
)

In [None]:
# SHAP summary plot of the test-set SHAP values, labelled with the lag-feature names.
shap.summary_plot(
    shap_values,
    X_test,
    feature_names=lagged_features_name,
)

### LSTM

In [None]:
import tensorflow as tf

In [None]:
# Training parameters

COIN = 'XRP-USDT'        # coin whose `close` series is modelled
LAGGED_FEATURES = 4      # number of lagged closes used as inputs
TEST_RATIO = 0.3         # fraction of rows, taken from the end, held out for testing
BATCH_SIZE = 32          # intended mini-batch size for training
SHUFFLE_BUFFER = 10_000  # NOTE(review): not referenced by the LSTM cells below

In [None]:
# create_lagged_features generates shifts 0 .. LAGGED_FEATURES - 1 and only keeps
# shifts >= 1 as features, so one extra shift is needed to get the intended count.
# NOTE(review): this cell is not idempotent; every re-run adds another lag.
LAGGED_FEATURES = LAGGED_FEATURES + 1

In [None]:
def create_lagged_features(df, lagged_features):
    """Return a copy of ``df`` extended with shifted versions of its ``close`` column.

    Args:
        df: Frame exposing a ``close`` column.
        lagged_features: Number of shifts to generate (0 .. lagged_features - 1).

    Returns:
        ``(frame, names)``: the extended copy and the names of the columns with
        shift >= 1. The shift-0 column is added to the frame but is not listed
        in ``names``.
    """
    frame = df.copy()
    for lag in range(lagged_features):
        frame[f"lagged_{lag}"] = frame.close.shift(lag)
    # Shift 0 is the current close itself, so it is left out of the feature names.
    feature_names = [f"lagged_{lag}" for lag in range(1, lagged_features)]
    return frame, feature_names

def train_test_split(df, test_ratio, lagged_features, lagged_features_name):
    """Split a lagged frame chronologically and standardise features and target.

    The last ``int(df.shape[0] * test_ratio)`` rows become the test set. The
    training set is everything before them except the first ``lagged_features``
    rows (presumably dropped because the shifted columns are incomplete there).
    Two ``StandardScaler`` objects are fitted on the training data only, one for
    the feature columns and one for ``close``, and then applied to the test data.

    Args:
        df: Frame holding a ``close`` column and the columns in ``lagged_features_name``.
        test_ratio: Fraction of the rows, taken from the end, used as test set.
        lagged_features: Number of leading rows excluded from the training set.
        lagged_features_name: Names of the feature columns.

    Returns:
        ``(X_train, y_train, X_test, y_test, y_scaler)``: the scaled arrays (the
        targets as single-column 2-D arrays) and the fitted target scaler, which
        the caller needs to map predictions back to the original scale.
    """
    n_test = int(df.shape[0] * test_ratio)
    # NOTE(review): with n_test == 0 the negative bounds below become 0, so the
    # training slice is empty and the "test" slice is the whole frame.
    train_rows = df.iloc[lagged_features:-n_test]
    test_rows = df.iloc[-n_test:]

    def _as_column(series):
        # The scaler is fed a 2-D array with a single column.
        return np.array(series).reshape(-1, 1)

    scaler = StandardScaler()
    y_scaler = StandardScaler()
    X_train = scaler.fit_transform(train_rows[lagged_features_name])
    y_train = y_scaler.fit_transform(_as_column(train_rows['close']))
    X_test = scaler.transform(test_rows[lagged_features_name])
    y_test = y_scaler.transform(_as_column(test_rows['close']))
    return X_train, y_train, X_test, y_test, y_scaler

In [None]:
# Number of trailing rows that form the test set (same formula as in train_test_split);
# used later to slice the unscaled closes for evaluation and plotting.
final_train_index = int(len(datasets[COIN]) * TEST_RATIO)

In [None]:
# Lag the COIN closes, then split chronologically and standardise features and target.
df, lagged_features_name = create_lagged_features(
    df=datasets[COIN],
    lagged_features=LAGGED_FEATURES,
)
X_train, y_train, X_test, y_test, y_scaler = train_test_split(
    df=df,
    test_ratio=TEST_RATIO,
    lagged_features=LAGGED_FEATURES,
    lagged_features_name=lagged_features_name,
)

In [None]:
# Stacked LSTM regressor. The Lambda layer appends a trailing axis to its input,
# so each row of lag features reaches the LSTMs as a sequence of length-1 vectors.
model = tf.keras.models.Sequential([
    tf.keras.layers.Lambda(
        lambda x: tf.expand_dims(x, axis=-1),
        input_shape=[None],
    ),
    tf.keras.layers.LSTM(4, return_sequences=True),
    tf.keras.layers.LSTM(4, return_sequences=True),
    # No return_sequences on the last recurrent layer: only its final output
    # is passed on to the single-unit Dense head.
    tf.keras.layers.LSTM(4),
    tf.keras.layers.Dense(1),
])

In [None]:
# MSE loss (also tracked as a metric) with Adam. `learning_rate` is the supported
# keyword; the old `lr` alias is deprecated and rejected by newer Keras releases.
model.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), metrics=["mse"])
# The batch size comes from the parameter cell instead of a repeated literal.
history = model.fit(X_train, y_train, epochs=2000, verbose=1, batch_size=BATCH_SIZE)

In [None]:
# Predict the whole test set in one batched call instead of one `predict` call per row.
forecast = model.predict(X_test)

# Map the predictions back to the original close scale; the scaler is given an
# explicit single-column 2-D array and the result is flattened again.
results = y_scaler.inverse_transform(np.asarray(forecast).reshape(-1, 1)).ravel()
# Test-set RMSE against the unscaled closes; last expression so the notebook displays it.
np.sqrt(mean_squared_error(datasets[COIN].iloc[-final_train_index:].close, results))

In [None]:
# Predicted vs. real close on the test set, drawn against the test-sample position.
fig, ax = plt.subplots()
sample_index = range(len(results))
ax.plot(sample_index, results, color='blue', label='predicted')
ax.plot(sample_index, datasets[COIN].iloc[-final_train_index:].close, color='red', label='real')
# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('test sample')
ax.set_ylabel('close')
ax.set_title(f'{COIN}: predicted vs. real close (test set)')
ax.legend()
plt.show()