<div class="alert alert-block alert-info">
<center> <h1> BUSINESS CASES WITH DATA SCIENCE </h1> </center> <br>
<center> Business Case 5: Cryptocurrency Data Visualization </center>

<hr>
<a class="anchor" id="group">
    
### Group
    
</a>

- Celso Christiano Endres Neto		    |   m20200739 <br>
- Gabriel Felipe Martins de Souza   	|   m20210598 <br>
- Luiz Humberto Polaro Vizeu		    |   m20210554 <br>
- Rogerio Domingos Paulo	        	|   m20210597 <br>

**Table of Contents** <br>
* [1.0 Import](#import)
    * [1.1 Import Libs](#libs)
* [2.0 Time Series (Box-Jenkins)](#timeseries)
* [3.0 Time Series Data Preparation and Preprocessing](#data_prep_ts)
* [4.0 Time Series Model and Assessment](#ts_modeling)
* [5.0 Machine Learning](#ml)
* [6.0 ML Data Preparation and Preprocessing](#data_prep_ml)
* [7.0 Machine Learning Model and Assessment](#ml_modeling)


<hr>
<a class="anchor" id="import">
    
# 1.0 Import
    
</a>

<hr>
<a class="anchor" id="libs">
    
## 1.1 Import Libs
    
</a>

In [1]:
#common packages
import pandas as pd
import numpy as np
import glob
from math import ceil, pi, sqrt
import os
from itertools import product
import warnings
warnings.filterwarnings("ignore")
import datetime
import statsmodels.api as sm

#!pip install  holidays
import holidays
import itertools

#dataviz
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()
import graphviz
import matplotlib.cm as cm
import matplotlib.cm as cm

#algorithms for data preparation and preprocessing
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
# !pip install ta
import ta
from ta import add_all_ta_features
from sklearn.feature_selection import RFE


#Modeling and Assessment
from sklearn import datasets, linear_model
#!pip install XGBoost
from sklearn.metrics import mean_squared_error as MSE, r2_score, mean_absolute_percentage_error as MAPE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.neural_network import MLPRegressor

#Time Series and Modeling
#!pip install pmdarima
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.statespace.tools import diff
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox

#importing stock data from Yahoo Finance
#!pip install yahoo-finance
import yfinance as yf


<hr>
<a class="anchor" id="timeseries">
    
# 2.0 Time Series (Box_Jenkins)
    
</a>

<hr>
<a class="anchor" id="data_prep_ts">
    
# 3.0 Time Series Data Preparation and Preprocessing
    
</a>

In [2]:
# list of crptocurrencies as ticker arguments
cryptocurrencies = ['ADA-USD', 'ATOM-USD', 'AVAX-USD', 'AXS-USD', 'BTC-USD', 'ETH-USD',
                     'LINK-USD', 'LUNA1-USD', 'MATIC-USD', 'SOL-USD']

In [3]:
##importing data from yahoo finance lib
data = yf.download(cryptocurrencies, period = '365d', interval = '1d')

[*********************100%***********************]  10 of 10 completed


In [4]:
#storing each indicator in separately dataframe
df_open = data['Open'].reset_index()

df_close = data['Close'].reset_index()

df_adj_close = data['Adj Close'].reset_index()

df_high = data['High'].reset_index()

df_low = data['Low'].reset_index()

df_volume = data['Volume'].reset_index()

In [5]:
# Creating a list with all the currencies
list_of_currencys = df_volume.iloc[:,1:].columns.to_list()

In [6]:
df = {}

for currency in list_of_currencys:
    
    df[currency] = pd.DataFrame()

    #retrieving open price
    df1 = df_open[['Date',currency]].copy()
    #filtering only non-null records
    df1 = df1[~df1[currency].isnull()].copy()
    #renaming column ETH-USD to open, which means the Open price for the currency
    df1.rename(columns={currency: "open"}, inplace=True)

    #retrieving close price
    df2 = df_close[['Date',currency]]
    #filtering only non-null records
    df2 = df2[~df2[currency].isnull()].copy()
    #renaming column ETH-USD to close, which means the Open price for the currency
    df2.rename(columns={currency: "close"}, inplace=True)

    #retrieving adj_close price
    df3 = df_adj_close[['Date',currency]]
    #filtering only non-null records
    df3 = df3[~df3[currency].isnull()].copy()
    #renaming column ETH-USD to adj_close, which means the adj_close price for the currency
    df3.rename(columns={currency: "adj_close"}, inplace=True)

    #retrieving highest price
    df4 = df_high[['Date',currency]]
    #filtering only non-null records
    df4 = df4[~df4[currency].isnull()].copy()
    #renaming column ETH-USD to high, which means the highest price for the currency
    df4.rename(columns={currency: "high"}, inplace=True)

    #retrieving lowest price
    df5 = df_low[['Date',currency]]
    #filtering only non-null records
    df5 = df5[~df5[currency].isnull()].copy()
    #renaming column ETH-USD to df5, which means the lowest price for the currency
    df5.rename(columns={currency: "low"}, inplace=True)

    #retrieving Volume
    df6 = df_volume[['Date',currency]]
    #filtering only non-null records
    df6 = df6[~df6[currency].isnull()].copy()
    #renaming column ETH-USD to Volume, which means the Volume for the currency
    df6.rename(columns={currency: "volume"}, inplace=True)
    
    name=str(currency)

    #merging dataframes into a single dataframe
    temp_2 = pd.merge(df1, df2, left_on='Date', right_on='Date', how='left')
    temp_3 = pd.merge(temp_2, df3, left_on='Date', right_on='Date', how='left')
    temp_4 = pd.merge(temp_3, df4, left_on='Date', right_on='Date', how='left')
    temp_5 = pd.merge(temp_4, df5, left_on='Date', right_on='Date', how='left')
    temp_6 = pd.merge(temp_5, df6, left_on='Date', right_on='Date', how='left')    
    df[currency] = temp_6.copy()
    df[currency]['Date'] = pd.to_datetime(df[currency]['Date'])
    df[currency]['volume'] = df[currency]['volume'].astype('Int64')    

<hr>
<a class="anchor" id="ts_modeling">
    
# 4.0 Time Series Model and Assessment
    
</a>

<hr>
<a class="anchor" id="BTC-USD">
    
## BTC-USD
    
</a>

In [7]:
##creating a df to predict the crypto currency
dfbtc = df['BTC-USD'].copy()

In [8]:
# Creating a new feature for better representing day-wise values
dfbtc['mean'] = (dfbtc['low'] + dfbtc['high'])/2

In [9]:
# Cleaning the data for any NaN or Null fields
dfbtc = dfbtc.dropna()

In [10]:
# Creating a copy for applying shift
dataset_for_prediction = dfbtc.copy()
dataset_for_prediction['Actual']=dataset_for_prediction['close'].shift()
dataset_for_prediction=dataset_for_prediction.dropna()

In [11]:
# date time typecast
dataset_for_prediction['Date'] =pd.to_datetime(dataset_for_prediction['Date'])
dataset_for_prediction.index= dataset_for_prediction['Date']

In [12]:
# normalizing the exogeneous variables
sc_in = MinMaxScaler(feature_range=(0, 1))
scaled_input = sc_in.fit_transform(dataset_for_prediction[['volume']])  #['low', 'high', 'open', 'adj_close', 'volume', 'mean']
scaled_input = pd.DataFrame(scaled_input, index=dataset_for_prediction.index)
X=scaled_input
X.rename(columns={0:'Volume'}, inplace=True)

In [13]:
# normalizing the time series
sc_out = MinMaxScaler(feature_range=(0, 1))
scaler_output = sc_out.fit_transform(dataset_for_prediction[['Actual']])
scaler_output =pd.DataFrame(scaler_output, index=dataset_for_prediction.index)
y=scaler_output
y.rename(columns={0:'Observed Data'}, inplace= True)
y.index=dataset_for_prediction.index

In [14]:
# train-test split (cannot shuffle in case of time series)
train_X, train_y = X[:-7].dropna(), y[:-7].dropna()
test_X, test_y = X[-9:].dropna(), y[-8:].dropna()

In [15]:
# Init the best SARIMAX model
model = SARIMAX(
    train_y,
    exog=train_X,
    order=(0,1,0),
    seasonal_order =(2, 1, 0, 6)
)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [16]:
# training the model
results = model.fit()

In [17]:
# get predictions
predictions = results.predict(start= len(train_y), end= len(train_y)+len(test_y), exog = test_X)

In [18]:
#forecast
fcst = results.predict(start= len(train_y), end= len(train_y)+len(test_y), exog = test_X).to_frame()
fcst2 = sc_out.inverse_transform(fcst)
#storing the predictions in a dataframe
btc_predictions = pd.DataFrame(fcst2, index = fcst.index, columns = ['price'])

<hr>
<a class="anchor" id="ETH-USD">
    
## ETH-USD
    
</a>

In [19]:
#creating a df to predict the crypto currency
dfeth = df['ETH-USD'].copy()

In [20]:
# Creating a new feature for better representing day-wise values
dfeth['mean'] = (dfeth['low'] + dfeth['high'])/2

In [21]:
# Cleaning the data for any NaN or Null fields
dfeth = dfeth.dropna()

In [22]:
# Creating a copy for applying shift
dataset_for_prediction = dfeth.copy()
dataset_for_prediction['Actual']=dataset_for_prediction['close'].shift()
dataset_for_prediction=dataset_for_prediction.dropna()

In [23]:
# date time typecast
dataset_for_prediction['Date'] =pd.to_datetime(dataset_for_prediction['Date'])
dataset_for_prediction.index= dataset_for_prediction['Date']

In [24]:
# normalizing the exogeneous variables
sc_in = MinMaxScaler(feature_range=(0, 1))
scaled_input = sc_in.fit_transform(dataset_for_prediction[['volume']])  #['low', 'high', 'open', 'adj_close', 'volume', 'mean']
scaled_input = pd.DataFrame(scaled_input, index=dataset_for_prediction.index)
X=scaled_input
X.rename(columns={0:'Volume'}, inplace=True)

In [25]:
# normalizing the time series
sc_out = MinMaxScaler(feature_range=(0, 1))
scaler_output = sc_out.fit_transform(dataset_for_prediction[['Actual']])
scaler_output =pd.DataFrame(scaler_output, index=dataset_for_prediction.index)
y=scaler_output
y.rename(columns={0:'Observed Data'}, inplace= True)
y.index=dataset_for_prediction.index

In [26]:
# train-test split (cannot shuffle in case of time series)
train_X, train_y = X[:-7].dropna(), y[:-7].dropna()
test_X, test_y = X[-9:].dropna(), y[-8:].dropna()

In [27]:
# Init the best SARIMAX model
model = SARIMAX(
    train_y,
    exog=train_X,
    order=(1,1,0),
    seasonal_order =(2, 1, 0, 6)
)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [28]:
# training the model
results = model.fit()

In [29]:
# get predictions
predictions = results.predict(start= len(train_y), end= len(train_y)+len(test_y), exog = test_X)

In [30]:
#forecast
fcst = results.predict(start= len(train_y), end= len(train_y)+len(test_y), exog = test_X).to_frame()
fcst2 = sc_out.inverse_transform(fcst)
#storing the predictions in a dataframe
eth_predictions = pd.DataFrame(fcst2, index = fcst.index, columns = ['price'])

<hr>
<a class="anchor" id="LINK-USD">
    
## LINK-USD
    
</a>

In [31]:
#creating a df to predict the crypto currency
dflink = df['LINK-USD'].copy()

In [32]:
# Creating a new feature for better representing day-wise values
dflink['mean'] = (dflink['low'] + dflink['high'])/2

In [33]:
# Cleaning the data for any NaN or Null fields
dflink = dflink.dropna()

In [34]:
# Creating a copy for applying shift
dataset_for_prediction = dflink.copy()
dataset_for_prediction['Actual']=dataset_for_prediction['close'].shift()
dataset_for_prediction=dataset_for_prediction.dropna()

In [35]:
# date time typecast
dataset_for_prediction['Date'] =pd.to_datetime(dataset_for_prediction['Date'])
dataset_for_prediction.index= dataset_for_prediction['Date']

In [36]:
# normalizing the exogeneous variables
sc_in = MinMaxScaler(feature_range=(0, 1))
scaled_input = sc_in.fit_transform(dataset_for_prediction[['volume']])  #['low', 'high', 'open', 'adj_close', 'volume', 'mean']
scaled_input = pd.DataFrame(scaled_input, index=dataset_for_prediction.index)
X=scaled_input
X.rename(columns={0:'Volume'}, inplace=True)

In [37]:
# normalizing the time series
sc_out = MinMaxScaler(feature_range=(0, 1))
scaler_output = sc_out.fit_transform(dataset_for_prediction[['Actual']])
scaler_output =pd.DataFrame(scaler_output, index=dataset_for_prediction.index)
y=scaler_output
y.rename(columns={0:'Observed Data'}, inplace= True)
y.index=dataset_for_prediction.index

In [38]:
# train-test split (cannot shuffle in case of time series)
train_X, train_y = X[:-7].dropna(), y[:-7].dropna()
test_X, test_y = X[-9:].dropna(), y[-8:].dropna()

In [39]:
# Init the best SARIMAX model
model = SARIMAX(
    train_y,
    exog=train_X,
    order=(1,1,1),
    seasonal_order =(2, 1, 0, 6)
)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [40]:
# training the model
results = model.fit()



In [41]:
# get predictions
predictions = results.predict(start= len(train_y), end= len(train_y)+len(test_y), exog = test_X)

In [42]:
#forecast
fcst = results.predict(start= len(train_y), end= len(train_y)+len(test_y), exog = test_X).to_frame()
fcst2 = sc_out.inverse_transform(fcst)
#storing the predictions in a dataframe
link_predictions = pd.DataFrame(fcst2, index = fcst.index, columns = ['price'])

<hr>
<a class="anchor" id="MATIC-USD">
    
## MATIC-USD
    
</a>

In [43]:
#creating a df to predict the crypto currency
dfmatic = df['MATIC-USD'].copy()

In [44]:
# Creating a new feature for better representing day-wise values
dfmatic['mean'] = (dfmatic['low'] + dfmatic['high'])/2

In [45]:
# Cleaning the data for any NaN or Null fields
dfmatic = dfmatic.dropna()

In [46]:
# Creating a copy for applying shift
dataset_for_prediction = dfmatic.copy()
dataset_for_prediction['Actual']=dataset_for_prediction['close'].shift()
dataset_for_prediction=dataset_for_prediction.dropna()

In [47]:
# date time typecast
dataset_for_prediction['Date'] =pd.to_datetime(dataset_for_prediction['Date'])
dataset_for_prediction.index= dataset_for_prediction['Date']

In [48]:
# normalizing the exogeneous variables
sc_in = MinMaxScaler(feature_range=(0, 1))
scaled_input = sc_in.fit_transform(dataset_for_prediction[['volume']])  #['low', 'high', 'open', 'adj_close', 'volume', 'mean']
scaled_input = pd.DataFrame(scaled_input, index=dataset_for_prediction.index)
X=scaled_input
X.rename(columns={0:'Volume'}, inplace=True)

In [49]:
# normalizing the time series
sc_out = MinMaxScaler(feature_range=(0, 1))
scaler_output = sc_out.fit_transform(dataset_for_prediction[['Actual']])
scaler_output =pd.DataFrame(scaler_output, index=dataset_for_prediction.index)
y=scaler_output
y.rename(columns={0:'Observed Data'}, inplace= True)
y.index=dataset_for_prediction.index

In [50]:
# train-test split (cannot shuffle in case of time series)
train_X, train_y = X[:-7].dropna(), y[:-7].dropna()
test_X, test_y = X[-9:].dropna(), y[-8:].dropna()

In [51]:
# Init the best SARIMAX model
model = SARIMAX(
    train_y,
    exog=train_X,
    order=(2,1,2),
    seasonal_order =(2, 1, 0, 6)
)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [52]:
# training the model
results = model.fit()



In [53]:
# get predictions
predictions = results.predict(start= len(train_y), end= len(train_y)+len(test_y), exog = test_X)

In [54]:
#forecast
fcst = results.predict(start= len(train_y), end= len(train_y)+len(test_y), exog = test_X).to_frame()
fcst2 = sc_out.inverse_transform(fcst)
#storing the predictions in a dataframe
matic_predictions = pd.DataFrame(fcst2, index = fcst.index, columns = ['price'])

<hr>
<a class="anchor" id="SOL-USD">
    
## SOL-USD
    
</a>

In [55]:
#creating a df to predict the crypto currency
dfsol = df['SOL-USD'].copy()

In [56]:
# Creating a new feature for better representing day-wise values
dfsol['mean'] = (dfsol['low'] + dfsol['high'])/2

In [57]:
# Cleaning the data for any NaN or Null fields
dfsol = dfsol.dropna()

In [58]:
# Creating a copy for applying shift
dataset_for_prediction = dfsol.copy()
dataset_for_prediction['Actual']=dataset_for_prediction['close'].shift()
dataset_for_prediction=dataset_for_prediction.dropna()

In [59]:
# date time typecast
dataset_for_prediction['Date'] =pd.to_datetime(dataset_for_prediction['Date'])
dataset_for_prediction.index= dataset_for_prediction['Date']

In [60]:
# normalizing the exogeneous variables
sc_in = MinMaxScaler(feature_range=(0, 1))
scaled_input = sc_in.fit_transform(dataset_for_prediction[['volume']])  #['low', 'high', 'open', 'adj_close', 'volume', 'mean']
scaled_input = pd.DataFrame(scaled_input, index=dataset_for_prediction.index)
X=scaled_input
X.rename(columns={0:'Volume'}, inplace=True)

In [61]:
# normalizing the time series
sc_out = MinMaxScaler(feature_range=(0, 1))
scaler_output = sc_out.fit_transform(dataset_for_prediction[['Actual']])
scaler_output =pd.DataFrame(scaler_output, index=dataset_for_prediction.index)
y=scaler_output
y.rename(columns={0:'Observed Data'}, inplace= True)
y.index=dataset_for_prediction.index

In [62]:
# train-test split (cannot shuffle in case of time series)
train_X, train_y = X[:-7].dropna(), y[:-7].dropna()
test_X, test_y = X[-9:].dropna(), y[-8:].dropna()

In [63]:
# Init the best SARIMAX model
model = SARIMAX(
    train_y,
    exog=train_X,
    order=(0,1,0),
    seasonal_order =(2, 1, 0, 6)
)

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [64]:
# training the model
results = model.fit()

In [65]:
# get predictions
predictions = results.predict(start= len(train_y), end= len(train_y)+len(test_y), exog = test_X)

In [66]:
#forecast
fcst = results.predict(start= len(train_y), end= len(train_y)+len(test_y), exog = test_X).to_frame()
fcst2 = sc_out.inverse_transform(fcst)
#storing the predictions in a dataframe
sol_predictions = pd.DataFrame(fcst2, index = fcst.index, columns = ['price'])

<hr>
<a class="anchor" id="ts_predictions">
    
##  Time Series Predictions Summary
    
</a>

In [67]:
# #creating dataframes for each currency to summarize the predictions

btc_predictions.index.name = 'Date'
btc = btc_predictions.rename(columns={'price': 'BTC-USD'}).reset_index()

eth_predictions.index.name = 'Date'
eth = eth_predictions.rename(columns={'price': 'ETH-USD'}).reset_index()

link_predictions.index.name = 'Date'
link = link_predictions.rename(columns={'price': 'LINK-USD'}).reset_index()

matic_predictions.index.name = 'Date'
matic = matic_predictions.rename(columns={'price': 'MATIC-USD'}).reset_index()

sol_predictions.index.name = 'Date'
sol = sol_predictions.rename(columns={'price': 'SOL-USD'}).reset_index()

In [68]:
temp_a = pd.merge(btc, link, left_on='Date', right_on='Date', how='inner')
temp_b = pd.merge(temp_a, matic, left_on='Date', right_on='Date', how='inner')
temp_c = pd.merge(temp_b, eth, left_on='Date', right_on='Date', how='inner')
final_predictions = pd.merge(temp_c, sol, left_on='Date', right_on='Date', how='inner') 

In [69]:
#creating the dataframe to store the predictions for D+1 and D+2 of each currency
df_pred_final = final_predictions[-2:].copy()

#creating the dataframe to store the predictions for the validation data
df_val_final = final_predictions[:-2].copy()

<hr>
<a class="anchor" id="ml">
    
# 5.0 Machine Learning
    
</a>

<hr>
<a class="anchor" id="data_prep_ml">
    
# 6.0 Data Preparation and Preprocessing
    
</a>

In [70]:
#importing data from yahoo finance lib
data = yf.download(cryptocurrencies, period = '495d', interval = '1d')

[*********************100%***********************]  10 of 10 completed


In [71]:
#storing each indicator in separately dataframe
df_open = data['Open'].reset_index()

df_close = data['Close'].reset_index()

df_adj_close = data['Adj Close'].reset_index()

df_high = data['High'].reset_index()

df_low = data['Low'].reset_index()

df_volume = data['Volume'].reset_index()

In [72]:
df = {}

for currency in list_of_currencys:
    
    df[currency] = pd.DataFrame()

    #retrieving open price
    df1 = df_open[['Date',currency]].copy()
    #filtering only non-null records
    df1 = df1[~df1[currency].isnull()].copy()
    #renaming column ETH-USD to open, which means the Open price for the currency
    df1.rename(columns={currency: "open"}, inplace=True)

    #retrieving close price
    df2 = df_close[['Date',currency]]
    #filtering only non-null records
    df2 = df2[~df2[currency].isnull()].copy()
    #renaming column ETH-USD to close, which means the Open price for the currency
    df2.rename(columns={currency: "close"}, inplace=True)

    #retrieving adj_close price
    df3 = df_adj_close[['Date',currency]]
    #filtering only non-null records
    df3 = df3[~df3[currency].isnull()].copy()
    #renaming column ETH-USD to adj_close, which means the adj_close price for the currency
    df3.rename(columns={currency: "adj_close"}, inplace=True)

    #retrieving highest price
    df4 = df_high[['Date',currency]]
    #filtering only non-null records
    df4 = df4[~df4[currency].isnull()].copy()
    #renaming column ETH-USD to high, which means the highest price for the currency
    df4.rename(columns={currency: "high"}, inplace=True)

    #retrieving lowest price
    df5 = df_low[['Date',currency]]
    #filtering only non-null records
    df5 = df5[~df5[currency].isnull()].copy()
    #renaming column ETH-USD to df5, which means the lowest price for the currency
    df5.rename(columns={currency: "low"}, inplace=True)

    #retrieving Volume
    df6 = df_volume[['Date',currency]]
    #filtering only non-null records
    df6 = df6[~df6[currency].isnull()].copy()
    #renaming column ETH-USD to Volume, which means the Volume for the currency
    df6.rename(columns={currency: "volume"}, inplace=True)
    
    name=str(currency)

    #merging dataframes into a single dataframe
    temp_2 = pd.merge(df1, df2, left_on='Date', right_on='Date', how='left')
    temp_3 = pd.merge(temp_2, df3, left_on='Date', right_on='Date', how='left')
    temp_4 = pd.merge(temp_3, df4, left_on='Date', right_on='Date', how='left')
    temp_5 = pd.merge(temp_4, df5, left_on='Date', right_on='Date', how='left')
    temp_6 = pd.merge(temp_5, df6, left_on='Date', right_on='Date', how='left')    
    df[currency] = temp_6.copy()
    df[currency]['Date'] = pd.to_datetime(df[currency]['Date'])
    df[currency]['volume'] = df[currency]['volume'].astype('Int64')
    
    #Adding Three new rows to the dataset
    df[currency] = df[currency].append(
        pd.DataFrame({'Date': pd.date_range(start=df[currency].Date.iloc[-1], periods=4, freq='D', closed='right')}))
    df[currency].reset_index(inplace=True,drop=True)

    #Feature Engineering
    df[currency]['year'] = pd.DatetimeIndex(df[currency]['Date']).year
    df[currency]['quarter'] = pd.DatetimeIndex(df[currency]['Date']).quarter
    df[currency]['month'] = pd.DatetimeIndex(df[currency]['Date']).month
    df[currency]['week_number_year'] = pd.DatetimeIndex(df[currency]['Date']).week
    df[currency]['day_of_the_week'] = pd.DatetimeIndex(df[currency]['Date']).weekday
    df[currency]['day'] = pd.DatetimeIndex(df[currency]['Date']).day 

#### Creating the Target

In [73]:
for currency in list_of_currencys:    
    df[currency]['target_close'] = df[currency]['close']

#### Shifting the Data

In [74]:
for currency in list_of_currencys:
    df_temp_shift = df[currency][['open', 'close', 'adj_close', 'high', 'low', 'volume']].shift(+3)
    df_temp_target = df[currency][['Date','year','quarter','month','week_number_year','day_of_the_week','day','target_close']]
    df_temp_final = pd.concat([df_temp_shift,df_temp_target], axis = 1)
    #df[currency] = df_temp_final
    df[currency] = df_temp_final.iloc[3:, :]
    df[currency].reset_index(inplace=True,drop=True)

#### Creating new features

In [75]:
# Preserving the original datasets
df_original = {}
for currency in list_of_currencys:    
    df_original[currency] = df[currency].copy()

In [76]:
# Adding Technical Analysis Features
for currency in list_of_currencys:    
    ta.add_all_ta_features(df[currency], "open", "high", "low", "close", "volume", fillna=False)

In [77]:
# Dropping the features trend_psar_up and trend_psar_down due to the quantity of NaN values
for currency in list_of_currencys:    
    df[currency].drop(['trend_psar_up','trend_psar_down'], axis = 1, inplace=True)

In [78]:
# Changing the types of volume_adi and volume_obv from object to float64
for currency in list_of_currencys:    
    df[currency] = df[currency].astype({'volume_adi':'float64','volume_obv':'float64'})

In [79]:
# Drop the missing NaN values from the new features

list_columns_to_drop_NaN = df['ADA-USD'].columns.tolist()
list_columns_to_drop_NaN.remove('target_close')

for currency in list_of_currencys:    
    df[currency].dropna(axis=0, how='any', subset=list_columns_to_drop_NaN, inplace=True)

In [80]:
# Reseting indexes and Checking new features
for currency in list_of_currencys:
    df[currency].reset_index(inplace=True,drop=True)

#### Feature Selection

In [81]:
# Creating copies of the datasets for the purpose of feature selection
df_FS = {}
for currency in list_of_currencys:    
    df_FS[currency] = df[currency].copy()

In [82]:
# Removing Missing Values in the target (last three rows - new dates to be predicted)
for currency in list_of_currencys:    
    df_FS[currency].dropna(axis=0, inplace=True)

In [83]:
#calculates the date interval - size of the series
date_interval = (df_FS['BTC-USD']['Date'].max() - df_FS['BTC-USD']['Date'].min()) / np.timedelta64(1,'D')

#measuring the split size
split_size = date_interval*0.7

In [84]:
#calculates the split date
split_date = df_FS['BTC-USD']['Date'].min() + datetime.timedelta(days=split_size)
split_date = split_date.strftime('%Y-%m-%d')

#calculates the initial date
init_date = df_FS['SOL-USD']['Date'].min().strftime('%Y-%m-%d')

In [85]:
# Feature Selection using RFE and LinearRegression as the model

#Dataset splitting parameters
initial_Date = init_date #the minimum date in common to all the criptocurrencies
splitting_Date = split_date #70% for training and 30% for validation

selected_features = {}

for currency in list_of_currencys:

    # Splitting the dataset
    X = df_FS[currency].drop(['Date','target_close'], axis = 1)
    y = df_FS[currency]['target_close']
    initial_index = df_FS[currency].index[df_FS[currency]['Date'] == initial_Date].tolist()
    splitt_index = df_FS[currency].index[df_FS[currency]['Date'] == splitting_Date].tolist()    
    X_train = X[initial_index[0]:splitt_index[0]]
    X_val = X[splitt_index[0]:-3]
    y_train = y[initial_index[0]:splitt_index[0]]
    y_val = y[splitt_index[0]:-3]
    
    # Scalling the data using MinMaxScaler to scale between 1 and 0
    scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_train_scaled = pd.DataFrame(X_train_scaled, columns = X_train.columns).set_index(X_train.index)
    X_val_scaled = pd.DataFrame(X_val_scaled, columns = X_val.columns).set_index(X_val.index)
      
    # Checking the ideal number of features
    nof_list=np.arange(1,len(X_train_scaled.columns))
    high_score=0
    nof=0           
    score_list =[]
    for n in range(len(nof_list)):
        model = LinearRegression()
        rfe = RFE(model,nof_list[n])
        X_train_rfe = rfe.fit_transform(X_train_scaled,y_train)
        X_val_rfe = rfe.transform(X_val_scaled)
        model.fit(X_train_rfe,y_train)

        score = model.score(X_val_rfe,y_val)
        score_list.append(score)

        if(score>high_score):
            high_score = score
            nof = nof_list[n]
    print(currency,' | ',"Optimum number of features: %d" %nof)
    print(currency,' | ',"Score with %d features: %f" % (nof, high_score),'\n')
    
    # Recursive Feature Elimination (RFE)
    model = LinearRegression()
    rfe = RFE(estimator = model, n_features_to_select = nof)
    X_rfe = rfe.fit_transform(X = X_train_scaled, y = y_train)
    temp_series = pd.Series(rfe.support_, index = X_train_scaled.columns)
    selected_features[currency] = temp_series[temp_series].index.tolist()

ADA-USD  |  Optimum number of features: 1
ADA-USD  |  Score with 1 features: 0.422873 

ATOM-USD  |  Optimum number of features: 3
ATOM-USD  |  Score with 3 features: 0.791099 

AVAX-USD  |  Optimum number of features: 10
AVAX-USD  |  Score with 10 features: 0.813130 

AXS-USD  |  Optimum number of features: 7
AXS-USD  |  Score with 7 features: 0.824699 

BTC-USD  |  Optimum number of features: 9
BTC-USD  |  Score with 9 features: 0.711438 

ETH-USD  |  Optimum number of features: 1
ETH-USD  |  Score with 1 features: 0.774560 

LINK-USD  |  Optimum number of features: 5
LINK-USD  |  Score with 5 features: 0.195152 

LUNA1-USD  |  Optimum number of features: 1
LUNA1-USD  |  Score with 1 features: 0.857307 

MATIC-USD  |  Optimum number of features: 3
MATIC-USD  |  Score with 3 features: 0.706565 

SOL-USD  |  Optimum number of features: 3
SOL-USD  |  Score with 3 features: 0.680714 



<hr>
<a class="anchor" id="ml_modeling">
    
# 7.0 Machine Learning Model and Assessment
    
</a>

<hr>
<a class="anchor" id="linear">
    
## Linear Regression
    
</a>

In [86]:
df_pred = {}
summary_LR = pd.DataFrame(index=['New_pred_1','New_pred_2'])

#Dataset splitting parameters
initial_Date = init_date
splitting_Date= split_date

for currency in list_of_currencys:
    
    # List of features to be used
    feat_list = []
    feat_list = selected_features[currency]

    # Splitting the dataset
    X = df[currency][feat_list]
    y = df[currency]['target_close']
    initial_index = df[currency].index[df[currency]['Date'] == initial_Date].tolist()
    splitt_index = df[currency].index[df[currency]['Date'] == splitting_Date].tolist()    
    X_train = X[initial_index[0]:splitt_index[0]]
    X_val = X[splitt_index[0]:-3]
    y_train = y[initial_index[0]:splitt_index[0]]
    y_val = y[splitt_index[0]:-3]
    X_new_pred = X[-3:]
    
#     # Splitting the dataset - classic approach
#     X = df[currency].drop(['Date','target_close'], axis = 1)
#     y = df[currency]['target_close']
#     X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.1, random_state=0, stratify=None, shuffle=False)

    #Training the model and running the predictions
    model = LinearRegression()
    model.fit(X_train,y_train)
    model_pred = model.predict(X_val)
    model_pred_df = pd.DataFrame(data=model_pred, index=X_val.index, columns=['pred'])
    
    #Running predictions for the training dataset for overfitting checking purpose
    model_pred_train = model.predict(X_train)
    model_pred_train_df = pd.DataFrame(data=model_pred_train, index=X_train.index, columns=['pred'])
    model_pred_train_df = pd.concat([df[currency],model_pred_train_df], axis = 1)
    
    # Predicting the new data
    model_new_pred = model.predict(X_new_pred)
    model_new_pred_df = pd.DataFrame(data=model_new_pred, index=X_new_pred.index, columns=['pred'])
    model_new_pred_df = pd.concat([df[currency],model_new_pred_df], axis = 1)
    
    
    #RMSE and R-squared
    rmse_val = np.sqrt(MSE(y_val,model_pred))
    mape_val = MAPE(y_val,model_pred)
    rmse_train = np.sqrt(MSE(y_train,model_pred_train))
    mape_train = MAPE(y_train,model_pred_train)

    #Concatenating the predictions to the original Dataset
    df_pred[currency] = pd.concat([df[currency],model_pred_df], axis = 1)

    
    summary_LR[currency] = [round(model_new_pred_df[-3:-2]['pred'].tolist()[0],2),
                            round(model_new_pred_df[-2:-1]['pred'].tolist()[0],2)]

In [87]:
#calling the predictions from Linear Regression
lr_predictions = summary_LR[['AVAX-USD','LUNA1-USD']].reset_index(drop=True)

#calling the predictions from ARIMA
df_arima = df_pred_final.reset_index(drop=True)

#storing the predictions in result_a
result_a = pd.concat([df_arima, lr_predictions], axis=1)

#storing the validation predictions for the chosen currencies
lr_avax = df_pred['AVAX-USD'][['Date','target_close']][-10:-3]
lr_avax = lr_avax.rename(columns={'target_close': 'AVAX-USD'})      
lr_luna = df_pred['LUNA1-USD'][['Date','target_close']][-10:-3]
lr_luna = lr_luna.rename(columns={'target_close': 'LUNA1-USD'}) 

In [88]:

#storing the validation predictions for the chosen currencies
temp_val_a = pd.merge(df_val_final, lr_avax, left_on='Date', right_on='Date', how='inner')
temp_val_b = pd.merge(temp_val_a, lr_luna, left_on='Date', right_on='Date', how='inner')

<hr>
<a class="anchor" id="rand_forest">
    
## Random Forest
    
</a>

In [89]:
df_pred = {}
summary_RF = pd.DataFrame(index=['New_pred_1','New_pred_2'])

#Dataset splitting parameters
initial_Date = init_date
splitting_Date= split_date

for currency in list_of_currencys:
    
    # List of features to be used
    feat_list = []
    feat_list = selected_features[currency]

    # Splitting the dataset
    X = df[currency][feat_list]
    y = df[currency]['target_close']
    initial_index = df[currency].index[df[currency]['Date'] == initial_Date].tolist()
    splitt_index = df[currency].index[df[currency]['Date'] == splitting_Date].tolist()    
    X_train = X[initial_index[0]:splitt_index[0]]
    X_val = X[splitt_index[0]:-3]
    y_train = y[initial_index[0]:splitt_index[0]]
    y_val = y[splitt_index[0]:-3]
    X_new_pred = X[-3:]
    
#     # Splitting the dataset - classic approach
#     X = df[currency].drop(['Date','target_close'], axis = 1)
#     y = df[currency]['target_close']
#     X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.1, random_state=0, stratify=None, shuffle=False)

    #Training the model and running the predictions
    model = RandomForestRegressor()
    model.fit(X_train,y_train)
    model_pred = model.predict(X_val)
    model_pred_df = pd.DataFrame(data=model_pred, index=X_val.index, columns=['pred'])
    
    #Running predictions for the training dataset for overfitting checking purpose
    model_pred_train = model.predict(X_train)
    model_pred_train_df = pd.DataFrame(data=model_pred_train, index=X_train.index, columns=['pred'])
    model_pred_train_df = pd.concat([df[currency],model_pred_train_df], axis = 1)
    
    # Predicting the new data
    model_new_pred = model.predict(X_new_pred)
    model_new_pred_df = pd.DataFrame(data=model_new_pred, index=X_new_pred.index, columns=['pred'])
    model_new_pred_df = pd.concat([df[currency],model_new_pred_df], axis = 1)
    
    
    #RMSE and R-squared
    rmse_val = np.sqrt(MSE(y_val,model_pred))
    mape_val = MAPE(y_val,model_pred)
    rmse_train = np.sqrt(MSE(y_train,model_pred_train))
    mape_train = MAPE(y_train,model_pred_train)

    #Concatenating the predictions to the original Dataset
    df_pred[currency] = pd.concat([df[currency],model_pred_df], axis = 1)

    summary_RF[currency] = [round(model_new_pred_df[-3:-2]['pred'].tolist()[0],2),
                            round(model_new_pred_df[-2:-1]['pred'].tolist()[0],2)]

In [90]:
#calling the predictions from Random Forest
rf_predictions = summary_RF['ADA-USD'].reset_index(drop=True)

#storing the predictions in result
result_b = pd.concat([result_a, rf_predictions], axis=1)

#storing the validation predictions for the chosen currencies
rf_ada = df_pred['ADA-USD'][['Date','target_close']][-10:-3]
rf_ada = rf_ada.rename(columns={'target_close': 'ADA-USD'}) 

<hr>
<a class="anchor" id="svm">
    
## Support Vector Machines
    
</a>

In [91]:
df_pred = {}
summary_SVM = pd.DataFrame(index=['New_pred_1','New_pred_2'])

#Dataset splitting parameters
initial_Date = init_date
splitting_Date= split_date

for currency in list_of_currencys:
    
    # List of features to be used
    feat_list = []
    feat_list = selected_features[currency]

    # Splitting the dataset
    X = df[currency][feat_list]
    y = df[currency]['target_close']
    initial_index = df[currency].index[df[currency]['Date'] == initial_Date].tolist()
    splitt_index = df[currency].index[df[currency]['Date'] == splitting_Date].tolist()    
    X_train = X[initial_index[0]:splitt_index[0]]
    X_val = X[splitt_index[0]:-3]
    y_train = y[initial_index[0]:splitt_index[0]]
    y_val = y[splitt_index[0]:-3]
    X_new_pred = X[-3:]
    
#     # Splitting the dataset - classic approach
#     X = df[currency].drop(['Date','target_close'], axis = 1)
#     y = df[currency]['target_close']
#     X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.1, random_state=0, stratify=None, shuffle=False)

    #Training the model and running the predictions
    model = svm.SVR()
    model.fit(X_train,y_train)
    model_pred = model.predict(X_val)
    model_pred_df = pd.DataFrame(data=model_pred, index=X_val.index, columns=['pred'])
    
    #Running predictions for the training dataset for overfitting checking purpose
    model_pred_train = model.predict(X_train)
    model_pred_train_df = pd.DataFrame(data=model_pred_train, index=X_train.index, columns=['pred'])
    model_pred_train_df = pd.concat([df[currency],model_pred_train_df], axis = 1)
    
    # Predicting the new data
    model_new_pred = model.predict(X_new_pred)
    model_new_pred_df = pd.DataFrame(data=model_new_pred, index=X_new_pred.index, columns=['pred'])
    model_new_pred_df = pd.concat([df[currency],model_new_pred_df], axis = 1)
    
    
    #RMSE and R-squared
    rmse_val = np.sqrt(MSE(y_val,model_pred))
    mape_val = MAPE(y_val,model_pred)
    rmse_train = np.sqrt(MSE(y_train,model_pred_train))
    mape_train = MAPE(y_train,model_pred_train)

    #Concatenating the predictions to the original Dataset
    df_pred[currency] = pd.concat([df[currency],model_pred_df], axis = 1)

    summary_SVM[currency] = [round(model_new_pred_df[-3:-2]['pred'].tolist()[0],2),
                            round(model_new_pred_df[-2:-1]['pred'].tolist()[0],2)]

In [92]:
#calling the predictions from Random Forest
svm_predictions = summary_SVM['ATOM-USD'].reset_index(drop=True)

#storing the predictions in result
result_c = pd.concat([result_b, svm_predictions], axis=1)

#storing the validation predictions for the chosen currencies
svm_atom = df_pred['ATOM-USD'][['Date','target_close']][-10:-3]
svm_atom = svm_atom.rename(columns={'target_close': 'ATOM-USD'})  

<hr>
<a class="anchor" id="neural">
    
## Neural Network Regressor
    
</a>

In [93]:
df_pred = {}
summary_NNR = pd.DataFrame(index=['New_pred_1','New_pred_2'])

#Dataset splitting parameters
initial_Date = init_date
splitting_Date= split_date

for currency in list_of_currencys:
    
    # List of features to be used
    feat_list = []
    feat_list = selected_features[currency]

    # Splitting the dataset
    X = df[currency][feat_list]
    y = df[currency]['target_close']
    initial_index = df[currency].index[df[currency]['Date'] == initial_Date].tolist()
    splitt_index = df[currency].index[df[currency]['Date'] == splitting_Date].tolist()    
    X_train = X[initial_index[0]:splitt_index[0]]
    X_val = X[splitt_index[0]:-3]
    y_train = y[initial_index[0]:splitt_index[0]]
    y_val = y[splitt_index[0]:-3]
    X_new_pred = X[-3:]
    
#     # Splitting the dataset - classic approach
#     X = df[currency].drop(['Date','target_close'], axis = 1)
#     y = df[currency]['target_close']
#     X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.1, random_state=0, stratify=None, shuffle=False)

    #Training the model and running the predictions
    model = MLPRegressor(hidden_layer_sizes=(100,100,100,100,100,100),max_iter=100,learning_rate_init=0.0001,learning_rate='adaptive')
    model.fit(X_train,y_train)
    model_pred = model.predict(X_val)
    model_pred_df = pd.DataFrame(data=model_pred, index=X_val.index, columns=['pred'])
    
    #Running predictions for the training dataset for overfitting checking purpose
    model_pred_train = model.predict(X_train)
    model_pred_train_df = pd.DataFrame(data=model_pred_train, index=X_train.index, columns=['pred'])
    model_pred_train_df = pd.concat([df[currency],model_pred_train_df], axis = 1)
    
    # Predicting the new data
    model_new_pred = model.predict(X_new_pred)
    model_new_pred_df = pd.DataFrame(data=model_new_pred, index=X_new_pred.index, columns=['pred'])
    model_new_pred_df = pd.concat([df[currency],model_new_pred_df], axis = 1)
    
    
    #RMSE and R-squared
    rmse_val = np.sqrt(MSE(y_val,model_pred))
    mape_val = MAPE(y_val,model_pred)
    rmse_train = np.sqrt(MSE(y_train,model_pred_train))
    mape_train = MAPE(y_train,model_pred_train)

    #Concatenating the predictions to the original Dataset
    df_pred[currency] = pd.concat([df[currency],model_pred_df], axis = 1)

    summary_NNR[currency] = [round(model_new_pred_df[-3:-2]['pred'].tolist()[0],2),
                            round(model_new_pred_df[-2:-1]['pred'].tolist()[0],2)]

In [94]:
#calling the predictions from Random Forest
nnr_predictions = summary_NNR['AXS-USD'].reset_index(drop=True)

#storing the predictions for D+1 and D+2 in the dataframe df_pred_dashboard
df_pred_dashboard = pd.concat([result_c, nnr_predictions], axis=1)

#storing the validation predictions for the chosen currencies
nnr_axs = df_pred['AXS-USD'][['Date','target_close']][-10:-3]
nnr_axs = nnr_axs.rename(columns={'target_close': 'AXS-USD'})

In [95]:
#storing the validation predictions for the chosen currencies
temp_val_a = pd.merge(df_val_final, lr_avax, left_on='Date', right_on='Date', how='inner')
temp_val_b = pd.merge(temp_val_a, lr_luna, left_on='Date', right_on='Date', how='inner')
temp_val_c = pd.merge(temp_val_b, rf_ada, left_on='Date', right_on='Date', how='inner')
temp_val_d = pd.merge(temp_val_c, svm_atom, left_on='Date', right_on='Date', how='inner')
df_val_dashboard = pd.merge(temp_val_d, nnr_axs, left_on='Date', right_on='Date', how='inner')

In [96]:
#final validation dataset to be used on Dashboard
df_val_dashboard

Unnamed: 0,Date,BTC-USD,LINK-USD,MATIC-USD,ETH-USD,SOL-USD,AVAX-USD,LUNA1-USD,ADA-USD,ATOM-USD,AXS-USD
0,2022-05-23,29788.937088,7.20255,0.684968,2010.718204,51.183741,29.026232,0.00016,0.512605,11.132798,20.923748
1,2022-05-24,28277.592449,6.425694,0.58355,1888.179571,46.129848,29.025648,0.000177,0.521224,11.082132,21.247337
2,2022-05-25,27738.028119,6.074141,0.52024,1777.391835,41.076308,27.225376,0.000182,0.513877,10.739818,20.898247
3,2022-05-26,28126.591006,6.205845,0.545342,1810.634183,42.06737,23.55003,0.000139,0.480883,9.643618,19.561852
4,2022-05-27,27493.34938,6.000148,0.533776,1764.054651,40.43035,22.418623,0.000119,0.455507,9.252514,18.198242
5,2022-05-28,26706.93135,5.65226,0.510146,1710.048201,38.30441,22.749695,0.000121,0.455911,9.414568,18.242544
6,2022-05-29,26604.632945,5.634501,0.522695,1716.712163,38.648659,26.166412,0.000105,0.480782,9.616521,18.423071


In [97]:
#final predictions dataset to be used on Dashboard
df_pred_dashboard

Unnamed: 0,Date,BTC-USD,LINK-USD,MATIC-USD,ETH-USD,SOL-USD,AVAX-USD,LUNA1-USD,ADA-USD,ATOM-USD,AXS-USD
0,2022-05-30,25541.601371,4.911794,0.438159,1604.93193,31.542039,21.34,0.92,1.2,11.97,17.89
1,2022-05-31,24745.639369,4.38216,0.364323,1471.716503,25.628087,21.42,0.92,1.2,11.89,17.93


<div class="alert alert-block alert-success">
    
__FIM__ <br>
     
    
</div>