## Ethereum Price Classification (up/down)

##### By using the sliding window method, we can convert the time series data problem into a supervised learning.

In [1]:
# Import neccessary packages
import numpy as np 
import pandas as pd 
from pandas import DataFrame
from pandas import concat
from matplotlib import pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import log_loss, roc_auc_score, average_precision_score, classification_report

import warnings
warnings.filterwarnings("ignore")

##### Create some functions we are going to use:

In [2]:
def time_series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
        
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [4]:
def binary_classifier_metrics(y_test, y_pred):
    
    """
    Quick function to return metrics for binary classification models.
    
    Requires the imports of the functions below.
    """
    
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Precision score: ', precision_score(y_test, y_pred))
    print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred))
    print('F1 score: ', f1_score(y_test, y_pred))
    print('Recall score: ', recall_score(y_test, y_pred))

### 1. Data loading

In [5]:
df = pd.read_csv('data/eth_clean.csv')
df.drop(columns='Unnamed: 0', inplace = True)
df

Unnamed: 0,date,PriceUSD,AdrActCnt,AdrBal1in100MCnt,AdrBal1in10BCnt,AdrBal1in10MCnt,AdrBal1in1BCnt,AdrBal1in1MCnt,CapMrktCurUSD,CapRealUSD,...,FlowOutExUSD,GasUsedTx,GasUsedTxMean,HashRate,RevHashNtv,RevHashRateNtv,RevHashRateUSD,SplyAdrBalUSD1M,TxCnt,TxTfrValMedUSD
0,2015-08-08,1.199990,1208,9958,10267,9550,10115,8111,8.676871e+07,1.500465e+07,...,1.698517e+04,376006093,130512.354391,0.096483,3.360253,290325.822770,348388.084065,1.661840e+07,2881,1.199990
1,2015-08-09,1.199990,1113,10043,10411,9573,10222,8091,8.680133e+07,1.778419e+07,...,1.127113e+05,38863003,29242.289691,0.101360,3.105048,268276.120316,321928.661618,1.682678e+07,1329,15.599147
2,2015-08-10,1.199990,1430,10145,10572,9611,10348,8101,8.683471e+07,1.878138e+07,...,2.135630e+05,74070061,36362.327442,0.111855,2.881582,248968.649994,298759.890307,1.720648e+07,2037,0.718002
3,2015-08-11,0.990000,2697,10188,10706,9614,10429,8081,7.166698e+07,1.869114e+07,...,1.752126e+05,163481740,32940.104775,0.124450,2.607691,225304.507322,223051.462249,1.551874e+07,4963,0.053993
4,2015-08-12,1.288000,1219,10296,10893,9654,10574,8105,9.327472e+07,1.983690e+07,...,1.891297e+05,70102332,34431.400786,0.130915,2.422720,209322.978321,269607.996077,1.851254e+07,2036,12.880000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2291,2021-11-15,4569.407770,606036,1166692,18202185,241239,5573039,36595,5.367033e+11,2.735353e+11,...,1.029855e+10,97422476367,75542.615410,817.739849,0.000213,18.400399,84078.925802,1.025092e+08,1289636,456.940777
2292,2021-11-16,4234.131465,609879,1165639,18237665,240967,5580973,36564,4.973249e+11,2.675504e+11,...,1.678261e+09,96356492010,73545.984332,793.794054,0.000219,18.962616,80290.211052,1.022489e+08,1310153,423.413146
2293,2021-11-17,4265.599006,695412,1167736,18217312,241192,5589438,36563,5.010266e+11,2.678047e+11,...,1.282525e+09,98477697827,72772.919850,830.259240,0.000207,17.849640,76139.406528,1.022624e+08,1353219,319.823949
2294,2021-11-18,3985.674373,591159,1170963,18239379,241438,5599347,36605,4.681491e+11,2.625782e+11,...,1.424465e+09,98258045079,76510.060408,851.155962,0.000203,17.527776,69860.008277,1.019504e+08,1284250,398.991334


##### In order to use machine learning models, we need to reframe our multivariate time-series data into a supervised learning problem.

### 2. Data processing

In [6]:
def data_processing(df, window_length):
   
    """
    Create a function to process data with dataframe, window length arguments:
       - Drop the column date as we don't use it as a predictor
       - Use the defined function above to convert multivariate time-series data into supervised learning
       - y: new label (0,1)
       - Remove columns (t) we don't need    
    Return: X, y
    """
    
    data = df.copy()
    data.drop(columns = 'date', axis=1, inplace=True)
    
    # create a new feature named "Label": 1 means the price is up, 0 means the price is down or not changed
    data['Label'] = 0

    for i in range(len(data.index)):
        if i == 0:
            data['Label'][i] = 0 
        elif data['PriceUSD'][i] > data['PriceUSD'][i-1]:
            data['Label'][i] = 1 
        else:
            data['Label'][i] = 0 
        
    X = time_series_to_supervised(data, window_length,1)
    y = data['Label'][window_length:]

    X.drop(columns = ['var1(t)','var2(t)','var3(t)','var4(t)','var5(t)','var6(t)','var7(t)','var8(t)','var9(t)','var10(t)','var11(t)','var12(t)','var13(t)','var14(t)','var15(t)','var16(t)','var17(t)','var18(t)','var19(t)','var20(t)','var21(t)','var22(t)','var23(t)','var24(t)'], inplace=True)
    X.head()
    
    return X, y

##### Data processing

In [7]:
window_length = 7
X, y = data_processing(df, window_length)

In [8]:
print(X.shape)
print(y.shape)

(2289, 176)
(2289,)


##### Split data into train and test datasets with time-series order (shuffle=False)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

### 3. Train data with different machine learning models

In [16]:
def ml_model(X_train, y_train, X_test, y_test, model):
    
    """
    Define a pipeline, fit the model, predict price, print evaluation metrics  """
    
    # Define a pipeline
    pipeline = Pipeline(steps=[('scaling', StandardScaler()), ('classifier', model)])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    # Print evaluation metrics
    print('Evaluation metrics: ')
    binary_classifier_metrics(y_test, y_pred)

#### 3.1 Logistic Regression

In [17]:
ml_model(X_train, y_train, X_test, y_test, LogisticRegression())

Evaluation metrics: 
Accuracy:  0.9781659388646288
Precision score:  0.9946236559139785
Confusion Matrix: 
 [[302   2]
 [ 13 370]]
F1 score:  0.9801324503311258
Recall score:  0.9660574412532638


#### 3.2 LinearSVC

In [18]:
linearSVC_model = ml_model(X_train, y_train, X_test, y_test, LinearSVC())

Evaluation metrics: 
Accuracy:  1.0
Precision score:  1.0
Confusion Matrix: 
 [[304   0]
 [  0 383]]
F1 score:  1.0
Recall score:  1.0


#### 3.3 XGBoost

In [19]:
ml_model(X_train, y_train, X_test, y_test, XGBClassifier())

Evaluation metrics: 
Accuracy:  1.0
Precision score:  1.0
Confusion Matrix: 
 [[304   0]
 [  0 383]]
F1 score:  1.0
Recall score:  1.0


#### 3.4 GaussianNB

In [14]:
NB_model = ml_model(X_train, y_train, X_test, y_test, GaussianNB())

Evaluation metrics: 
Accuracy:  1.0
Precision score:  1.0
Confusion Matrix: 
 [[304   0]
 [  0 383]]
F1 score:  1.0
Recall score:  1.0


#### 3.5 RandomForestClassifier

In [15]:
RF_model = ml_model(X_train, y_train, X_test, y_test, RandomForestClassifier(max_features='sqrt'))

Evaluation metrics: 
Accuracy:  1.0
Precision score:  1.0
Confusion Matrix: 
 [[304   0]
 [  0 383]]
F1 score:  1.0
Recall score:  1.0


### 4 Conclusion

##### The accuary of models is 1.0 because our dataset is small. We need to have a bigger dataset to have a more reliable model.