In [36]:
import pandas as pd
import numpy as np
import datetime as dt
import yfinance as yf

from functions import grangerTests, plot_seasonal_decompose, daySignal, addDateParts

In [37]:
#define the ticker symbol
tickerSymbol = 'ETH-USD' #'MSFT'

# Date period
start = dt.datetime(2010,1,1)
end = dt.datetime.now()

#get data on this ticker
tickerData = yf.Ticker(tickerSymbol)
#get the historical prices for this ticker
df = tickerData.history(period='1D', start=start, end=end)

# Removing features with constant values
df = df.loc[:,df.apply(pd.Series.nunique) != 1]

# Some calculated variables
df['day_change'] = df['Close'] - df['Open']
df['day_change_pct'] = (df['day_change'] / df['Open']) * 100
df['day_change_signal'] = df['day_change'].apply(daySignal)

# Lagging close, high and low by one day
for feature in ['Close', 'High', 'Low']:
    df[f"{feature}_lag1"] = df[feature].shift(1).bfill() 

# Adding date features
df = addDateParts(df)

# Exit datetime index
df = df.reset_index()

# View dataframe
df

  df['week'] = df.index.week


Unnamed: 0,Date,Open,High,Low,Close,Volume,day_change,day_change_pct,day_change_signal,Close_lag1,High_lag1,Low_lag1,day_of_year,day_of_week,week,month
0,2017-11-09 00:00:00+00:00,308.644989,329.451996,307.056000,320.884003,893249984,12.239014,3.965402,1,320.884003,329.451996,307.056000,9,3,45,11
1,2017-11-10 00:00:00+00:00,320.670990,324.717987,294.541992,299.252991,885985984,-21.417999,-6.679120,-1,320.884003,329.451996,307.056000,10,4,45,11
2,2017-11-11 00:00:00+00:00,298.585999,319.453003,298.191986,314.681000,842300992,16.095001,5.390407,1,299.252991,324.717987,294.541992,11,5,45,11
3,2017-11-12 00:00:00+00:00,314.690002,319.153015,298.513000,307.907990,1613479936,-6.782013,-2.155141,-1,314.681000,319.453003,298.191986,12,6,45,11
4,2017-11-13 00:00:00+00:00,307.024994,328.415009,307.024994,316.716003,1041889984,9.691010,3.156424,1,307.907990,319.153015,298.513000,13,0,46,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1902,2023-01-24 00:00:00+00:00,1627.848267,1639.723877,1551.389771,1556.604248,8180274691,-71.244019,-4.376576,-1,1628.251099,1641.218140,1607.903076,24,1,4,1
1903,2023-01-25 00:00:00+00:00,1556.807495,1632.241699,1530.797852,1611.711060,10598973448,54.903564,3.526677,1,1556.604248,1639.723877,1551.389771,25,2,4,1
1904,2023-01-26 00:00:00+00:00,1611.080933,1626.198242,1586.598145,1603.105957,8395315241,-7.974976,-0.495008,-1,1611.711060,1632.241699,1530.797852,26,3,4,1
1905,2023-01-27 00:00:00+00:00,1603.080078,1617.000854,1565.244995,1598.156494,8124465373,-4.923584,-0.307133,-1,1603.105957,1626.198242,1586.598145,27,4,4,1


In [38]:
X = df[['Date', 'Open', 'Close_lag1', 'day_of_year', 'day_of_week', 'week', 'month']]
y = df[['Date', 'day_change_signal']]

In [39]:
split = X['Date'].max()-dt.timedelta(days=1*365)

X_train = X[X['Date'] < split].drop(columns=('Date'))
y_train = y[y['Date'] < split].drop(columns=('Date'))
X_test = X[X['Date'] >= split].drop(columns=('Date'))
y_test = y[y['Date'] >= split].drop(columns=('Date'))

In [40]:
print(f"The shape of the training data is {X_train.shape} features and {y_train.shape}")
print("--------------------------")
print(f"The shape of the testing data is {X_test.shape} features and {y_test.shape}")

The shape of the training data is (1541, 6) features and (1541, 1)
--------------------------
The shape of the testing data is (366, 6) features and (366, 1)


In [41]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
y_train_enc = pd.Series(le.fit_transform(y_train))
y_test_enc = pd.Series(le.transform(y_test))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [42]:
y_train_enc.value_counts(normalize=True)

1    0.515899
0    0.484101
dtype: float64

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import r2_score, accuracy_score, classification_report, roc_auc_score, recall_score, precision_score, f1_score, cohen_kappa_score

In [44]:
perf_dict = dict()
model_dict = {
    'logr_clf':LogisticRegression(random_state=0),
    'svc_clf':SVC(gamma='auto'), 
}

In [45]:
def logPerf(y_pred, y_proba, y_test):

    # Log performance metrics
    perf_dict = {
        'accuracy': accuracy_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_proba, multi_class='ovr'),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted'),
        'kappa': cohen_kappa_score(y_test, y_pred)
    }

    ret_dict = {
        'y_pred': y_pred,
        'y_proba':y_proba,
        'perf_results': perf_dict
    }

    return ret_dict

In [47]:
for clf in model_dict:
    print(clf)
    # Define classifer
    clf = model_dict[clf].fit(X_train, y_train_enc)
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)
    perf_dict[clf] = logPerf(y_pred, y_proba, X_test)

logr_clf


ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets

In [48]:
model_name = 'logr_clf'

# Define classifer
clf = LogisticRegression(random_state=0).fit(X_train, y_train_enc)
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)

# Log performance metrics
perf_dict[model_name] = {
    'accuracy': accuracy_score(y_test_enc, y_pred),
    'auc': roc_auc_score(y_test_enc, y_proba, multi_class='ovr'),
    'precision': precision_score(y_test_enc, y_pred, average='weighted'),
    'recall': recall_score(y_test_enc, y_pred, average='weighted'),
    'f1': f1_score(y_test_enc, y_pred, average='weighted'),
    'kappa': cohen_kappa_score(y_test_enc, y_pred)
}

ValueError: y should be a 1d array, got an array of shape (366, 2) instead.

In [49]:
pd.DataFrame(perf_dict).T