In [336]:
import pandas as pd
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [337]:
def date_parser(x):
    return datetime.strptime(x, '%Y-%m-%d')

In [338]:
data = pd.read_csv('data/usdchf_daily_2024.csv')
data.dropna(inplace=True)

In [339]:
len(data)

195

In [340]:
data = data.sort_values(by='date').reset_index(drop=True)

In [341]:
data_lag_1 = data[['open', 'high', 'low', 'close']].shift(1).add_suffix('_lag_1')
data_lag_2 = data[['open', 'high', 'low', 'close']].shift(2).add_suffix('_lag_2')
data_lag_3 = data[['open', 'high', 'low', 'close']].shift(3).add_suffix('_lag_3')
data_lag_4 = data[['open', 'high', 'low', 'close']].shift(4).add_suffix('_lag_4')
data_lag_5 = data[['open', 'high', 'low', 'close']].shift(5).add_suffix('_lag_5')
data_lag_6 = data[['open', 'high', 'low', 'close']].shift(6).add_suffix('_lag_6')
data_lag_7 = data[['open', 'high', 'low', 'close']].shift(7).add_suffix('_lag_7')

In [342]:
data = pd.concat([data, data_lag_1, data_lag_2, data_lag_3, data_lag_4, data_lag_5, data_lag_6, data_lag_7], axis=1)

In [343]:
data['gain_on_day'] = (data['open'] > data['close']).apply(int)

In [344]:
data = data.dropna()

In [345]:
feature_columns = list(data.columns)
feature_columns.remove('date')
feature_columns.remove('gain_on_day')
feature_columns.remove('open')
feature_columns.remove('close')
feature_columns.remove('high')
feature_columns.remove('low')

In [346]:
x = data[feature_columns] # Features
y = data.gain_on_day # Target variable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, shuffle=False)

In [347]:
logreg = LogisticRegression(class_weight='balanced')
logreg.fit(x_train, y_train)
logreg_y_probabilities = logreg.predict_proba(x_test)
logreg_y_predictions = [int(x > .49) for x in list(logreg_y_probabilities[:, 1])]
print(metrics.classification_report(y_test, logreg_y_predictions))

              precision    recall  f1-score   support

           0       0.41      0.89      0.57        19
           1       0.67      0.14      0.24        28

    accuracy                           0.45        47
   macro avg       0.54      0.52      0.40        47
weighted avg       0.56      0.45      0.37        47



In [348]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
rf_y_predictions = rf.predict(x_test)
print(metrics.classification_report(y_test, rf_y_predictions))

              precision    recall  f1-score   support

           0       0.43      0.95      0.59        19
           1       0.80      0.14      0.24        28

    accuracy                           0.47        47
   macro avg       0.61      0.55      0.42        47
weighted avg       0.65      0.47      0.38        47



In [349]:
gb = GradientBoostingClassifier()
gb.fit(x_train, y_train)
gb_y_predictions = gb.predict(x_test)
print(metrics.classification_report(y_test, gb_y_predictions))

              precision    recall  f1-score   support

           0       0.42      0.84      0.56        19
           1       0.67      0.21      0.32        28

    accuracy                           0.47        47
   macro avg       0.54      0.53      0.44        47
weighted avg       0.57      0.47      0.42        47

