In [2]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import datetime
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import metrics
from sklearn.metrics import classification_report

In [3]:
# load up dataframe
df_base = pd.read_csv('./price data/eurusd-otc_5sec_2024-06-11_2024-07-05.csv', index_col=0)
df_base.head()

Unnamed: 0,time,close
0,2024-06-11 02:00:00,1.0615
1,2024-06-11 02:00:05,1.06162
2,2024-06-11 02:00:10,1.06154
3,2024-06-11 02:00:15,1.061595
4,2024-06-11 02:00:20,1.06161


In [4]:
# create a copy of base in case we need base again later
df = df_base.copy()

# seperate into numpy arrays for faster computation
close_l = df['close'].values

# create a diff array to see if price went up or down between rows
diff_l = [0]

for i in range(0,len(close_l)-1):
    if close_l[i+1] > close_l[i]:
        diff_l.append(1)
    elif close_l[i+1] < close_l[i]:
        diff_l.append(-1)
    elif close_l[i+1] == close_l[i]:
        diff_l.append(-1)

# add back onto df
df['diff'] = diff_l
df.head()

Unnamed: 0,time,close,diff
0,2024-06-11 02:00:00,1.0615,0
1,2024-06-11 02:00:05,1.06162,1
2,2024-06-11 02:00:10,1.06154,-1
3,2024-06-11 02:00:15,1.061595,1
4,2024-06-11 02:00:20,1.06161,1


In [5]:
# convert time column from string to datetime
df['time'] = pd.to_datetime(df['time'])
# split the times
df['day'] = df['time'].dt.dayofweek
df['hour'] = df['time'].dt.hour
df['minute'] = df['time'].dt.minute
# rearrange the df
df = df[['time', 'close', 'day', 'hour', 'minute', 'diff']]
df.head()

Unnamed: 0,time,close,day,hour,minute,diff
0,2024-06-11 02:00:00,1.0615,1,2,0,0
1,2024-06-11 02:00:05,1.06162,1,2,0,1
2,2024-06-11 02:00:10,1.06154,1,2,0,-1
3,2024-06-11 02:00:15,1.061595,1,2,0,1
4,2024-06-11 02:00:20,1.06161,1,2,0,1


In [6]:
# we can look at some basic correlation in the df
df.corr()

Unnamed: 0,time,close,day,hour,minute,diff
time,1.0,-0.306953,-0.008029,0.001719,0.001086,-0.002057
close,-0.306953,1.0,-0.047274,-0.021774,0.00034,0.001961
day,-0.008029,-0.047274,1.0,-0.015358,-0.000203,-0.00072
hour,0.001719,-0.021774,-0.015358,1.0,-8.2e-05,-3.5e-05
minute,0.001086,0.00034,-0.000203,-8.2e-05,1.0,-0.001172
diff,-0.002057,0.001961,-0.00072,-3.5e-05,-0.001172,1.0


In [7]:
# feature selection
X = df.iloc[:,1:5] # all features
Y = df.iloc[:,-1] # target variable

# Select the top 3 features:
best_features= SelectKBest(score_func=chi2, k=3)
fit= best_features.fit(X,Y)

df_scores= pd.DataFrame(fit.scores_)
df_columns= pd.DataFrame(X.columns)

features_scores= pd.concat([df_columns, df_scores], axis=1)
features_scores.columns= ['Features', 'Score']
features_scores.sort_values(by = 'Score')

Unnamed: 0,Features,Score
0,close,0.000599
1,day,1.526037
2,hour,7.7705
3,minute,35.406001


In [8]:
# build the model
# split the datasets
X = df[['minute', 'day', 'hour']]  # the top 3 features
Y = df[['diff']]  # the target output

# Second, split the dataset into train and test:
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.4,random_state=100)

# Third, create a logistic regression body:
logreg = LogisticRegression()
logreg.fit(X_train,y_train.values.ravel())

# run prediction
y_pred = logreg.predict(X_test)
print(X_test) #test dataset
print(y_pred) #predicted values

        minute  day  hour
338189      42    6    15
54328       27    4     5
285859       1    3    15
16799       19    2     1
27022       31    2    15
...        ...  ...   ...
224985      28    0     2
178330      40    4     9
32259       48    2    22
43302        8    3    14
305731      37    4    18

[169304 rows x 3 columns]
[-1 -1 -1 ... -1 -1 -1]


In [11]:
X_train.shape

(253955, 3)

In [10]:
# Evaluate the Model’s Performance
print('Accuracy: ',metrics.accuracy_score(y_test, y_pred))
print('Recall: ',metrics.recall_score(y_test, y_pred, zero_division=1))
print('Precision:',metrics.precision_score(y_test, y_pred, zero_division=1))
print('CL Report:',metrics.classification_report(y_test, y_pred, zero_division=1))

Accuracy:  0.5129294050937958


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].