In [94]:
!pip install pandas_ta
!pip install xgboost

import pandas as pd
import numpy as np
import pandas_ta as ta
import matplotlib.pyplot as plt
from scipy.stats import linregress
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier



**Loading and Preparing Data**

In [95]:
df = pd.read_csv("data/AAPL.USUSD_Candlestick_1_Hour_BID_26.01.2017-05.01.2024.csv")
df.tail()

Unnamed: 0,Local time,Open,High,Low,Close,Volume
60859,05.01.2024 19:00:00.000 GMT-0500,181.127,181.127,181.127,181.127,0.0
60860,05.01.2024 20:00:00.000 GMT-0500,181.127,181.127,181.127,181.127,0.0
60861,05.01.2024 21:00:00.000 GMT-0500,181.127,181.127,181.127,181.127,0.0
60862,05.01.2024 22:00:00.000 GMT-0500,181.127,181.127,181.127,181.127,0.0
60863,05.01.2024 23:00:00.000 GMT-0500,181.127,181.127,181.127,181.127,0.0


*Remove any rows with a volume of 0 (market closed)*

In [96]:
indexZeros = df[df['Volume'] == 0].index

df.drop(indexZeros, inplace = True)
df.loc[(df["Volume"] == 0)]
df.isna().sum()

Local time    0
Open          0
High          0
Low           0
Close         0
Volume        0
dtype: int64

In [97]:
#adding indicators from pandas_ta
df['ATR'] = df.ta.atr(length=20)    #length param represents # of candles analyzed
df['RSI'] = df.ta.rsi()
df['Average'] = df.ta.midprice(length=1)    #midprice
#moving averages
df['MA40'] = df.ta.sma(length=40)
df['MA80'] = df.ta.sma(length=80)
df['MA160'] = df.ta.sma(length=160)

def get_slope(array):
    y = np.array(array)
    x = np.arange(len(y))
    slope, intercept, r_value, p_value, std_err = linregress(x, y)
    return slope

#now get the slope of each indicator
df['RSI_Slope'] = df['RSI'].rolling(window=10).apply(get_slope, raw=True)
df['Average_Slope'] = df['Average'].rolling(window=10).apply(get_slope, raw=True)
df['MA40_Slope'] = df['MA40'].rolling(window=10).apply(get_slope, raw=True)
df['MA80_Slope'] = df['MA80'].rolling(window=10).apply(get_slope, raw=True)
df['MA160_Slope'] = df['MA160'].rolling(window=10).apply(get_slope, raw=True)

df.tail()

Unnamed: 0,Local time,Open,High,Low,Close,Volume,ATR,RSI,Average,MA40,MA80,MA160,RSI_Slope,Average_Slope,MA40_Slope,MA80_Slope,MA160_Slope
60851,05.01.2024 11:00:00.000 GMT-0500,182.647,182.727,181.907,182.086,92.304,1.121175,34.376395,182.317,187.5255,191.095587,192.684031,1.362818,0.047876,-0.262235,-0.169567,-0.047159
60852,05.01.2024 12:00:00.000 GMT-0500,182.096,182.206,181.267,181.586,99.048,1.112066,32.484464,181.7365,187.242475,190.908325,192.624156,1.132512,0.027609,-0.2609,-0.168842,-0.049363
60853,05.01.2024 13:00:00.000 GMT-0500,181.576,182.226,181.196,182.027,104.976,1.107963,35.838538,181.711,186.96375,190.7247,192.566469,0.973469,-0.031839,-0.261791,-0.170322,-0.051592
60854,05.01.2024 14:00:00.000 GMT-0500,182.026,182.167,180.417,180.476,124.044,1.140065,30.163062,181.292,186.6235,190.5272,192.499588,0.344893,-0.091988,-0.268011,-0.174671,-0.054374
60855,05.01.2024 15:00:00.000 GMT-0500,180.467,181.227,180.166,181.127,95.568,1.136111,34.828211,180.6965,186.296525,190.338325,192.448031,0.285097,-0.175961,-0.277119,-0.178951,-0.056198


**Target Categories**

In [102]:
ratio = 2   #take profit / ratio = stop loss

def my_target(bars, df):
    
    length = len(df)
    high = list(df['High'])
    low = list(df['Low'])
    open = list(df['Open'])
    close = list(df['Close'])
    trendcat = [None] * length
    
    for line in range(0, length - bars - 2):
        tp = 0.01 * open[line + 1]  #take profit at 2% moves
        valueOpenLow = 0
        valueOpenHigh = 0
        for i in range(1, bars + 2):
            value1 = open[line + 1] - low[line + i]
            value2 = open[line + 1] - high[line + i]
            valueOpenLow = max(value1, valueOpenLow)
            valueOpenHigh = min(value2, valueOpenHigh)

            if ((valueOpenLow >= tp) and (-valueOpenHigh <= (tp / ratio))):
                trendcat[line] = 1  #downtrend
                break
            elif ((valueOpenLow <= (tp / ratio)) and (-valueOpenHigh >= tp)):
                trendcat[line] = 2  #uptrend
            else:
                trendcat[line] = 0  #no clear trend
                
    return trendcat

In [103]:
df['Target'] = my_target(16, df)
df_model = df[['Volume', 'ATR', 'RSI', 'Average', 'MA40', 'MA80', 'MA160', 'RSI_Slope', 'Average_Slope', 'MA40_Slope', 'MA80_Slope', 'MA160_Slope', 'Target']]

#uncomment if data is weird
'''
fig = plt.figure(figsize = (15,20))
ax = fig.gca()
df_model.hist(ax = ax)
plt.show()
'''

'\nfig = plt.figure(figsize = (15,20))\nax = fig.gca()\ndf_model.hist(ax = ax)\nplt.show()\n'

**Splitting Input Data and Target**

In [104]:
df_model = df_model.dropna()    #clean empty rows with dropna()

inputs = ['Volume', 'ATR', 'RSI', 'Average', 'MA40', 'MA80', 'MA160', 'RSI_Slope', 'Average_Slope', 'MA40_Slope', 'MA80_Slope', 'MA160_Slope']
X = df_model[inputs]
Y = df_model['Target']
#print(X)

**Fitting the KNN Model + Evaluation**

*This approach is changed from sampling random data from the dataset.
Instead, it trains on the first 80% of price data, and tests on the remaining 20%, which better simulates real world data*


In [109]:
train_index = int(0.8 * len(X))
x_train, x_test = X[:train_index], X[train_index:]
y_train, y_test = Y[:train_index], Y[train_index:]

model = XGBClassifier()
model.fit(x_train, y_train)
pred_train = model.predict(x_train)
pred_test = model.predict(x_test)
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("Accuracy train: {:.4%}".format(acc_train))
print("Accuracy test: {:.4%}".format(acc_test))
print(df_model['Target'].value_counts() * 100 / df_model['Target'].count())

#Accuracy of random signals for seed
pred_test = np.random.choice([0, 1, 2], len(y_test))
accuracy_test = accuracy_score(y_test, pred_test)
print("Gambler's Accuracy: %.2f%%" % (accuracy_test * 100.0))


Accuracy train: 95.2002%
Accuracy test: 40.8172%
Target
0.0    45.735282
1.0    30.245232
2.0    24.019486
Name: count, dtype: float64
Gambler's Accuracy: 32.03%
