In [29]:
#Ref - https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
import pandas as pd
import numpy as np
import datetime
import talib as ta
import matplotlib.pyplot as plt
import seaborn as sns
import talib as ta

In [30]:
#Read data
data = pd.read_csv('data/analysis/D1_XAUUSD_Returns.csv', parse_dates=[0], index_col='Date')
#print(data.head())

In [31]:
#Date related features
data['Week'] = data.index.week
data['Day_Week'] = data.index.dayofweek
data['daily_return'] = data.Close.pct_change().mul(100)
data['day_week_sin'] = np.sin(data.Day_Week*(2.*np.pi/7))
data['day_week_cos'] = np.cos(data.Day_Week*(2.*np.pi/7))
data['week_sin'] = np.sin((data.Week-1)*(2.*np.pi/52))
data['week_cos'] = np.cos((data.Week-1)*(2.*np.pi/52))
data['yearday_sin'] = np.sin((data.Yearday)*(2.*np.pi/365.25))
data['yearday_cos'] = np.cos((data.Yearday)*(2.*np.pi/365.25))
#Simple Moving Average - 50 day
data['SMA_50'] = ta.SMA(data.Close, timeperiod = 50)
#Simple Moving Average - 200 day
data['SMA_200'] = ta.SMA(data.Close, timeperiod = 200)
# Exponential Moving Average - 9 day
data['EMA_9'] = ta.EMA(data.Close, timeperiod = 9)
# Exponential Moving Average - 21 day
data['EMA_21'] = ta.EMA(data.Close, timeperiod = 21)
# MACD
data['macd'], data['macdsignal'], data['macdhist'] = ta.MACD(data.Close, fastperiod=12, slowperiod=200, signalperiod=9)
# RSI
data['RSI'] = ta.RSI(data.Close, timeperiod=14)
#Define relative indicators
data['SMA_Delta'] = (data['SMA_50'] - data['SMA_200']) / data['Close']
#Define relative indicators
data['EMA_Delta'] = (data['EMA_9'] - data['EMA_21']) / data['Close']

In [32]:
print(data.tail())

               Open     High      Low    Close  Volume    Weekday  Yearday  \
Date                                                                         
2019-04-03  1291.70  1294.49  1288.34  1291.76   94534  Wednesday       93   
2019-04-04  1291.76  1294.34  1280.85  1292.10   77631   Thursday       94   
2019-04-05  1292.10  1293.15  1284.33  1291.18  102223     Friday       95   
2019-04-07  1291.31  1293.63  1291.20  1291.67    2133     Sunday       97   
2019-04-08  1291.73  1303.65  1291.58  1297.94   73029     Monday       98   

            Return_per_day  Return_log  Week  ...     SMA_50     SMA_200  \
Date                                          ...                          
2019-04-03        0.004645    0.000046    14  ...  1308.0620  1252.10065   
2019-04-04        0.026321    0.000263    14  ...  1307.6060  1252.59180   
2019-04-05       -0.071202   -0.000712    14  ...  1307.2982  1253.17900   
2019-04-07        0.027879    0.000379    14  ...  1306.9242  1253.76085 

In [33]:
#Choosing target variable
data['Large_return'] = np.where(data['daily_return']>=0.5, 1, 0)

In [34]:
#Drop rows with nan values
data.dropna(inplace=True)

In [35]:
#Pick features
features_list = ['Volume','day_week_sin','day_week_cos','week_sin','week_cos','yearday_sin','yearday_cos',
                 'macdhist', 'RSI', 'SMA_Delta','EMA_Delta']
features = data[features_list]
features.head()

Unnamed: 0_level_0,Volume,day_week_sin,day_week_cos,week_sin,week_cos,yearday_sin,yearday_cos,macdhist,RSI,SMA_Delta,EMA_Delta
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2004-01-12,20736,0.0,1.0,0.239316,0.970942,0.204966,0.978769,2.239954,70.847411,0.071611,0.011534
2004-01-13,21473,0.781831,0.62349,0.239316,0.970942,0.221772,0.975099,1.835514,68.227339,0.072198,0.010996
2004-01-14,20415,0.974928,-0.222521,0.239316,0.970942,0.238513,0.971139,0.945729,57.107955,0.073547,0.009352
2004-01-15,22449,0.433884,-0.900969,0.239316,0.970942,0.255182,0.966893,-0.982156,39.857679,0.075568,0.005214
2004-01-16,19571,-0.433884,-0.900969,0.239316,0.970942,0.271777,0.96236,-2.539018,37.444023,0.076086,0.001362


In [36]:
#Pick target variable 
target_unshifted = data['Large_return']
#Add shift so we can predict target on time t=1 with features from time t
target = target_unshifted.shift(-1)
#Drop last value from features and target (Nan)
features = features[:-1]
target = target[:-1]
print(features.tail())
print(target.tail())

            Volume  day_week_sin  day_week_cos  week_sin      week_cos  \
Date                                                                     
2019-04-02   84862      0.781831      0.623490       1.0  6.123234e-17   
2019-04-03   94534      0.974928     -0.222521       1.0  6.123234e-17   
2019-04-04   77631      0.433884     -0.900969       1.0  6.123234e-17   
2019-04-05  102223     -0.433884     -0.900969       1.0  6.123234e-17   
2019-04-07    2133     -0.781831      0.623490       1.0  6.123234e-17   

            yearday_sin  yearday_cos  macdhist        RSI  SMA_Delta  \
Date                                                                   
2019-04-02     0.999930    -0.011826 -5.395313  42.392384   0.044015   
2019-04-03     0.999579    -0.029025 -5.344675  42.440840   0.043322   
2019-04-04     0.998932    -0.046215 -5.125334  42.734791   0.042577   
2019-04-05     0.997989    -0.063391 -4.943795  42.108149   0.041915   
2019-04-07     0.995218    -0.097683 -4.629876  4

In [37]:
target.value_counts()

0.0    3604
1.0    1149
Name: Large_return, dtype: int64

In [38]:
def split_train_test(data, train_ratio):
    indices = np.arange(len(data))
    train_set_size = int(len(data) * train_ratio)
    train_indices = indices[:train_set_size]
    test_indices = indices[train_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

X_train_set, X_test_set = split_train_test(features, 0.8)

In [39]:
print(len(X_train_set))
print(len(X_test_set))
print("Training Series:", "\n", X_train_set.head(), "\n")
print("Training Series:", "\n", X_train_set.tail(), "\n")
print("Testing Series:", "\n", X_test_set.head())
print("Testing Series:", "\n", X_test_set.tail())

3802
951
Training Series: 
             Volume  day_week_sin  day_week_cos  week_sin  week_cos  \
Date                                                                 
2004-01-12   20736      0.000000      1.000000  0.239316  0.970942   
2004-01-13   21473      0.781831      0.623490  0.239316  0.970942   
2004-01-14   20415      0.974928     -0.222521  0.239316  0.970942   
2004-01-15   22449      0.433884     -0.900969  0.239316  0.970942   
2004-01-16   19571     -0.433884     -0.900969  0.239316  0.970942   

            yearday_sin  yearday_cos  macdhist        RSI  SMA_Delta  \
Date                                                                   
2004-01-12     0.204966     0.978769  2.239954  70.847411   0.071611   
2004-01-13     0.221772     0.975099  1.835514  68.227339   0.072198   
2004-01-14     0.238513     0.971139  0.945729  57.107955   0.073547   
2004-01-15     0.255182     0.966893 -0.982156  39.857679   0.075568   
2004-01-16     0.271777     0.962360 -2.539018  3

In [40]:
y_train_set, y_test_set = split_train_test(target, 0.8)

In [41]:
print(len(y_train_set))
print(len(y_test_set))
print("Training Series:", "\n", y_train_set.head(), "\n")
print("Training Series:", "\n", y_train_set.tail(), "\n")
print("Testing Series:", "\n", y_test_set.head())
print("Testing Series:", "\n", y_test_set.tail())

3802
951
Training Series: 
 Date
2004-01-12    0.0
2004-01-13    0.0
2004-01-14    0.0
2004-01-15    0.0
2004-01-16    0.0
Name: Large_return, dtype: float64 

Training Series: 
 Date
2016-03-10    0.0
2016-03-11    0.0
2016-03-13    0.0
2016-03-14    0.0
2016-03-15    1.0
Name: Large_return, dtype: float64 

Testing Series: 
 Date
2016-03-16    0.0
2016-03-17    0.0
2016-03-18    0.0
2016-03-20    0.0
2016-03-21    0.0
Name: Large_return, dtype: float64
Testing Series: 
 Date
2019-04-02    0.0
2019-04-03    0.0
2019-04-04    0.0
2019-04-05    0.0
2019-04-07    0.0
Name: Large_return, dtype: float64


In [67]:
#SMOTE to balance data
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)

columns = X_train_set.columns

os_data_X,os_data_y=os.fit_sample(X_train_set, y_train_set.values.ravel())
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['y']==0]))
print("Number of positive returns",len(os_data_y[os_data_y['y']==1]))
print("Proportion of pos. returns data in oversampled data is ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of pos. returns data in oversampled data is ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))

length of oversampled data is  5620
Number of no subscription in oversampled data 2810
Number of positive returns 2810
Proportion of pos. returns data in oversampled data is  0.5
Proportion of pos. returns data in oversampled data is  0.5


In [68]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
rfe = RFE(logreg, 12)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

[ True  True  True  True  True  True  True  True  True  True  True]
[1 1 1 1 1 1 1 1 1 1 1]




In [69]:
#Use SMOTE data

X_train_set, X_test_set = split_train_test(os_data_X, 0.8)
y_train_set, y_test_set = split_train_test(os_data_y, 0.8)

In [70]:
print(len(X_train_set))
print(len(X_test_set))
print("Training Series:", "\n", X_train_set.head(), "\n")
print("Training Series:", "\n", X_train_set.tail(), "\n")
print("Testing Series:", "\n", X_test_set.head())
print("Testing Series:", "\n", X_test_set.tail())

4496
1124
Training Series: 
     Volume  day_week_sin  day_week_cos  week_sin  week_cos  yearday_sin  \
0  20736.0      0.000000      1.000000  0.239316  0.970942     0.204966   
1  21473.0      0.781831      0.623490  0.239316  0.970942     0.221772   
2  20415.0      0.974928     -0.222521  0.239316  0.970942     0.238513   
3  22449.0      0.433884     -0.900969  0.239316  0.970942     0.255182   
4  19571.0     -0.433884     -0.900969  0.239316  0.970942     0.271777   

   yearday_cos  macdhist        RSI  SMA_Delta  EMA_Delta  
0     0.978769  2.239954  70.847411   0.071611   0.011534  
1     0.975099  1.835514  68.227339   0.072198   0.010996  
2     0.971139  0.945729  57.107955   0.073547   0.009352  
3     0.966893 -0.982156  39.857679   0.075568   0.005214  
4     0.962360 -2.539018  37.444023   0.076086   0.001362   

Training Series: 
             Volume  day_week_sin  day_week_cos  week_sin  week_cos  \
4491  14331.016310      0.831205     -0.402743  0.518991  0.285975   

In [71]:
print(len(y_train_set))
print(len(y_test_set))
print("Training Series:", "\n", y_train_set.head(), "\n")
print("Training Series:", "\n", y_train_set.tail(), "\n")
print("Testing Series:", "\n", y_test_set.head())
print("Testing Series:", "\n", y_test_set.tail())

4496
1124
Training Series: 
      y
0  0.0
1  0.0
2  0.0
3  0.0
4  0.0 

Training Series: 
         y
4491  1.0
4492  1.0
4493  1.0
4494  1.0
4495  1.0 

Testing Series: 
         y
4496  1.0
4497  1.0
4498  1.0
4499  1.0
4500  1.0
Testing Series: 
         y
5615  1.0
5616  1.0
5617  1.0
5618  1.0
5619  1.0


In [72]:
#model = LogisticRegression(multi_class='multinomial',solver ='lbfgs', max_iter=100) 
model = LogisticRegression(max_iter=100) 
model.fit(X_train_set, y_train_set.values.ravel())
y_model = model.predict(X_test_set)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(model.score(X_test_set, y_test_set)))

Accuracy of logistic regression classifier on test set: 0.00




In [73]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
confusion_matrix = confusion_matrix(y_test_set, y_model)
print(confusion_matrix)
print(classification_report(y_test_set, y_model))

[[   0    0]
 [1124    0]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       0.00      0.00      0.00      1124

   micro avg       0.00      0.00      0.00      1124
   macro avg       0.00      0.00      0.00      1124
weighted avg       0.00      0.00      0.00      1124



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
