In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score

In [11]:
# Load the per-pair feature table and rescale `pnls` by 100 in one pass.
# `pnls` is later compared with SPY_return_next_60, which is computed as a
# percentage, so this presumably converts a fractional P&L to percent.
df = (
    pd.read_csv('Data/Training/pair_features1_300_60.csv')
    .assign(pnls=lambda frame: frame['pnls'] * 100.0)
)
df.head()

Unnamed: 0,Date,Ticker_P1,Close_P1,Ticker_P2,Close_P2,High_P1,High_P2,Low_P1,Low_P2,Volume_P1,...,abs_spread_normed_max,abs_spread_normed_90th,abs_spread_normed_75th,abs_spread_normed_median,abs_spread_normed_l7_avg,abs_spread_normed_l14_avg,cos_sim,corr_coef,pnls,num_entries
0,2015-01-02,CAG,21.161039,NOC,126.093826,28.241245,149.160004,27.867704,144.539993,5126379.0,...,,,,,,,,,,
1,2015-01-05,CAG,20.930645,NOC,123.433464,28.023346,146.470001,27.564201,142.449997,5400213.0,...,,,,,,,,,,
2,2015-01-06,CAG,20.723877,NOC,124.113678,27.743191,146.0,27.167315,141.580002,5229051.0,...,,,,,,,,,,
3,2015-01-07,CAG,21.214211,NOC,128.039536,27.976654,148.830002,27.431908,144.880005,3124092.0,...,,,,,,,,,,
4,2015-01-08,CAG,21.674999,NOC,131.009796,28.56031,153.139999,28.054476,149.809998,4154019.0,...,,,,,,,,,,


In [12]:
# List the columns of the raw pair-feature table as loaded.
df.columns

Index(['Date', 'Ticker_P1', 'Close_P1', 'Ticker_P2', 'Close_P2', 'High_P1',
       'High_P2', 'Low_P1', 'Low_P2', 'Volume_P1', 'Volume_P2', 'abs_spread',
       'abs_spread_mean', 'abs_spread_std', 'abs_spread_mean_l28',
       'abs_spread_std_l28', 'spread_normed', 'abs_spread_normed_max',
       'abs_spread_normed_90th', 'abs_spread_normed_75th',
       'abs_spread_normed_median', 'abs_spread_normed_l7_avg',
       'abs_spread_normed_l14_avg', 'cos_sim', 'corr_coef', 'pnls',
       'num_entries'],
      dtype='object')

In [13]:
# Load the SPY price history, keeping only the date and the adjusted close
# (renamed to SPY_Close).
spy_df = (
    pd.read_csv('Data/Training/1999-12-01-2023-12-31_SPY.csv')[['Date', 'Adj Close']]
    .rename(columns={'Adj Close': 'SPY_Close'})
)

# Percentage return from buying SPY at a given row and selling 60 rows later.
# The horizon is counted in rows of the file (presumably trading days), not
# calendar days. shift(-60) lines each close up with the close 60 rows ahead,
# so the final 60 rows have no future price and are left as NaN.
spy_60_return = (
    100 * (spy_df['SPY_Close'].shift(-60) - spy_df['SPY_Close']) / spy_df['SPY_Close']
)
spy_df['SPY_return_next_60'] = spy_60_return

In [14]:
# Trim each (Ticker_P1, Ticker_P2) pair: remove its first 301 rows and then
# its last 61 rows, printing the frame shape before and after each step.
# NOTE(review): 301/61 presumably correspond to the 300-row look-back and
# 60-row forward window suggested by the input file name - confirm.
# NOTE: re-running this cell trims further rows from the already-trimmed frame.
pair_keys = ['Ticker_P1', 'Ticker_P2']

print(df.shape)
leading_rows = df.groupby(pair_keys).head(301).index
df = df.drop(index=leading_rows)
print(df.shape)
trailing_rows = df.groupby(pair_keys).tail(61).index
df = df.drop(index=trailing_rows)
print(df.shape)


(1257000, 27)
(956000, 27)
(895000, 27)


In [15]:
# Count missing values per column and show the 20 columns with the most NaNs.
df.isna().sum().sort_values(ascending=False).head(20)

abs_spread_normed_median     297000
abs_spread_normed_75th       297000
abs_spread_normed_90th       297000
abs_spread_normed_max        297000
abs_spread_normed_l14_avg     11000
abs_spread_normed_l7_avg       4000
Date                              0
abs_spread_mean_l28               0
pnls                              0
corr_coef                         0
cos_sim                           0
spread_normed                     0
abs_spread_std_l28                0
abs_spread_std                    0
Ticker_P1                         0
abs_spread_mean                   0
abs_spread                        0
Volume_P2                         0
Volume_P1                         0
Low_P2                            0
dtype: int64

In [16]:
# Attach the forward SPY return to every pair row by matching on Date.
# validate='many_to_one' makes pandas raise if a date appears more than once
# in spy_df; without it a duplicated date would silently duplicate pair rows.
df = pd.merge(
    df,
    spy_df[['Date', 'SPY_return_next_60']],
    how='left',
    on='Date',
    validate='many_to_one',
)

# A pair trade "beats SPY" when its pnl exceeds SPY's return over the next 60 rows.
# NOTE: comparisons against NaN evaluate to False, so rows with no SPY return
# (unmatched date, or one of the last 60 SPY rows) get better_than_spy == False.
df['better_than_spy'] = df['SPY_return_next_60'] < df['pnls']

# Label: the trade is non-negative AND better than holding SPY.
df['recommended_trade'] = (df['pnls'] >= 0) & df['better_than_spy']

In [17]:
# List the columns again to confirm the SPY return and label columns were added.
df.columns

Index(['Date', 'Ticker_P1', 'Close_P1', 'Ticker_P2', 'Close_P2', 'High_P1',
       'High_P2', 'Low_P1', 'Low_P2', 'Volume_P1', 'Volume_P2', 'abs_spread',
       'abs_spread_mean', 'abs_spread_std', 'abs_spread_mean_l28',
       'abs_spread_std_l28', 'spread_normed', 'abs_spread_normed_max',
       'abs_spread_normed_90th', 'abs_spread_normed_75th',
       'abs_spread_normed_median', 'abs_spread_normed_l7_avg',
       'abs_spread_normed_l14_avg', 'cos_sim', 'corr_coef', 'pnls',
       'num_entries', 'SPY_return_next_60', 'better_than_spy',
       'recommended_trade'],
      dtype='object')

In [19]:
# Columns used as model inputs (similarity measures and normalised-spread statistics).
features_names = [
    'cos_sim',
    'corr_coef',
    'abs_spread_normed_max',
    'abs_spread_normed_90th',
    'abs_spread_normed_75th',
    'abs_spread_normed_median',
    'abs_spread_normed_l7_avg',
    'abs_spread_normed_l14_avg',
]

# Boolean target column to predict.
label = 'recommended_trade'

In [23]:
# Treat +/-inf as missing everywhere, then keep only rows where every model
# feature is present; print the shape before and after to show the row loss.
print(df.shape)
df = (
    df.replace([np.inf, -np.inf], np.nan)
    .dropna(subset=features_names)
)
print(df.shape)

(895000, 30)
(598000, 30)


In [24]:
# Hold out the last 20% of rows (in current frame order) as the test set.
# shuffle=False means no randomisation: the first 80% of rows train, the rest test.
# NOTE(review): the frame appears to be ordered by pair and then by date, so this
# likely holds out the last pairs rather than the most recent dates - confirm
# this is the intended evaluation scheme.
feature_matrix = df[features_names]
target = df[label]
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    target,
    test_size=0.2,
    shuffle=False,
)

In [25]:
# Fit a decision tree with default hyper-parameters (fixed seed for repeatability).
# fit() returns the estimator itself, so construction and training can be chained.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Fit a logistic regression on the same raw features, capped at 200 solver iterations.
log_reg = LogisticRegression(max_iter=200, random_state=42).fit(X_train, y_train)

# Display the fitted logistic regression as the cell output.
log_reg

In [26]:
# Tabulate (not plot) the decision tree's feature importances, one row per feature.
importances = clf.feature_importances_
feature_imp_tb_tree = pd.DataFrame(
    {
        'features': features_names,
        'importances': importances,
    }
)

In [27]:
# Tabulate (not plot) the logistic regression coefficients, one row per feature.
# coef_[0] is the single coefficient row of the fitted binary model.
# Note: this rebinds `importances`, previously the tree's feature importances.
importances = log_reg.coef_[0]
feature_imp_tb_lr = pd.DataFrame(
    {
        'features': features_names,
        'coef': importances,
    }
)

In [28]:
# Display the decision-tree feature-importance table.
feature_imp_tb_tree

Unnamed: 0,features,importances
0,cos_sim,0.165585
1,corr_coef,0.174071
2,abs_spread_normed_max,0.086254
3,abs_spread_normed_90th,0.083288
4,abs_spread_normed_75th,0.090323
5,abs_spread_normed_median,0.12428
6,abs_spread_normed_l7_avg,0.122541
7,abs_spread_normed_l14_avg,0.153659


In [29]:
# Display the logistic-regression coefficient table.
feature_imp_tb_lr

Unnamed: 0,features,coef
0,cos_sim,-0.816847
1,corr_coef,0.184939
2,abs_spread_normed_max,-0.120894
3,abs_spread_normed_90th,0.525478
4,abs_spread_normed_75th,-0.605174
5,abs_spread_normed_median,0.837406
6,abs_spread_normed_l7_avg,0.756699
7,abs_spread_normed_l14_avg,-0.913138


In [30]:
# Evaluate the decision tree on the held-out rows.
y_pred = clf.predict(X_test)

# Overall accuracy plus the per-class precision / recall / F1 breakdown.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(report)

Accuracy: 0.65
              precision    recall  f1-score   support

       False       0.77      0.74      0.75     86099
        True       0.39      0.43      0.41     33501

    accuracy                           0.65    119600
   macro avg       0.58      0.59      0.58    119600
weighted avg       0.66      0.65      0.66    119600



In [31]:
# Fraction of test rows whose label is True (the positive-class base rate).
np.mean(y_test)

0.2801086956521739

In [32]:
# Evaluate the logistic regression on the held-out rows.
# Note: this rebinds y_pred / accuracy / report from the decision-tree evaluation.
y_pred = log_reg.predict(X_test)

# Overall accuracy plus the per-class precision / recall / F1 breakdown.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(report)

Accuracy: 0.72
              precision    recall  f1-score   support

       False       0.72      1.00      0.84     86099
        True       0.96      0.00      0.00     33501

    accuracy                           0.72    119600
   macro avg       0.84      0.50      0.42    119600
weighted avg       0.79      0.72      0.60    119600



In [33]:
# Fraction of test rows predicted True. y_pred here is whatever was assigned
# most recently - in notebook order, the logistic regression predictions.
np.mean(y_pred)

0.0005936454849498327