In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score

In [6]:
df = pd.read_csv('Data/Training/pair_features1_300_60.csv')
df['pnls'] = df['pnls']*100.0
df.head()

Unnamed: 0,Date,Ticker_P1,Close_P1,Ticker_P2,Close_P2,High_P1,High_P2,Low_P1,Low_P2,Volume_P1,...,abs_spread_normed_max,abs_spread_normed_90th,abs_spread_normed_75th,abs_spread_normed_median,abs_spread_normed_l7_avg,abs_spread_normed_l14_avg,cos_sim,corr_coef,pnls,num_entries
0,2015-01-02,EPAM,46.470001,DAL,43.946671,48.16,50.009998,46.0,48.709999,379300.0,...,,,,,,,,,,
1,2015-01-05,EPAM,46.27,DAL,43.213924,47.619999,49.48,45.799999,47.810001,294300.0,...,,,,,,,,,,
2,2015-01-06,EPAM,46.290001,DAL,42.19524,47.23,48.740002,45.77,46.25,426200.0,...,,,,,,,,,,
3,2015-01-07,EPAM,47.290001,DAL,41.97184,47.560001,47.59,46.470001,46.509998,253900.0,...,,,,,,,,,,
4,2015-01-08,EPAM,48.669998,DAL,43.062004,48.779999,48.32,47.509998,47.259998,562300.0,...,,,,,,,,,,


In [7]:
df.columns

Index(['Date', 'Ticker_P1', 'Close_P1', 'Ticker_P2', 'Close_P2', 'High_P1',
       'High_P2', 'Low_P1', 'Low_P2', 'Volume_P1', 'Volume_P2', 'abs_spread',
       'same_sector_flag', 'same_sub_industry_flag', 'abs_spread_mean',
       'abs_spread_std', 'abs_spread_mean_l28', 'abs_spread_std_l28',
       'spread_normed', 'abs_spread_normed_max', 'abs_spread_normed_90th',
       'abs_spread_normed_75th', 'abs_spread_normed_median',
       'abs_spread_normed_l7_avg', 'abs_spread_normed_l14_avg', 'cos_sim',
       'corr_coef', 'pnls', 'num_entries'],
      dtype='object')

In [8]:
spy_df = pd.read_csv('Data/Training/1999-12-01-2023-12-31_SPY.csv')
spy_df = spy_df[['Date','Adj Close']]
spy_df.columns = ['Date','SPY_Close']

look_forward_d = 60
# Define a variable to calculate the return if we just buy SPY and sell in the next 60 days
spy_60_return = []
for i in range(spy_df.shape[0]):
    if (i + look_forward_d) < spy_df.shape[0]:
        spy_60_return.append(
            100*(spy_df.loc[i+look_forward_d]['SPY_Close'] - spy_df.loc[i]['SPY_Close'])/spy_df.loc[i]['SPY_Close']
        )
    else:
        spy_60_return.append(
            np.nan
        )
spy_df['SPY_return_next_60'] = spy_60_return

In [9]:
print(df.shape)
df = df.drop(df.groupby(['Ticker_P1','Ticker_P2']).head(301).index,axis=0)
print(df.shape)
df = df.drop(df.groupby(['Ticker_P1','Ticker_P2']).tail(61).index,axis=0)
print(df.shape)

(1257000, 29)
(956000, 29)
(895000, 29)


In [10]:
df.isna().sum().sort_values(ascending=False).head(20)

abs_spread_normed_median     297000
abs_spread_normed_75th       297000
abs_spread_normed_90th       297000
abs_spread_normed_max        297000
abs_spread_normed_l14_avg     11000
abs_spread_normed_l7_avg       4000
Date                              0
abs_spread_std                    0
pnls                              0
corr_coef                         0
cos_sim                           0
spread_normed                     0
abs_spread_std_l28                0
abs_spread_mean_l28               0
abs_spread_mean                   0
Ticker_P1                         0
same_sub_industry_flag            0
same_sector_flag                  0
abs_spread                        0
Volume_P2                         0
dtype: int64

In [11]:
df = pd.merge(df, spy_df[['Date','SPY_return_next_60']], how='left', on='Date')
df['better_than_spy'] = df.SPY_return_next_60 < df.pnls
df['recommended_trade'] = (df.pnls >= 0) & (df.better_than_spy)

In [23]:
features_names = ['cos_sim', 'corr_coef', 'same_sector_flag', 'same_sub_industry_flag',
       'abs_spread_normed_max', 'abs_spread_normed_90th',
       'abs_spread_normed_75th', 'abs_spread_normed_median',
       'abs_spread_normed_l7_avg', 'abs_spread_normed_l14_avg']

# features_names = ['Date', 'Ticker_P1','Ticker_P2' ,'cos_sim', 'corr_coef', 'same_sector_flag', 'same_sub_industry_flag']
# features_names = ['corr_coef', 'same_sector_flag', 'same_sub_industry_flag']

# label = ['Date', 'Ticker_P1','Ticker_P2','recommended_trade','pnls']
label = ['recommended_trade']

In [24]:
print(df.shape)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df = df.dropna(subset=features_names)
print(df.shape)

(598000, 32)
(598000, 32)


In [25]:
df.columns

Index(['Date', 'Ticker_P1', 'Close_P1', 'Ticker_P2', 'Close_P2', 'High_P1',
       'High_P2', 'Low_P1', 'Low_P2', 'Volume_P1', 'Volume_P2', 'abs_spread',
       'same_sector_flag', 'same_sub_industry_flag', 'abs_spread_mean',
       'abs_spread_std', 'abs_spread_mean_l28', 'abs_spread_std_l28',
       'spread_normed', 'abs_spread_normed_max', 'abs_spread_normed_90th',
       'abs_spread_normed_75th', 'abs_spread_normed_median',
       'abs_spread_normed_l7_avg', 'abs_spread_normed_l14_avg', 'cos_sim',
       'corr_coef', 'pnls', 'num_entries', 'SPY_return_next_60',
       'better_than_spy', 'recommended_trade'],
      dtype='object')

In [26]:
X_train, X_test, y_train, y_test = train_test_split(df[features_names], df[label], test_size=0.2, shuffle=False)

In [27]:
# Initialize and train the Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# # Initialize and train the Logistic Regression model
# log_reg = LogisticRegression(max_iter=200, random_state=42)
# log_reg.fit(X_train, y_train)

# # Initialize RandomForest 
# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)

xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [28]:
# Plot feature importances
importances = clf.feature_importances_
feature_imp_tb_tree = pd.DataFrame()
feature_imp_tb_tree['features'] = features_names
feature_imp_tb_tree['importances'] = importances

In [29]:
# # Plot feature importances
# importances = rf.feature_importances_
# feature_imp_tb_rf = pd.DataFrame()
# feature_imp_tb_rf['features'] = features_names
# feature_imp_tb_rf['importances'] = importances

# Plot feature importances
importances = xgb.feature_importances_
feature_imp_tb_xgb = pd.DataFrame()
feature_imp_tb_xgb['features'] = features_names
feature_imp_tb_xgb['importances'] = importances

In [30]:
# # Plot feature importances
# importances = log_reg.coef_[0]
# feature_imp_tb_lr = pd.DataFrame()
# feature_imp_tb_lr['features'] = features_names
# feature_imp_tb_lr['coef'] = importances

In [31]:
feature_imp_tb_tree

Unnamed: 0,features,importances
0,cos_sim,0.172855
1,corr_coef,0.161415
2,same_sector_flag,0.007726
3,same_sub_industry_flag,0.000743
4,abs_spread_normed_max,0.082156
5,abs_spread_normed_90th,0.095348
6,abs_spread_normed_75th,0.104701
7,abs_spread_normed_median,0.122351
8,abs_spread_normed_l7_avg,0.12673
9,abs_spread_normed_l14_avg,0.125974


In [32]:
feature_imp_tb_rf

NameError: name 'feature_imp_tb_rf' is not defined

In [33]:
feature_imp_tb_xgb

Unnamed: 0,features,importances
0,cos_sim,0.120647
1,corr_coef,0.12979
2,same_sector_flag,0.092785
3,same_sub_industry_flag,0.04065
4,abs_spread_normed_max,0.105478
5,abs_spread_normed_90th,0.107309
6,abs_spread_normed_75th,0.108519
7,abs_spread_normed_median,0.114118
8,abs_spread_normed_l7_avg,0.091829
9,abs_spread_normed_l14_avg,0.088876


In [34]:
# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision, recall, and F1 score
report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.62
              precision    recall  f1-score   support

       False       0.74      0.73      0.74     87928
        True       0.29      0.30      0.30     31672

    accuracy                           0.62    119600
   macro avg       0.52      0.52      0.52    119600
weighted avg       0.62      0.62      0.62    119600



In [35]:
# Predict the labels for the test set
y_pred = rf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision, recall, and F1 score
report = classification_report(y_test, y_pred)
print(report)

NameError: name 'rf' is not defined

In [36]:
# Predict the labels for the test set
y_pred = xgb.predict(X_test)
y_pred_proba = [x[1] for x in xgb.predict_proba(X_test)]

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision, recall, and F1 score
report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.72
              precision    recall  f1-score   support

       False       0.74      0.96      0.84     87928
        True       0.36      0.06      0.10     31672

    accuracy                           0.72    119600
   macro avg       0.55      0.51      0.47    119600
weighted avg       0.64      0.72      0.64    119600

