In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score

In [51]:
# Read the three input tables from the local Data/ directory in one pass.
# The unpacking order matches the order of the paths below.
recorded_info_tb, features_tb, labels_tb = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/recorded_info_tb.csv",
        'Data/features_tb.csv',
        'Data/labels_tb.csv',
    )
)

In [56]:
# Preview the first rows of `combined`.
# NOTE(review): in the visible notebook `combined` is only assigned in a later cell
# (the features_tb / labels_tb merge). On a fresh top-to-bottom run this cell would
# raise NameError unless that cell is moved above this one — confirm the cell order.
combined.head()

Unnamed: 0,ticker1,ticker2,target_date,same_sector_flag,same_sub_industry_flag,cos_sim,corr_coef,abs_spread_normed_max,abs_spread_normed_90th,abs_spread_normed_75th,abs_spread_normed_median,abs_spread_normed_l7_avg,abs_spread_normed_l14_avg,total_pnl,total_pnl_l28_mean_std,entry_detected_flag
0,NTAP,QRVO,2023-05-01,True,False,0.981826,0.770781,2.177635,1.718649,1.156688,0.693695,0.822221,0.852141,0.0,0.0,0
1,NTAP,QRVO,2023-05-22,True,False,0.982596,0.791802,2.316735,1.825371,1.096184,0.670432,0.802924,0.758347,0.022869,0.0,1
2,NTAP,QRVO,2023-03-30,True,False,0.979065,0.711062,2.125567,1.653205,1.209742,0.732704,0.570911,0.630389,0.0,0.0,0
3,NTAP,QRVO,2021-03-12,True,False,0.941363,0.075981,1.964868,1.496531,1.342377,0.892583,1.48614,1.543575,0.101811,0.0,1
4,NTAP,QRVO,2022-06-24,True,False,0.97481,0.393727,1.96143,1.585725,1.238452,0.813605,1.894564,1.789668,0.0,0.0,0


In [60]:
# Load a second table of pairs from Data/tech_sector_pairs.csv.
# NOTE(review): the name `combined2` says nothing about its provenance; a name
# derived from the file (e.g. tech_sector_pairs) would read better — confirm no other cell relies on it.
combined2 = pd.read_csv("Data/tech_sector_pairs.csv")

In [62]:
# Line up total_pnl from both tables for every (ticker1, ticker2, target_date) key
# they share. With the default suffixes, `total_pnl_x` comes from `combined` and
# `total_pnl_y` from `combined2`.
# NOTE(review): in the visible notebook `combined` is assigned in a later cell —
# confirm the cell order before relying on Restart & Run All.
cross_check_tb = combined[['ticker1', 'ticker2', 'target_date', 'total_pnl']].merge(
    combined2[['ticker1', 'ticker2', 'target_date', 'total_pnl']],
    on=['ticker1', 'ticker2', 'target_date'],
    how='inner',
)

In [63]:
# Display the side-by-side total_pnl values from the two sources.
# NOTE(review): the saved output shows several keys where total_pnl_x and total_pnl_y
# differ; the reason is not explained anywhere in the visible notebook — worth a prose note.
cross_check_tb

Unnamed: 0,ticker1,ticker2,target_date,total_pnl_x,total_pnl_y
0,NTAP,QRVO,2022-06-24,0.000000,0.000000
1,NTAP,VRSN,2022-03-14,0.048947,0.061699
2,QRVO,TEL,2022-02-22,0.000000,-0.020837
3,QRVO,SWKS,2022-12-16,0.000000,-0.004340
4,QRVO,WDC,2021-06-24,-0.043489,0.172910
...,...,...,...,...,...
157,CDW,HPE,2022-01-24,0.000000,0.000000
158,TER,TRMB,2021-03-30,0.000000,0.000000
159,TXN,TYL,2023-02-27,0.000000,-0.077625
160,ACN,HPE,2022-01-04,0.000000,0.000000


In [9]:
# Columns used as model inputs, in the order they are passed to the estimators.
# The feature-importance tables rely on this order to label each row.
features_names = [
    'same_sector_flag',
    'same_sub_industry_flag',
    'cos_sim',
    'corr_coef',
    'abs_spread_normed_max',
    'abs_spread_normed_90th',
    'abs_spread_normed_75th',
    'abs_spread_normed_median',
    'abs_spread_normed_l7_avg',
    'abs_spread_normed_l14_avg',
]

In [15]:
# Join features to labels on the (ticker1, ticker2, target_date) key. The inner join
# keeps only keys present in both tables.
combined = (
    features_tb
    .merge(labels_tb, on=['ticker1', 'ticker2', 'target_date'], how='inner')
    # Binary target: 1 when total_pnl is strictly positive, otherwise 0.
    # NOTE(review): the column is called "entry detected", but as written it flags
    # positive pnl only; rows with negative total_pnl get 0 — confirm this is intended.
    .assign(entry_detected_flag=lambda merged: merged['total_pnl'].gt(0).astype('int'))
)

In [16]:
# Summary statistics for the numeric columns of the merged table; the mean of
# entry_detected_flag is the share of rows labelled 1.
combined.describe()

Unnamed: 0,cos_sim,corr_coef,abs_spread_normed_max,abs_spread_normed_90th,abs_spread_normed_75th,abs_spread_normed_median,abs_spread_normed_l7_avg,abs_spread_normed_l14_avg,total_pnl,total_pnl_l28_mean_std,entry_detected_flag
count,20476.0,20476.0,20476.0,20476.0,20476.0,20476.0,20476.0,20476.0,20476.0,20476.0,20476.0
mean,0.980883,0.568039,2.478097,1.592114,1.183756,0.759731,1.212421,1.186605,0.040456,0.021435,0.466107
std,0.020456,0.359889,0.478235,0.122108,0.096608,0.112586,0.770792,0.736894,0.071007,0.068625,0.498862
min,0.694536,-0.827622,1.415789,0.822182,0.64322,0.291153,0.000329,3.7e-05,-0.183071,-0.14039,0.0
25%,0.976393,0.37855,2.151688,1.514297,1.134768,0.683027,0.597506,0.589719,0.0,0.0,0.0
50%,0.986709,0.688876,2.392692,1.590818,1.196107,0.762942,1.139417,1.12699,0.0,0.0,0.0
75%,0.99258,0.846721,2.725476,1.668712,1.248333,0.838553,1.721915,1.67939,0.062095,0.0,1.0
max,0.999382,0.988769,6.556147,2.340277,1.458642,1.101794,5.389695,4.52291,1.004704,1.050091,1.0


In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): this is a plain random row split. The head() preview shows the same
# ticker pair on several target_dates, so one pair can appear on both sides of the
# split — consider a grouped or time-based split if that counts as leakage; confirm.
X_train, X_test, y_train, y_test = train_test_split(
    combined[features_names],
    combined['entry_detected_flag'],
    test_size=0.2,
    random_state=42,
)

In [44]:
# Two baseline classifiers, both seeded with random_state=42.
clf = DecisionTreeClassifier(random_state=42)
log_reg = LogisticRegression(max_iter=200, random_state=42)

# Fit both on the same training split. The logistic regression fit stays last so the
# cell's displayed value is unchanged.
clf.fit(X_train, y_train)
log_reg.fit(X_train, y_train)

In [45]:
# Tabulate (not plot) the fitted tree's feature importances, one row per input
# column, labelled in the order given by features_names.
importances = clf.feature_importances_
feature_imp_tb_tree = pd.DataFrame({
    'features': features_names,
    'importances': importances,
})

In [46]:
# Tabulate (not plot) the logistic regression coefficients, one row per input column.
# coef_[0] takes the first row of the coefficient matrix.
# Note: `importances` is reused here and now holds coefficients, not tree importances.
importances = log_reg.coef_[0]
feature_imp_tb_lr = pd.DataFrame({
    'features': features_names,
    'coef': importances,
})

In [49]:
# Show the decision tree's feature-importance table.
feature_imp_tb_tree

Unnamed: 0,features,importances
0,same_sector_flag,0.0
1,same_sub_industry_flag,0.01238
2,cos_sim,0.132803
3,corr_coef,0.134713
4,abs_spread_normed_max,0.138055
5,abs_spread_normed_90th,0.119029
6,abs_spread_normed_75th,0.116854
7,abs_spread_normed_median,0.116636
8,abs_spread_normed_l7_avg,0.111104
9,abs_spread_normed_l14_avg,0.118426


In [50]:
# Show the logistic regression coefficient table.
# NOTE(review): the features are fed to the model unscaled (no scaler appears in the
# visible cells), so coefficient magnitudes are not directly comparable across features.
feature_imp_tb_lr

Unnamed: 0,features,coef
0,same_sector_flag,0.029602
1,same_sub_industry_flag,-0.159025
2,cos_sim,-1.031043
3,corr_coef,0.210748
4,abs_spread_normed_max,0.009809
5,abs_spread_normed_90th,0.093079
6,abs_spread_normed_75th,0.198902
7,abs_spread_normed_median,0.19477
8,abs_spread_normed_l7_avg,0.363212
9,abs_spread_normed_l14_avg,-0.2221


In [41]:
# Score the decision tree on the held-out split.
# NOTE(review): this cell's saved execution count is lower than the training cell's,
# so the printed metrics may come from an earlier fit — re-run top to bottom to confirm.
y_pred = clf.predict(X_test)

# Compute overall accuracy and the per-class precision / recall / F1 table up front,
# then print them in that order.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(report)

Accuracy: 0.55
              precision    recall  f1-score   support

           0       0.58      0.57      0.57      2177
           1       0.52      0.53      0.53      1919

    accuracy                           0.55      4096
   macro avg       0.55      0.55      0.55      4096
weighted avg       0.55      0.55      0.55      4096



In [57]:
# Share of positive labels in the test split. One minus this value is the accuracy of
# always predicting class 0, which is the baseline the model accuracies should be read against.
np.mean(y_test)

0.468505859375

In [43]:
# Score the logistic regression on the held-out split. This overwrites y_pred,
# accuracy and report from the decision tree evaluation.
y_pred = log_reg.predict(X_test)

# Compute overall accuracy and the per-class precision / recall / F1 table up front,
# then print them in that order.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(report)

Accuracy: 0.55
              precision    recall  f1-score   support

           0       0.55      0.85      0.67      2177
           1       0.56      0.21      0.31      1919

    accuracy                           0.55      4096
   macro avg       0.55      0.53      0.49      4096
weighted avg       0.55      0.55      0.50      4096

