In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from numpy import dot
from numpy.linalg import norm
import random
import warnings
warnings.filterwarnings('ignore')

def cos_sim(a,b):
    """
    Cosine similarity between two 1d vectors.

    a, b: 1d array-likes of the same length

    Returns dot(a, b) / (norm(a) * norm(b)). When either vector has zero
    length that ratio is 0/0, so 0.0 is returned instead of nan; a single
    nan would otherwise turn any average taken over many similarities
    into nan as well.
    """
    denom = norm(a)*norm(b)
    # `nan == 0` is False, so nan inputs still propagate as nan
    if denom == 0:
        return 0.0
    return dot(a, b)/denom

def score_pair(vec, past_successes, weights=None):
    """
    Score a pair by how closely its feature vector resembles past successes.

    vec: a 1d numpy array
    past_successes: a 2d numpy array, one row per past success
    weights: optional sequence with one weight per row of past_successes;
        None or an empty sequence means every row counts equally

    Returns the (weighted) average cosine similarity between vec and every
    row of past_successes, or nan when past_successes is empty. A row (or
    vec) of zero length contributes a similarity of 0 instead of nan.
    """
    vec = np.asarray(vec, dtype=float)
    past = np.asarray(past_successes, dtype=float)
    if past.size == 0:
        # nothing to compare against
        return np.nan
    # a single 1d history vector is treated as one past success
    past = np.atleast_2d(past)

    # calculate how similar is the current pair's historical patterns to past successes
    # (all rows at once instead of one Python-level call per row)
    denom = norm(past, axis=1)*norm(vec)
    # `!=` rather than `>` so that nan denominators still yield nan
    scores = np.divide(past @ vec, denom, out=np.zeros_like(denom), where=denom != 0)

    if weights is not None and len(weights) > 0:
        weights = np.asarray(weights, dtype=float)
        # re-scaling so the weights sum to 1; the weighted sum is then already
        # the weighted average and must not be divided by N a second time
        weights = weights/np.sum(weights)
        return np.sum(scores*weights)

    return np.mean(scores)

# Load and pre-process the data

In [2]:
df = pd.read_csv('../../Data/Training/exploration_190pairs_300_20.csv')
# df['pnls'] = df['pnls']*100.0
# Dates are compared and sorted as strings throughout this notebook
df = df.sort_values('Date', ascending=True)
# Treat infinities as missing values
df = df.replace([np.inf, -np.inf], np.nan)
# Turn the label into a real boolean column. Going through str makes this work
# both when the CSV column was read as text ('True'/'False') and when pandas
# already parsed it as bool; comparing a bool column to 'True' directly would
# silently give all-False.
df['both_legs_profited'] = df['both_legs_profited'].astype(str) == 'True'
# Rows without a pnl cannot be evaluated
df = df.dropna(subset=['pnls'])
print(df.shape)

(178030, 38)


In [3]:
# Inspect the column names of the loaded frame
df.columns

Index(['Date', 'Ticker_P1', 'Close_P1', 'Ticker_P2', 'Close_P2', 'High_P1',
       'High_P2', 'Low_P1', 'Low_P2', 'Volume_P1', 'Volume_P2', 'abs_spread',
       'same_sector_flag', 'same_sub_industry_flag', 'abs_spread_mean',
       'abs_spread_std', 'abs_spread_mean_l20', 'abs_spread_std_l20',
       'spread_normed', 'abs_spread_normed_max', 'abs_spread_normed_90th',
       'abs_spread_normed_75th', 'abs_spread_normed_median',
       'abs_spread_normed_l7_avg', 'abs_spread_normed_l14_avg', 'cos_sim',
       'corr_coef_l5', 'corr_coef_l10', 'corr_coef_l15', 'corr_coef_l20',
       'corr_coef_l40', 'corr_coef_l60', 'pnls', 'num_entries',
       'days_till_first_entry', 'both_legs_profited', 'SPY_return',
       'successful_pair_trading'],
      dtype='object')

In [8]:
# Model inputs: the rolling correlation coefficients plus the two sector flags.
# A wider set tried earlier additionally contained 'cos_sim' and the
# 'abs_spread_normed_*' columns (max, 90th, 75th, median, l7_avg, l14_avg).
features_names = [
    'corr_coef_l5',
    'corr_coef_l10',
    'corr_coef_l15',
    'corr_coef_l20',
    'corr_coef_l40',
    'corr_coef_l60',
    'same_sector_flag',
    'same_sub_industry_flag',
]

# Target column; 'successful_pair_trading' is the alternative label
label = 'both_legs_profited'

# Drop rows with a missing feature and report the shape before and after
shape_before = df.shape
df = df.dropna(subset=features_names)
print(shape_before)
print(df.shape)

(178030, 39)
(178030, 39)


# Playground

In [22]:
# Hold out a single day (the latest date) as the test set
is_test_day = df['Date'].eq('2019-09-23')
test_data = df.loc[is_test_day].reset_index(drop=True)

In [23]:
# Training rows should end 20 trading days before the test date; for ease of
# compute everything more than a month back is taken instead
is_history = df['Date'].lt('2019-08-23')
train_data = df.loc[is_history].reset_index(drop=True)

In [24]:
# Keep only the historical rows whose label is True, i.e. the past successes
all_successes = train_data.loc[train_data[label]]

# TODO: weights for the past successes are not defined yet

In [25]:
# Feature matrices: past successes act as the reference set, the held-out day is scored
X_train = all_successes[features_names]
X_test = test_data[features_names]
# Explicit copy: a 'scores' column is added to y_test later on, and that must
# not be an assignment onto a slice of test_data
y_test = test_data[[label]].copy()

In [26]:
# Fit the scaler on the past successes only, then apply that same
# transformation to the test rows so both sets share one feature space
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
# required by the scoring cell further down, which iterates over scaled_X_test
scaled_X_test = scaler.transform(X_test)

In [31]:
# Does the i-th test pair have any successful trades of its own in the history?
i = 0
pair_past_success_tb = all_successes.loc[
    all_successes.Ticker_P1.eq(test_data.Ticker_P1.values[i])
    & all_successes.Ticker_P2.eq(test_data.Ticker_P2.values[i])
]
len(pair_past_success_tb) > 0

119

In [None]:
# For each test pair, look up that same pair's own successful trades in the
# history and record whether it has any.
# TODO(review): this cell was left unfinished (empty filter, dangling `if`) and
# could not run; the per-pair check below mirrors the single-pair lookup in the
# previous cell -- confirm this is the intended handling.
pair_has_past_success = []
for i in range(test_data.shape[0]):
    pair_past_success_tb = all_successes[
        (all_successes.Ticker_P1==test_data.Ticker_P1.values[i])&(all_successes.Ticker_P2==test_data.Ticker_P2.values[i])
    ]
    pair_has_past_success.append(pair_past_success_tb.shape[0] > 0)

In [None]:
# Score every scaled test row against the scaled past successes
scores = [score_pair(test_vec, scaled_X_train) for test_vec in scaled_X_test]

In [None]:
# Store the similarity scores in a new 'scores' column of y_test, next to the label column
y_test['scores'] = scores

In [None]:
# Count the test rows whose label is True
total_positives = int(y_test[label].sum())
print(total_positives)

In [None]:
# Highest scores: share of true positives among the k best-scored pairs
ranked_desc = y_test.sort_values('scores', ascending=False)
pat1, pat3, pat5 = (ranked_desc.head(k)[label].mean() for k in (1, 3, 5))

print(f'Number of positive pairs: {total_positives}')
print(f'Pct of positive pairs among all possible pairs: {np.mean(y_test[label])}')
print(f'Precision @ 1: {pat1}')
print(f'Precision @ 3: {pat3}')
print(f'Precision @ 5: {pat5}')

In [None]:
# Lowest scores for sanity check: the same metric on the k worst-scored pairs
ranked_asc = y_test.sort_values('scores', ascending=True)
pat1, pat3, pat5 = (ranked_asc.head(k)[label].mean() for k in (1, 3, 5))

print(f'Precision @ 1: {pat1}')
print(f'Precision @ 3: {pat3}')
print(f'Precision @ 5: {pat5}')

# Pick test dates and score the method

In [None]:
# Settings for drawing random test dates: every date after the first 30 is a
# candidate. Neither name is used by the evaluation loop in this cell.
n_test_dates = 10
all_test_dates = df['Date'].drop_duplicates().sort_values(ascending=True).values[30:]

# test on the latest 60 days
test_dates = df['Date'].drop_duplicates().sort_values(ascending=False).head(60).values

# First value of each train_test_date_pairs tuple is the threshold date: only
# rows strictly before it are used as training data (30 calendar days before
# the test date). The second value is the test date itself.
train_test_date_pairs = [
    ((datetime.strptime(x, '%Y-%m-%d') - timedelta(days=30)).strftime('%Y-%m-%d'),x) for x in test_dates
]

target_date = [x[1] for x in train_test_date_pairs]
all_pat1 = []
all_pat3 = []
all_pat5 = []

for tup in train_test_date_pairs:
    # test data is the single test date
    test_data  = df[df['Date']==tup[1]]
    # train data should end 20 trading days earlier. For ease of compute just took anything more than a month back
    train_data = df[df['Date']<tup[0]]
    # get all the success pair tradings from history
    all_successes = train_data[train_data[label]]

    # Get the features
    X_train = all_successes[features_names]
    X_test = test_data[features_names]
    # explicit copy: a 'scores' column is added below and must not be an
    # assignment onto a slice of df
    y_test = test_data[[label]].copy()

    # Scaling: fitted on the past successes, applied to both sets
    scaler = StandardScaler().fit(X_train)
    scaled_X_train = scaler.transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    scores = [score_pair(x, scaled_X_train) for x in scaled_X_test]

    y_test['scores'] = scores
    # rank once, then read precision @ k off the top of the ranking
    ranked = y_test.sort_values('scores',ascending=False)
    pat1 = ranked.head(1)[label].mean()
    pat3 = ranked.head(3)[label].mean()
    pat5 = ranked.head(5)[label].mean()
    all_pat1.append(pat1)
    all_pat3.append(pat3)
    all_pat5.append(pat5)

# Print all three averages; as bare trailing expressions only the last one
# would be shown by the notebook
print(f'Mean precision @ 1: {np.mean(all_pat1)}')
print(f'Mean precision @ 3: {np.mean(all_pat3)}')
print(f'Mean precision @ 5: {np.mean(all_pat5)}')