# Imports

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.inspection import permutation_importance
import ta

from tqdm.notebook import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.metrics import f1_score
from imblearn.under_sampling import RandomUnderSampler
pd.set_option('display.float_format', lambda x: '%.3f' % x)
np.set_printoptions(suppress=True)
from pprint import pprint

import gc
import optuna
from sklearn.feature_selection import RFE

In [None]:
df = pd.read_csv('../../Data/Training/pair_features_1_pairs500_300_120.csv')
df.head()

In [None]:
total_days = df.Date.nunique()
total_days

In [None]:
df.pnls.describe()

In [None]:
class feature_engineering(object):
    """Adds single-ticker technical-indicator columns using the ``ta`` package."""

    def __init__(self):
        super().__init__()

    def generate_technical_indicator(self, data_df):
        '''
        Return a copy of ``data_df`` with technical-indicator columns appended.

        All indicators are computed with the ``ta`` package (plus plain pandas
        for the percentage return and the simple moving averages). The input
        frame is copied first, so the caller's DataFrame is left untouched.

        Input:
        data_df -- DataFrame of one stock's price history. Must contain the
                   columns 'Close', 'High', 'Low' and 'Volume'. Rows are
                   processed in their existing order, so the caller is
                   expected to pass them sorted chronologically.

        Output:
        A new DataFrame with every original column plus: 'rsi', 'kama',
        'adi', 'vpt', 'atr', 'bb_ma', 'adx', 'ema', 'macd', 'dlr',
        'daily_returns', '50 Days Average' and '200 Days Average'.
        '''
        # Copy so the indicator columns are not written into the caller's frame.
        df = data_df.copy()

        close = df['Close']
        high = df['High']
        low = df['Low']
        volume = df['Volume']

        # 1. Momentum indicators
        # Relative Strength Index
        df['rsi'] = ta.momentum.rsi(close, window=14)
        # Kaufman's Adaptive Moving Average (KAMA)
        df['kama'] = ta.momentum.kama(close, window=14)

        # 2. Volume indicators
        # Accumulation/Distribution Index (ADI)
        df['adi'] = ta.volume.acc_dist_index(high, low, close, volume)
        # Volume-price trend (VPT)
        df['vpt'] = ta.volume.volume_price_trend(close, volume)

        # 3. Volatility indicators
        # Average True Range (ATR)
        df['atr'] = ta.volatility.average_true_range(high, low, close, window=14)
        # Bollinger Bands N-period simple moving average
        df['bb_ma'] = ta.volatility.bollinger_mavg(close, window=20)

        # 4. Trend indicators
        # Average Directional Movement Index (ADX)
        df['adx'] = ta.trend.adx(high, low, close, window=14)
        # Exponential Moving Average
        df['ema'] = ta.trend.ema_indicator(close, window=14)
        # Moving Average Convergence Divergence (MACD)
        df['macd'] = ta.trend.macd(close, window_fast=14, window_slow=30)

        # 5. Other indicators
        # Daily Log Return (DLR)
        df['dlr'] = ta.others.daily_log_return(close)
        # Simple percentage return between consecutive rows
        df['daily_returns'] = close.pct_change()

        # Simple moving averages of the close over 50 and 200 rows
        for avg in (50, 200):
            col_name = str(avg) + ' Days Average'
            df[col_name] = close.rolling(window=avg, center=False).mean()

        return df

In [None]:
pd.to_datetime(df['Date']).dt.date

In [None]:
# Attach the SPY adjusted close to every row of df as the market series
# ('SPY_Close'). Both Date columns are converted to datetime.date objects so
# the join keys compare equal.
spy_df = (
    pd.read_csv('../../Data/Training/1999-12-01-2023-12-31_SPY.csv')
    .loc[:, ['Date', 'Adj Close']]
    .rename(columns={'Adj Close': 'SPY_Close'})
)
spy_df['Date'] = pd.to_datetime(spy_df['Date']).dt.date
df['Date'] = pd.to_datetime(df['Date']).dt.date

# Left join keeps every pair row even when SPY has no quote for that date.
df = df.merge(spy_df, on='Date', how='left')

In [None]:
fe_obj = feature_engineering()

### Beta

In [None]:
# A ticker can appear as leg 1 of one pair and leg 2 of another. Concatenating
# the two unique lists would list it twice and make the per-ticker feature
# loop recompute identical results, so de-duplicate while keeping first-seen
# order.
all_tickers = list(dict.fromkeys(
    df['Ticker_P1'].unique().tolist() + df['Ticker_P2'].unique().tolist()
))
len(all_tickers)

In [None]:
def compute_beta(rs,df,ticker,market):
    """Return the beta of ``ticker`` against ``market`` over one window.

    Intended as the callback of ``Series.rolling(...).apply`` with the default
    ``raw=False``, so that ``rs`` arrives as a Series carrying its index.

    Parameters:
    rs     -- the current rolling window; only its index is used, to select
              the matching rows of ``df``.
    df     -- DataFrame holding both price columns, indexed like ``rs``.
    ticker -- name of the security's price column in ``df``.
    market -- name of the market/benchmark price column in ``df``.

    Returns:
    cov(log-returns of ticker, log-returns of market) / var(log-returns of
    market), computed only on the rows of the window. The result is NaN when
    the window has too few rows, and NaN or +/-inf when the market variance
    is zero.
    """
    # Restrict to the window. The previous version looked these rows up but
    # then used the whole frame, giving every window the same full-sample beta.
    rows = df.loc[rs.index, [ticker, market]]
    log_returns = np.log(rows / rows.shift(1))

    # The 250-day annualisation factor is omitted because it would multiply
    # both numerator and denominator and cancel.
    cov_with_market = log_returns[ticker].cov(log_returns[market])
    market_var = log_returns[market].var()
    return cov_with_market / market_var

In [None]:
# Build one frame per ticker holding its price history, a 300-row rolling beta
# against SPY and the technical indicators, then collect the frames.
all_tickers_df_list = []

# Computed once: which tickers have rows on leg 1. The previous version
# re-ran df['Ticker_P1'].unique() twice for every ticker.
p1_tickers = set(df['Ticker_P1'].unique())
generic_cols = ['Date','Ticker','Close','High','Low','Volume','SPY_Close']

for t in tqdm(all_tickers):
    # Take the ticker's rows from leg 1 when it appears there, else from leg 2.
    leg = '_P1' if t in p1_tickers else '_P2'
    leg_cols = ['Date','Ticker'+leg,'Close'+leg,'High'+leg,'Low'+leg,'Volume'+leg,'SPY_Close']

    single_ticker_df = df.loc[df['Ticker'+leg]==t, leg_cols].sort_values('Date')
    single_ticker_df.columns = generic_cols
    # The ticker takes part in several pairs, so each date shows up repeatedly.
    single_ticker_df = single_ticker_df.drop_duplicates()

    # compute_beta receives each 300-row window and looks the rows up by index.
    single_ticker_df['rolling_beta'] = single_ticker_df['Close'].rolling(300).progress_apply(
        compute_beta, args=(single_ticker_df,'Close','SPY_Close'))

    single_ticker_df_with_technical_indicators = fe_obj.generate_technical_indicator(single_ticker_df)

    all_tickers_df_list.append(single_ticker_df_with_technical_indicators)


### Single Ticker Features

In [None]:
df.columns

In [None]:
df.sort_values('Date')

In [None]:
all_tickers_df = pd.concat(all_tickers_df_list,axis=0,ignore_index=True).reset_index(drop=True)
all_tickers_df.shape

In [None]:
# Join the per-ticker feature frame onto the pair frame twice: once keyed on
# leg 1 (Ticker_P1) and once on leg 2 (Ticker_P2). The shape is printed after
# every step to make row/column growth visible.
print(df.shape)
# Two copies of the per-ticker frame with every non-key column suffixed
# _P1 / _P2.
# NOTE(review): the positional rename assumes 'Date' and 'Ticker' are the first
# two columns of all_tickers_df, in that order - confirm if the frame changes.
all_tickers_df_P1_suffix = all_tickers_df.copy()
all_tickers_df_P1_suffix.columns = ['Date','Ticker']+[c+'_P1' for c in all_tickers_df.columns if c not in ['Date','Ticker']]
all_tickers_df_P2_suffix = all_tickers_df.copy()
all_tickers_df_P2_suffix.columns = ['Date','Ticker']+[c+'_P2' for c in all_tickers_df.columns if c not in ['Date','Ticker']]

# NOTE(review): df already carries Close_P1/High_P1/Low_P1/Volume_P1, so the
# identically named right-hand columns presumably pick up the merge suffix as
# well (e.g. 'Close_P1_P1') and survive as redundant copies - confirm whether
# they are wanted as model features.
# drop_duplicates() removes the repeated rows produced when the right-hand
# frame contains the same (Date, Ticker) more than once.
df = pd.merge(df,all_tickers_df_P1_suffix,left_on=['Date','Ticker_P1'],right_on=['Date','Ticker'],\
              how='left',suffixes=['','_P1']).drop_duplicates()
print(df.shape)
df = pd.merge(df,all_tickers_df_P2_suffix,left_on=['Date','Ticker_P2'],right_on=['Date','Ticker'],\
              how='left',suffixes=['','_P2']).drop_duplicates()
print(df.shape)
# Keep only the first occurrence of any column name that now appears twice.
df = df.loc[:,~df.columns.duplicated()]
print(df.shape)


In [None]:
# for i in [1,20,30,60,90]:
#     for c in ['num_entries','pnls']:
#         df['PREV_'+str(i)+'_'+c] = df.groupby(['Ticker_P1','Ticker_P2'])[c].shift(20+i)

In [None]:
# for c in ['num_entries','pnls']:
#     df['PREV_'+c+'_mean'] = df[['PREV_1_'+c,'PREV_20_'+c,'PREV_30_'+c,'PREV_60_'+c,'PREV_90_'+c]].mean(axis=1)

In [None]:
# Technical indicators of the pair spread: the same indicator families used
# for single tickers, applied to df['abs_spread'] instead of a close price.
# Indicators that need High/Low prices (ADI, ATR, ADX) stay commented out.
#
# NOTE(review): every indicator below treats df['abs_spread'] as one continuous
# series in the frame's current row order. df holds many pairs (other cells
# group it by ['Ticker_P1','Ticker_P2']), so the windows presumably mix values
# from different pairs. Computing per pair would fix that, but pairs with fewer
# than 200 rows would then get all-NaN long averages - confirm the data layout
# before changing.
df['rsi_abs_spread'] = ta.momentum.rsi(df['abs_spread'], window=14)
# Kaufman’s Adaptive Moving Average (KAMA)
df['kama_abs_spread'] = ta.momentum.kama(df['abs_spread'],window=14)

# 2. Volume Indicators
# Accumulation/Distribution Index (ADI)
# df['adi'] = ta.volume.acc_dist_index(df['High'], df['Low'], df['Close'], df['Volume'])

# Volume-price trend (VPT)
# The "volume" of the spread is taken as log(Volume_P1) + log(Volume_P2).
df['vpt_abs_spread'] = ta.volume.volume_price_trend(df['abs_spread'], np.log(df['Volume_P1'])+np.log(df['Volume_P2']))

# 3. Volatility Indicators
# Average True Range (ATR)
# df['atr'] = ta.volatility.average_true_range(df['High'], df['Low'],df['Close'], window=14)

# Bollinger Bands (BB) N-period simple moving average (MA)
df['bb_ma_abs_spread'] = ta.volatility.bollinger_mavg(df['abs_spread'], window=20)

# 4. Trend Indicators
# Average Directional Movement Index (ADX)
# df['adx'] = ta.trend.adx(df['High'], df['Low'], df['Close'], window=14)

# Exponential Moving Average
df['ema_abs_spread'] = ta.trend.ema_indicator(df['abs_spread'], window=14)

# Moving Average Convergence Divergence (MACD)
df['macd_abs_spread'] = ta.trend.macd(df['abs_spread'], window_fast=14, window_slow=30)

# 5. Other Indicators
# Daily Log Return (DLR)
df['dlr_abs_spread'] = ta.others.daily_log_return(df['abs_spread'])

# Daily Returns
df['daily_returns_abs_spread'] = df['abs_spread'].pct_change()

# Moving Averages
# Simple rolling means over 50 and 200 rows.
averages = [50,200]
for avg in averages:
    col_name = str(avg) +' Days Average abs_spread'
    df[col_name] = df['abs_spread'].rolling(window = avg, center = False).mean()

In [None]:
# Leg-to-leg difference of every single-ticker feature:
#   DIFF_<feature> = <feature>_P1 - <feature>_P2
# The sector_* aggregates (sector_rolling_beta_mean/std,
# sector_daily_return_mean/std) are not part of the list at the moment.
single_ticker_features = [
    'Close', 'High', 'Low', 'Volume', 'rolling_beta', 'rsi', 'kama',
    'adi', 'vpt', 'atr', 'bb_ma', 'adx', 'ema', 'macd', 'dlr',
    'daily_returns', '50 Days Average', '200 Days Average',
]
for feature in single_ticker_features:
    leg1 = df[feature + '_P1']
    leg2 = df[feature + '_P2']
    df['DIFF_' + feature] = leg1 - leg2

In [None]:
# # df['PCT_abs_spread_mean_l28_300'] = (df['abs_spread_mean_l28'] - df['abs_spread_mean'])/df['abs_spread_mean']
# df['PCT_abs_spread_normed_median_l7'] = (df['abs_spread_normed_median'] - df['abs_spread_normed_l7_avg'])/df['abs_spread_normed_median']
# df['PCT_abs_spread_normed_median_l4'] = (df['abs_spread_normed_median'] - df['abs_spread_normed_l14_avg'])/df['abs_spread_normed_median']

In [None]:
# Relative change of each leg's rolling beta versus its value 20 rows earlier
# within the same ticker group, and the difference of the two legs' changes.
# NOTE(review): shift(20) counts rows of the ticker group, not calendar days;
# a ticker that sits in several pairs has several rows per date - confirm the
# intended lag.
for leg in ('P1', 'P2'):
    beta_col = 'rolling_beta_' + leg
    lagged_beta = df.groupby('Ticker_' + leg)[beta_col].shift(20)
    df['PCT_rolling_beta_' + leg] = (df[beta_col] - lagged_beta) / lagged_beta

df['DIFF_secondary_rolling_beta'] = df['PCT_rolling_beta_P1'] - df['PCT_rolling_beta_P2']

In [None]:
stock2vec = pd.read_csv('../../Data/Training/stock2vec.csv')
stock2vec.head()

In [None]:
# Attach each leg's stock2vec embedding. add_suffix() returns a renamed copy
# ('Ticker' -> 'Ticker_P1', 'STOCK2VEC_0' -> 'STOCK2VEC_0_P1', ...), so
# stock2vec itself keeps its original column names. The previous in-place
# renaming stacked suffixes when the cell was run again, and its second rename
# relied on stripping exactly three characters added by the first.
df = pd.merge(df, stock2vec.add_suffix('_P1'), on='Ticker_P1', how='left')
df = pd.merge(df, stock2vec.add_suffix('_P2'), on='Ticker_P2', how='left')
df.tail(2)

In [None]:
from numpy.linalg import norm

In [None]:
# Can be a lot faster if we do it on pair level first then merge.

# Cosine similarity between the two legs' 32-dimensional stock2vec embeddings,
# one value per row of df (positional order). Computed as whole-array NumPy
# operations rather than a Python loop over rows. Rows with a missing
# embedding yield NaN, as a per-row dot product would.
vec1_sub1 = df[['STOCK2VEC_'+ str(i) + '_P1' for i in range(0,32)]]
vec2_sub1 = df[['STOCK2VEC_'+ str(i) + '_P2' for i in range(0,32)]]

p1_vectors = vec1_sub1.to_numpy(dtype=float)
p2_vectors = vec2_sub1.to_numpy(dtype=float)
cs = (p1_vectors * p2_vectors).sum(axis=1) / (
    np.linalg.norm(p1_vectors, axis=1) * np.linalg.norm(p2_vectors, axis=1)
)

In [None]:
df['stock2vec_cos_sim'] = cs

In [None]:
df = df.sort_values('Date')

In [None]:
# Entry label: 1 when the average spread over the next 1-3 rows of the same
# pair exceeds abs_spread_mean + 1.5 * abs_spread_std, else 0.
# The horizon columns are collected by name. Selecting them with a substring
# match on 'FUTURE_abs_spread_' would also pick up 'FUTURE_abs_spread_avg' and
# any other horizon columns present when the cell is run again.
entry_future_cols = []
for i in range(1,4):
    future_col = 'FUTURE_abs_spread_'+str(i)
    df[future_col] = df.groupby(['Ticker_P1','Ticker_P2'])['abs_spread'].shift(-1*i)
    entry_future_cols.append(future_col)

# mean(axis=1) skips NaN, so rows near the end of a pair average fewer values.
df['FUTURE_abs_spread_avg'] = df[entry_future_cols].mean(axis=1)

# Rows with no future value at all compare as False and are labelled 0.
df['entry_label1'] = np.where((df['FUTURE_abs_spread_avg'] - (df['abs_spread_mean']+1.5*df['abs_spread_std']))>0,1,0)

df['entry_label1'].value_counts()

In [None]:

# Exit label: 1 when the average spread 9-11 rows ahead (same pair) is below
# abs_spread_mean + 0.1 * abs_spread_std, else 0.
# Only the horizons created here are averaged. A substring match on
# 'FUTURE_abs_spread_' would also pull in horizon columns created by other
# cells and any existing 'FUTURE_abs_spread_avg' column, mixing them into the
# exit average.
exit_future_cols = []
for i in range(9,12):
    future_col = 'FUTURE_abs_spread_'+str(i)
    df[future_col] = df.groupby(['Ticker_P1','Ticker_P2'])['abs_spread'].shift(-1*i)
    exit_future_cols.append(future_col)

# mean(axis=1) skips NaN, so rows near the end of a pair average fewer values.
df['FUTURE_abs_spread_avg'] = df[exit_future_cols].mean(axis=1)

# Rows with no future value at all compare as False and are labelled 0.
df['exit_label1'] = np.where((df['FUTURE_abs_spread_avg'] - (df['abs_spread_mean']+0.1*df['abs_spread_std']))<0,1,0)

df['exit_label1'].value_counts()

In [None]:
df = df.drop([c for c in df.columns if 'FUTURE_abs_spread_' in c]+['FUTURE_abs_spread_avg'],axis=1)

In [None]:
print(df.shape)
df = df.drop('vpt_abs_spread',axis=1)
print(df.shape)


In [None]:
df.replace([np.inf, -np.inf], np.nan).isna().sum().sort_values(ascending=False)

In [None]:
# stop

In [None]:
df['abs_spread_log_mean_MA'] = np.log(df['abs_spread_mean_MA'])
df['abs_spread_log_std_mean_MA'] = np.log(df['abs_spread_std_MA'])

In [None]:
print(df.shape)
df = df.replace([np.inf, -np.inf], np.nan).dropna()
df.shape

In [None]:
df.Date.max()

In [None]:
# X['abs_spread_log_mean_MA'] = np.log(X['abs_spread_mean_MA'])
# X['abs_spread_log_std_mean_MA'] = np.log(X['abs_spread_std_MA'])
# X = X.drop(['abs_spread_mean_MA', 'abs_spread_std_MA'],axis=1)
# X

In [None]:
df.to_csv('../../Outputs/data_pipeline_output.csv', index=False)

# Entry model

In [None]:
# Feature matrix X and target y for the entry classifier.
label = 'entry_label1'
# Optional extra exclusions; candidates tried before:
# ['High_P1', 'High_P2', 'Low_P1', 'Low_P2', 'abs_spread', 'abs_spread_mean', 'abs_spread_std']
features_to_exclude = []
# Outcome columns and the other model's label must never be used as features.
always_exclude = ['pnls', 'actual_abs_spread_std','actual_abs_spread','exit_label1']

y = df[label]

# Identifiers, raw volumes, the benchmark price and the target itself.
non_feature_cols = ['Date', 'Ticker_P1', 'Ticker_P2', 'Volume_P1', 'Volume_P2', 'SPY_Close', 'Ticker', label]
# Raw embedding components of both legs.
embedding_cols = ['STOCK2VEC_' + str(i) + leg for leg in ('_P1', '_P2') for i in range(0, 32)]

X = df.drop(non_feature_cols + embedding_cols + always_exclude + features_to_exclude, axis=1)

In [None]:
X = X.drop(['abs_spread_mean_MA', 'abs_spread_std_MA'],axis=1)
X

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42,shuffle=False)

In [None]:
print(df.loc[X_train.index].Date.min(),df.loc[X_train.index].Date.max())

In [None]:
print(df.loc[X_val.index].Date.min(),df.loc[X_val.index].Date.max())

In [None]:
print(df.loc[X_test.index].Date.min(),df.loc[X_test.index].Date.max())

In [None]:
# Balance the classes by randomly dropping majority-class rows. A fixed seed
# makes the sampled rows, and everything trained on them, reproducible.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_train, y_train = undersample.fit_resample(X_train, y_train)
# NOTE(review): the validation split is balanced too, so validation scores
# describe a 50/50 class mix rather than the natural one - confirm intended.
X_val, y_val = undersample.fit_resample(X_val, y_val)

In [None]:
def objective(trial):
    """Optuna objective: negative weighted F1 of a LightGBM classifier.

    Fits on the module-level X_train / y_train and scores on X_val / y_val.
    The F1 is negated because the study minimises the returned value.
    """
    fixed_params = {
        'boosting':'dart',
        "objective": "binary",
        "n_estimators": 300,
        "verbosity": -1,
        "bagging_freq": 1,
    }
    # Search space; suggestions are requested in the same order on every trial.
    searched_params = {
        'max_bin': trial.suggest_int('max_bin',64,1024),
        'max_depth': trial.suggest_int('max_depth',4,20),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 3000),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 1000),
    }

    classifier = lgb.LGBMClassifier(**fixed_params, **searched_params)
    classifier.fit(X_train, y_train)
    val_predictions = classifier.predict(X_val)
    return -1 * f1_score(y_val, val_predictions, average='weighted')

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1)

In [None]:
print('Best hyperparameters:', study.best_params)
print('Best F1:', study.best_value*(-1))

In [None]:
hyper_params = study.best_params

In [None]:
# Final parameters = best searched values + the fixed settings used during
# tuning.
hyper_params = dict(study.best_params)
hyper_params['boosting']='dart'
hyper_params["objective"] = "binary"
hyper_params["n_estimators"] = 300
# The tuning objective ran with bagging_freq=1. LightGBM only applies
# `subsample` (bagging_fraction) when bagging_freq is non-zero, so without
# this line the tuned subsample value would have no effect here.
hyper_params["bagging_freq"] = 1

In [None]:
y_train.value_counts()/len(y_train)

In [None]:
# Train the entry classifier with the final hyper-parameters and log the
# validation metric after every boosting round. Early stopping is left
# disabled (commented out), so all n_estimators rounds are trained.
gbm = lgb.LGBMClassifier(**hyper_params)
callbacks = [\
#             lgb.early_stopping(stopping_rounds=10, verbose=1,min_delta=0.001,first_metric_only=False), 
             lgb.log_evaluation(period=1)]

# NOTE(review): 'accuracy' does not look like one of LightGBM's built-in metric
# names (the binary error rate is 'binary_error') - confirm what is logged.
gbm.fit(X_train,y_train, 
        eval_set=[(X_val, y_val)],
        eval_metric='accuracy',
        callbacks=callbacks,
        feature_name = X_train.columns.tolist(),
#         categorical_feature=categorical_features
       )

In [None]:
entry_preds = gbm.predict(X_test)

In [None]:
gbm.feature_names = X_train.columns

In [None]:
import pickle
with open('../../Outputs/entry_model_latest.pkl','wb') as file:
    pickle.dump(gbm, file)

# Exit model

In [None]:
# Feature matrix X and target y for the exit classifier.
label = 'exit_label1'
# Optional extra exclusions; candidates tried before:
# ['High_P1', 'High_P2', 'Low_P1', 'Low_P2', 'abs_spread', 'abs_spread_mean', 'abs_spread_std']
features_to_exclude = []
# Outcome columns and the other model's label must never be used as features.
always_exclude = ['pnls', 'actual_abs_spread_std','actual_abs_spread','entry_label1']

y = df[label]

# Identifiers, raw volumes, the benchmark price and the target itself.
non_feature_cols = ['Date', 'Ticker_P1', 'Ticker_P2', 'Volume_P1', 'Volume_P2', 'SPY_Close', 'Ticker', label]
# Raw embedding components of both legs.
embedding_cols = ['STOCK2VEC_' + str(i) + leg for leg in ('_P1', '_P2') for i in range(0, 32)]

X = df.drop(non_feature_cols + embedding_cols + always_exclude + features_to_exclude, axis=1)

In [None]:
X = X.drop(['abs_spread_mean_MA', 'abs_spread_std_MA'],axis=1)
X

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42,shuffle=False)

In [None]:
print(df.loc[X_train.index].Date.min(),df.loc[X_train.index].Date.max())

In [None]:
print(df.loc[X_val.index].Date.min(),df.loc[X_val.index].Date.max())

In [None]:
print(df.loc[X_test.index].Date.min(),df.loc[X_test.index].Date.max())

In [None]:
# Balance the classes by randomly dropping majority-class rows. A fixed seed
# makes the sampled rows, and everything trained on them, reproducible.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=42)
X_train, y_train = undersample.fit_resample(X_train, y_train)
# NOTE(review): the validation split is balanced too, so validation scores
# describe a 50/50 class mix rather than the natural one - confirm intended.
X_val, y_val = undersample.fit_resample(X_val, y_val)

In [None]:
def objective(trial):
    """Optuna objective: negative weighted F1 of a LightGBM classifier.

    Fits on the module-level X_train / y_train and scores on X_val / y_val.
    The F1 is negated because the study minimises the returned value.
    """
    fixed_params = {
        'boosting':'dart',
        "objective": "binary",
        "n_estimators": 300,
        "verbosity": -1,
        "bagging_freq": 1,
    }
    # Search space; suggestions are requested in the same order on every trial.
    searched_params = {
        'max_bin': trial.suggest_int('max_bin',64,1024),
        'max_depth': trial.suggest_int('max_depth',4,20),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 3000),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 1000),
    }

    classifier = lgb.LGBMClassifier(**fixed_params, **searched_params)
    classifier.fit(X_train, y_train)
    val_predictions = classifier.predict(X_val)
    return -1 * f1_score(y_val, val_predictions, average='weighted')

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1)

In [None]:
print('Best hyperparameters:', study.best_params)
print('Best F1:', study.best_value*(-1))

In [None]:
hyper_params = study.best_params

In [None]:
# Final parameters = best searched values + the fixed settings used during
# tuning.
hyper_params = dict(study.best_params)
hyper_params['boosting']='dart'
hyper_params["objective"] = "binary"
hyper_params["n_estimators"] = 300
# The tuning objective ran with bagging_freq=1. LightGBM only applies
# `subsample` (bagging_fraction) when bagging_freq is non-zero, so without
# this line the tuned subsample value would have no effect here.
hyper_params["bagging_freq"] = 1

In [None]:
y_train.value_counts()/len(y_train)

In [None]:
# Train the exit classifier with the final hyper-parameters and log the
# validation metric after every boosting round. Early stopping is left
# disabled (commented out), so all n_estimators rounds are trained.
gbm_exit = lgb.LGBMClassifier(**hyper_params)
callbacks = [\
#             lgb.early_stopping(stopping_rounds=10, verbose=1,min_delta=0.001,first_metric_only=False), 
             lgb.log_evaluation(period=1)]

# NOTE(review): 'accuracy' does not look like one of LightGBM's built-in metric
# names (the binary error rate is 'binary_error') - confirm what is logged.
gbm_exit.fit(X_train,y_train, 
        eval_set=[(X_val, y_val)],
        eval_metric='accuracy',
        callbacks=callbacks,
        feature_name = X_train.columns.tolist(),
#         categorical_feature=categorical_features
       )

In [None]:
gbm_exit.feature_names = X_train.columns
with open('../../Outputs/exit_model_latest.pkl','wb') as file:
    pickle.dump(gbm_exit, file)

In [None]:
exit_preds = gbm_exit.predict(X_test)

# Spread model

In [None]:
# Feature matrix X and target y for the spread regressor.
label = 'actual_abs_spread'
# Optional extra exclusions; candidates tried before:
# ['High_P1', 'High_P2', 'Low_P1', 'Low_P2', 'abs_spread', 'abs_spread_mean', 'abs_spread_std']
features_to_exclude = []
# Outcome columns and both classifier labels must never be used as features.
always_exclude = ['pnls', 'actual_abs_spread_std','entry_label1','exit_label1']

y = df[label]

# Identifiers, raw volumes, the benchmark price and the target itself.
non_feature_cols = ['Date', 'Ticker_P1', 'Ticker_P2', 'Volume_P1', 'Volume_P2', 'SPY_Close', 'Ticker', label]
# Raw embedding components of both legs.
embedding_cols = ['STOCK2VEC_' + str(i) + leg for leg in ('_P1', '_P2') for i in range(0, 32)]

X = df.drop(non_feature_cols + embedding_cols + always_exclude + features_to_exclude, axis=1)

In [None]:
X['abs_spread_log_mean_MA'] = np.log(X['abs_spread_mean_MA'])
X['abs_spread_log_std_mean_MA'] = np.log(X['abs_spread_std_MA'])
X = X.drop(['abs_spread_mean_MA', 'abs_spread_std_MA'],axis=1)
X

In [None]:
y = np.log(y)
y.describe()

In [None]:
y = y[X.index]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=False)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42,shuffle=False)

In [None]:
def objective(trial):
    """Optuna objective: validation mean absolute error of a LightGBM regressor.

    Fits on the module-level X_train / y_train and scores on X_val / y_val.
    The study minimises the returned value.
    """
    params = {
        'boosting':'goss',
        "objective": "mae",
#         'device':'gpu',
        "metric": 'l2',
        "n_estimators": 300,
        "verbosity": -1,
        'max_bin':trial.suggest_int('max_bin',64,1024),
        'max_depth':trial.suggest_int('max_depth',4,30),

        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 3000),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 1000),
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train,y_train)
    predictions = model.predict(X_val)
    # Only the MAE is optimised; the unused MSE computation was removed.
    mae = np.mean(abs(predictions-y_val))

    return mae

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

In [None]:
print('Best hyperparameters:', study.best_params)
print('Best mae:', study.best_value)

In [None]:
hyper_params = study.best_params
hyper_params['boosting']='goss'
hyper_params["objective"] = "mae"
# hyper_params["metric"] = 'accuracy'
#         'class_weight':'balanced',
hyper_params["n_estimators"] = 300

In [None]:
gbm = lgb.LGBMRegressor(**hyper_params)
# gbm = lgb.LGBMClassifier(**hyper_params)

callbacks = [\
#             lgb.early_stopping(stopping_rounds=10, verbose=1,min_delta=0.001,first_metric_only=False), 
             lgb.log_evaluation(period=1)]

gbm.fit(X_train,y_train, 
        eval_set=[(X_val, y_val )],
#         eval_metric='l1',
        callbacks=callbacks,
#         feature_name = features,
#         categorical_feature=categorical_features
       )

In [None]:
importances = gbm.feature_importances_
importances = pd.Series(importances,index=X_train.columns).sort_values(ascending=True)

In [None]:
# Horizontal bar chart of the regressor's feature importances.
# The model is built without an explicit importance_type, so
# feature_importances_ holds LightGBM's default: the number of splits that use
# each feature, not the total gain. The title states that.
fig,ax=plt.subplots(figsize=[20,14])
importances.plot.barh(ax=ax)
ax.set_title("Feature Importance by Split Count")
fig.tight_layout()

In [None]:
inference_result = gbm.predict(X_test)

In [None]:
plot_df = pd.DataFrame(pd.concat([pd.Series(inference_result),y_test]).reset_index(drop=True))
plot_df.columns=['value']
plot_df['color'] = ['preds']*len(inference_result)+['true']*len(y_test)
plot_df.shape

In [None]:
px.histogram(plot_df,x='value',color='color')

In [None]:
results = pd.DataFrame({'preds':inference_result,'true':y_test})
results.index = X_test.index
results

In [None]:

# Index labels of the test rows that belong to the same pair as the last test
# row. The mask is built on the test subset itself; a mask built on the full
# df had to be re-aligned by pandas, which triggers a reindexing warning.
test_rows = df.loc[X_test.index]
last_test_row = test_rows.iloc[-1]
one_pair = test_rows[(test_rows['Ticker_P1']==last_test_row['Ticker_P1'])&\
                     (test_rows['Ticker_P2']==last_test_row['Ticker_P2'])].index
one_pair

In [None]:
px.line(results.loc[one_pair],results.loc[one_pair].index,y=results.columns)

In [None]:
gbm_mae = (inference_result[:len(y_test)] - y_test).abs().mean()
gbm_mae

In [None]:
inference_result

In [None]:
mae = gbm_mae / y_test.abs().mean()
mae

In [None]:
# stop

# Backtest

In [None]:
df.loc[X_test.index, 'entry_preds'] = entry_preds
df.loc[X_test.index, 'exit_preds'] = exit_preds

In [None]:
# Generate the PNL from predicted spread
from pair_trading_foundations.data import ExecutePairTrading

# Explicit copy of the test rows, so the columns added below go into an
# independent frame rather than a slice of df (no SettingWithCopyWarning).
BacktestData = df.loc[X_test.index].copy()

# The unused LSTM-era input tensor (`samples` / `model_input`) is no longer
# built; nothing in the notebook consumed it.

# inference_result holds the spread regressor's predictions for the test rows.
# The regressor was trained on log(actual_abs_spread), so exp() maps the
# predictions back to spread units.
BacktestData['pred_abs_spread'] = np.exp(inference_result)
# NOTE(review): this is log(predicted spread) / abs_spread_std_MA with no mean
# subtracted, so it is not a z-score in the usual sense - confirm the intended
# definition before relying on thresholds applied to it.
BacktestData['z_score'] = (np.log(BacktestData['pred_abs_spread']) / BacktestData['abs_spread_std_MA'])
# Positional 0..n-1 index for the backtest.
BacktestData = BacktestData.reset_index(drop=True)
BacktestData = BacktestData.drop(['actual_abs_spread', 'actual_abs_spread_std'], axis=1)
BacktestData 

In [None]:
# Simulate, for every test row, a trade decided from that row's predicted
# spread and executed over the following `test_len` observations OF THE SAME
# PAIR. BacktestData interleaves many pairs; slicing the next `test_len` rows
# of the whole frame would feed ExecutePairTrading prices of unrelated pairs.
entry_signal = 2    # Make sure this matches with data pipeline
exit_signal = 0.5   # Make sure this matches with data pipeline
test_len = 60       # Make sure this matches with data pipeline

pnl_by_row = {}
for _, pair_rows in tqdm(BacktestData.groupby(['Ticker_P1', 'Ticker_P2'], sort=False)):
    pair_rows = pair_rows.sort_values('Date')
    close_p1 = pair_rows['Close_P1'].values
    close_p2 = pair_rows['Close_P2'].values
    pair_dates = pair_rows['Date'].values

    # Only rows with a complete forward window are simulated; the last
    # `test_len` rows of each pair keep a NaN PNL.
    for pos in range(len(pair_rows) - test_len):
        current_row = pair_rows.iloc[pos]
        forward = slice(pos + 1, pos + 1 + test_len)
        result = ExecutePairTrading(current_row.pred_abs_spread,   # Use predicted spread
                                    current_row.abs_spread_std_MA, # Use current std deviation
                                    entry_signal=entry_signal,
                                    exit_signal=exit_signal
                                ).execute(
                                    # Forward window of this pair
                                    vec1=close_p1[forward],
                                    vec2=close_p2[forward],
                                    dates=pair_dates[forward],
                                    base_fund=100,
                                )
        pnl_by_row[pair_rows.index[pos]] = result.final_pl_pct

# Align the simulated PNLs with BacktestData; rows never simulated become NaN.
pnls = pd.Series(pnl_by_row, dtype='float64').reindex(BacktestData.index)

BacktestData['pred_pnls'] = pnls
BacktestData = BacktestData.dropna()

strategy = BacktestData.loc[(abs(BacktestData['z_score']) > 0.95) & (BacktestData['pred_pnls'] > 0)]
strategy

In [None]:
# strategy = BacktestData.loc[(abs(BacktestData['z_score']) > 0.95) & (BacktestData['pred_pnls'] > 0)]
strategy = BacktestData.iloc[:-150000]

In [None]:
print(strategy.shape)
strategy = strategy[strategy['entry_preds']==1]
print(strategy.shape)


In [None]:
print(strategy.shape)
strategy = strategy[strategy['exit_preds']==1]
print(strategy.shape)


In [None]:
print(strategy.shape)
strategy = strategy[strategy['stock2vec_cos_sim']>=0.8]
print(strategy.shape)



In [None]:
BacktestData.iloc[:-150000].Date.min(),BacktestData.iloc[:-150000].Date.max()

In [None]:
if len(strategy):
    pnl_filters = strategy.filter(items=['Date', 'pnls', 'pred_pnls'])
    pnl_filters = pnl_filters.rename(columns={'pnls': 'actual_PNLS', 'pred_pnls': 'predicted_PNLS'})
#     ax = pnl_filters.plot(x='Date',y=['predicted_PNLS'],  linestyle='-', marker='o', color='red')
#     pnl_filters.plot(x='Date',y=['actual_PNLS'], kind='bar', ax=ax)
#     ax.set_title('Predicted pnls over actual')
#     plt.figure(figsize=(8,6))
#     plt.show()
else:
    print('No valid entry available')

In [None]:
# Strategy 1: Execute each recommended trade by X amount
if len(strategy):
    average_pnl = pnl_filters.actual_PNLS.sum() / len(pnl_filters.actual_PNLS)
    print('Returns: ', average_pnl)

In [None]:
# Strategy 2: Execute top 10 recommended trade by X amount
if len(strategy):
    best_num = 10
    pnl_filters = pnl_filters.reset_index(drop=True)
    # Pick the `best_num` rows with the highest predicted PNL. The previous
    # filter `sorted_indices < best_num` selected the rows at positions 0-9
    # regardless of their predicted PNL.
    top_trades = pnl_filters.nlargest(best_num, 'predicted_PNLS')

    # Average realised PNL of the trades actually taken. This divides by the
    # number of selected rows, so fewer than `best_num` candidates no longer
    # dilute the result.
    average_pnl = top_trades.actual_PNLS.mean()
    print('Returns: ', average_pnl)

In [None]:
# Normality test
# Shapiro-Wilk test on the spread-model predictions (inference_result).
from scipy.stats import shapiro

stat, p = shapiro(inference_result)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# H0: the sample comes from a normal distribution; reject when p <= alpha.
alpha = 0.05
verdict = ('Sample looks Gaussian (fail to reject H0)' if p > alpha
           else 'Sample does not look Gaussian (reject H0)')
print(verdict)

In [None]:
# D'Agostino's K^2 Test
# Applied to the spread-model predictions (inference_result).
from scipy.stats import normaltest

stat, p = normaltest(inference_result)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# H0: the sample comes from a normal distribution; reject when p <= alpha.
alpha = 0.05
verdict = ('Sample looks Gaussian (fail to reject H0)' if p > alpha
           else 'Sample does not look Gaussian (reject H0)')
print(verdict)

In [None]:
# Anderson-Darling Test
# Applied to the spread-model predictions (inference_result). The statistic is
# compared against the critical value of every significance level returned by
# scipy. The comparison used to sit outside the loop and therefore reported
# the last level only.
from scipy.stats import anderson
result = anderson(inference_result)
print('Statistic: %.3f' % result.statistic)
for sl, cv in zip(result.significance_level, result.critical_values):
    if result.statistic < cv:
        print('%.3f: %.3f, data looks normal (fail to reject H0)' % (sl, cv))
    else:
        print('%.3f: %.3f, data does not look normal (reject H0)' % (sl, cv))