In [None]:
!pip install pyfolio

import pyfolio as pf
import pandas as pd
import numpy as np

import statsmodels as sm
import scipy.stats as stats

from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, cohen_kappa_score,
    balanced_accuracy_score
)

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Table 1A


In [None]:
# Load the dataset and keep only the rows inside the study window.
df = pd.read_csv("final_data.csv")
df['date'] = pd.to_datetime(df['date'])
start_date = '1990-01-01'
end_date = '2019-12-31'
# .copy() makes filtered_df an independent frame, so the column assignments
# below modify it directly instead of a slice of df (no SettingWithCopyWarning,
# no risk of the new columns being silently lost).
filtered_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)].copy()

# Per-column count of missing values, kept for inspection.
nulls = filtered_df.isnull().sum()

# Lagged VIX levels (1 to 3 rows back) used as predictors.
for lag in range(1, 4):
    filtered_df[f'cboe_vix_lag_{lag}'] = filtered_df['VIX_cboe'].shift(lag)

# Label: 1 when VIX_cboe is above the previous row's value, else 0.
# The first row is compared against NaN and is therefore labelled 0.
filtered_df['vix_classifier'] = (filtered_df['VIX_cboe'] > filtered_df['VIX_cboe'].shift(1)).astype(int)

# Train on 1991-01-01 .. 2003-12-31, test on everything after 2003-12-31.
train_df = filtered_df[(filtered_df['date'] >= '1991-01-01') & (filtered_df['date'] <= '2003-12-31')]
test_df = filtered_df[filtered_df['date'] > '2003-12-31']

# Features are every column except the label, the date and the VIX level itself.
X_train = train_df.drop(columns=['vix_classifier', 'date', 'VIX_cboe'])
y_train = train_df['vix_classifier']
X_test = test_df.drop(columns=['vix_classifier', 'date', 'VIX_cboe'])
y_test = test_df['vix_classifier']

print(X_train.columns)
print(len(X_train.columns))

Index(['SMB_fama', 'HML_fama', 'MKT_fama', 'MOM_fama', 'STR_fama', 'RVAR_fama',
       'BAB_aqr', 'QMJ_aqr', 'Term_Spread_fred', 'Credit_Spread_fred',
       'Treasury_10yr_fred', 'Tbill_3mo_fred', 'Oil_Price_fred',
       'Dollar_Index_fred', 'TED_Spread_fred', 'Skew_cboe', 'CF_Leverage_fred',
       'TS_MOM_aqr', 'MOM_CM_aqr', 'MOM_EQ_aqr', 'MOM_FI_aqr', 'MOM_FX_aqr',
       'HMLD_aqr', 'ECU_epu', 'MACVOL_fred', 'Bullish_Aver_aaii',
       'Neutral_Aver_aaii', 'Bearish_Aver_aaii', 'Spread_Aver_aaii',
       'BB_Ratio_Aver_aaii', 'MAVB_8week_aaii', 'MACVOL_annualized_fred',
       'cboe_vix_lag_1', 'cboe_vix_lag_2', 'cboe_vix_lag_3'],
      dtype='object')
35


In [None]:
def pttest(y, yhat):
    """Pesaran-Timmermann test of directional accuracy.

    Parameters
    ----------
    y : array-like
        True values. Only the sign is used, so 0/1 class labels work
        (0 counts as "not positive").
    yhat : array-like
        Predicted values, same length as ``y``. The two inputs are compared
        position by position; any pandas index is ignored.

    Returns
    -------
    tuple
        ``(pyz, pt, pval)``: the share of observations whose signs agree, the
        Pesaran-Timmermann statistic, and its one-sided p-value
        ``1 - Phi(pt)``. ``pt`` and ``pval`` are NaN when the statistic is
        undefined, i.e. when ``y`` or ``yhat`` is entirely positive or entirely
        non-positive (the variance term is then exactly zero). All three
        values are NaN for empty input.
    """
    # Work on plain arrays so two Series with different indexes are compared
    # element by element instead of raising on label alignment.
    y = np.asarray(y)
    yhat = np.asarray(yhat)
    size = y.shape[0]
    if size == 0:
        return float('nan'), float('nan'), float('nan')

    pyz = np.sum(np.sign(y) == np.sign(yhat))/size
    py = np.sum(y > 0)/size
    pz = np.sum(yhat > 0)/size

    # With a single-class series the denominator below is 0/0; report the
    # statistic as undefined instead of dividing.
    if py <= 0 or py >= 1 or pz <= 0 or pz >= 1:
        return pyz, float('nan'), float('nan')

    qy = py*(1 - py)/size
    qz = pz*(1 - pz)/size
    # Expected hit rate if y and yhat were independent.
    p = py*pz + (1 - py)*(1 - pz)
    v = p*(1 - p)/size
    w = ((2*py - 1)**2) * qz + ((2*pz - 1)**2) * qy + 4*qy*qz
    pt = (pyz - p) / (np.sqrt(v - w))
    pval = 1 - stats.norm.cdf(pt, 0, 1)
    return pyz, pt, pval

In [None]:
def calculate_metrics(X_train, y_train, X_test, y_test, predictor, pttest_function):
    """Fit a default LogisticRegression and score it on the test set.

    Parameters
    ----------
    X_train, y_train : training features and binary labels.
    X_test, y_test : held-out features and binary labels.
    predictor : label stored under the 'Predictor' key of the result.
    pttest_function : callable ``f(y_true, y_pred)`` whose second return value
        is reported (rounded to 3 decimals) under 'Pesaran - Timmermann'.

    Returns
    -------
    dict
        One row of metrics keyed by metric name.
    """
    model = LogisticRegression().fit(X_train, y_train)

    # Hard class predictions, plus the probability of the second class
    # (model.classes_[1]) for the AUC.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)

    return {
        'Predictor': predictor,
        'AUC': roc_auc_score(y_test, y_pred_proba),
        'MCE': 1 - accuracy,
        'Accuracy': accuracy,
        'Kappa': cohen_kappa_score(y_test, y_pred),
        'Sensitivity': recall_score(y_test, y_pred),
        'Specificity': recall_score(y_test, y_pred, pos_label=0),
        'Precision': precision_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'Balanced Accuracy': balanced_accuracy_score(y_test, y_pred),
        # Use the function the caller passed in; previously the argument was
        # ignored and the global pttest was always called.
        'Pesaran - Timmermann': round(pttest_function(y_test, y_pred)[1], 3)
    }

# Table 1A: one single-feature logistic model per non-VIX column, followed by
# one model that uses the three lagged VIX columns together.
# Relies on X_train, X_test, y_train, y_test and pttest defined in earlier cells.
lagged_predictors = ['cboe_vix_lag_1', 'cboe_vix_lag_2', 'cboe_vix_lag_3']
predictors = [col for col in X_train.columns if not col.startswith('cboe_vix_lag')]

results = [
    calculate_metrics(X_train[[predictor]], y_train, X_test[[predictor]], y_test, predictor, pttest)
    for predictor in predictors
] + [
    calculate_metrics(X_train[lagged_predictors], y_train, X_test[lagged_predictors], y_test, 'VIX-lagged', pttest)
]

# Fix the column order of the displayed table.
table_columns = [
    'Predictor', 'AUC', 'MCE', 'Accuracy', 'Kappa',
    'Sensitivity', 'Specificity', 'Precision', 'F1-Score',
    'Balanced Accuracy', 'Pesaran - Timmermann'
]
results_df = pd.DataFrame(results)
results_df = results_df[table_columns]

results_df

Unnamed: 0,Predictor,AUC,MCE,Accuracy,Kappa,Sensitivity,Specificity,Precision,F1-Score,Balanced Accuracy,Pesaran - Timmermann
0,SMB_fama,0.446438,0.552083,0.447917,-0.11347,0.382022,0.504854,0.4,0.390805,0.443438,-1.578
1,HML_fama,0.61056,0.447917,0.552083,0.065112,0.247191,0.815534,0.536585,0.338462,0.531362,1.06
2,MKT_fama,0.592778,0.442708,0.557292,0.089692,0.359551,0.728155,0.533333,0.42953,0.543853,1.311
3,MOM_fama,0.574561,0.453125,0.546875,0.096495,0.573034,0.524272,0.51,0.539683,0.548653,1.349
4,STR_fama,0.458111,0.489583,0.510417,-0.041671,0.067416,0.893204,0.352941,0.113208,0.48031,-0.96
5,RVAR_fama,0.486691,0.479167,0.520833,-0.022814,0.05618,0.92233,0.384615,0.098039,0.489255,-0.593
6,BAB_aqr,0.548762,0.447917,0.552083,0.075372,0.325843,0.747573,0.527273,0.402778,0.536708,1.125
7,QMJ_aqr,0.464219,0.505208,0.494792,-0.060351,0.146067,0.796117,0.382353,0.211382,0.471092,-1.049
8,Term_Spread_fred,0.554543,0.447917,0.552083,0.091049,0.449438,0.640777,0.519481,0.481928,0.545107,1.275
9,Credit_Spread_fred,0.506982,0.463542,0.536458,0.0,0.0,1.0,0.0,0.0,0.5,0.0


# Table 1B:

In [None]:
def AIC_stepwise_regression(X, y, num_features, threshold_in=0.1, threshold_out=0.1, verbose=True):
    """Forward selection of at most ``num_features`` columns by OLS p-value.

    Despite the name, no AIC is computed. At every step each remaining column
    is added in turn to an OLS model (with intercept) on the columns selected
    so far, and the column with the smallest p-value is kept if that p-value
    is below ``threshold_in``.

    Parameters
    ----------
    X : DataFrame of candidate features.
    y : regression target.
    num_features : maximum number of features to select.
    threshold_in : a candidate is added only if its p-value is below this.
    threshold_out : unused, kept so existing calls keep working. The original
        backward-elimination step only ran when more than ``num_features``
        columns were selected, which the forward step never allows, so it
        could not execute.
    verbose : print each added feature and its p-value.

    Returns
    -------
    list
        Selected column names in the order they were added.
    """
    initial_features = X.columns.tolist()
    best_features = []
    while len(best_features) < num_features:
        # Keep the original column order: iterating a set difference would
        # make the winner of a p-value tie depend on string hashing.
        excluded = [col for col in initial_features if col not in best_features]
        if not excluded:
            break

        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[best_features + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]

        best_pval = new_pval.min()
        # Stop when no candidate qualifies (also covers an all-NaN p-value set).
        if not best_pval < threshold_in:
            break

        best_feature = new_pval.idxmin()
        best_features.append(best_feature)
        if verbose:
            print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

    return best_features

def BIC_stepwise_regression(X, y, num_features, threshold=0.5):
    """Greedy forward selection that minimises the OLS BIC.

    Each round fits one OLS model (with intercept) per remaining column added
    to the current selection and keeps the column with the lowest BIC, as long
    as that BIC improves on the best score so far. Selection stops at the
    first round without improvement, when ``num_features`` columns are chosen,
    or when no columns remain. ``threshold`` is accepted but not used.

    Returns the selected column names in the order they were added.
    """
    remaining = X.columns.tolist()
    selected = []
    current_score = float('inf')
    best_new_score = float('inf')

    while remaining and current_score >= best_new_score and len(selected) < num_features:
        # (bic, name) pairs sorted ascending; the head is this round's winner.
        ranked = sorted(
            (sm.OLS(y, sm.add_constant(X[selected + [name]])).fit().bic, name)
            for name in remaining
        )
        best_new_score, best_candidate = ranked[0]

        if best_new_score < current_score:
            selected.append(best_candidate)
            remaining.remove(best_candidate)
            current_score = best_new_score

    return selected

import statsmodels.api as sm

def deviance_forward_selection(X, y, family=sm.families.Gaussian(), num_features=5):
    """Greedy forward selection that minimises GLM deviance.

    Each round fits one GLM (with intercept) per remaining column added to the
    current selection and keeps the column giving the lowest deviance.

    Parameters
    ----------
    X : DataFrame of candidate features.
    y : response passed to ``sm.GLM``.
    family : statsmodels family instance. The default is created once, when
        the function is defined.
    num_features : maximum number of features to select.

    Returns
    -------
    list
        Selected column names in the order they were added.
    """
    initial_features = X.columns.tolist()
    best_features = []
    current_deviance = float('inf')

    while initial_features and len(best_features) < num_features:
        deviance_with_candidates = []
        for candidate in initial_features:
            X_with_candidate = sm.add_constant(X[best_features + [candidate]])
            model = sm.GLM(y, X_with_candidate, family=family).fit()
            deviance_with_candidates.append((model.deviance, candidate))

        deviance_with_candidates.sort()
        best_new_deviance, best_candidate = deviance_with_candidates[0]

        # Without this exit the loop never terminated when no candidate
        # lowered the deviance: nothing was added or removed, so the loop
        # condition stayed true forever.
        if not best_new_deviance < current_deviance:
            break

        best_features.append(best_candidate)
        initial_features.remove(best_candidate)
        current_deviance = best_new_deviance

    return best_features


In [None]:
# Table 1B: pick up to 20 features with each selection rule, then score a
# logistic model on each feature set.
features_aic = AIC_stepwise_regression(X_train, y_train, num_features=20, threshold_in=0.5, threshold_out=0.5)
print(features_aic)
features_bic = BIC_stepwise_regression(X_train, y_train, num_features=20)
print(features_bic)
features_deviance = deviance_forward_selection(X_train, y_train, family=sm.families.Gaussian(), num_features=20)
print(features_deviance)

# Build the table from a list of rows in one go; DataFrame.append was removed
# in pandas 2.0. Rows are now indexed 0, 1, 2 instead of 0, 0, 0.
table1b_res = pd.DataFrame([
    calculate_metrics(X_train[features], y_train, X_test[features], y_test, label, pttest)
    for label, features in [
        ('AIC', features_aic),
        ('BIC', features_bic),
        ('deviance', features_deviance),
    ]
])
table1b_res

Add  cboe_vix_lag_1                 with p-value 0.0116303
Add  MACVOL_fred                    with p-value 0.000861532
Add  Dollar_Index_fred              with p-value 0.0385274
Add  Oil_Price_fred                 with p-value 0.0261316
Add  TED_Spread_fred                with p-value 0.0709975
Add  Credit_Spread_fred             with p-value 0.0233913
Add  ECU_epu                        with p-value 0.0630172
Add  HML_fama                       with p-value 0.123275
Add  MOM_fama                       with p-value 0.182549
Add  QMJ_aqr                        with p-value 0.214282
Add  MACVOL_annualized_fred         with p-value 0.216612
Add  TS_MOM_aqr                     with p-value 0.285215
Add  cboe_vix_lag_3                 with p-value 0.308118
Add  Tbill_3mo_fred                 with p-value 0.454553
Add  BB_Ratio_Aver_aaii             with p-value 0.476687
Add  MAVB_8week_aaii                with p-value 0.365176
Add  MOM_FI_aqr                     with p-value 0.499219
Add  

Unnamed: 0,Predictor,AUC,MCE,Accuracy,Kappa,Sensitivity,Specificity,Precision,F1-Score,Balanced Accuracy,Pesaran - Timmermann
0,AIC,0.675903,0.385417,0.614583,0.243531,0.764045,0.485437,0.561983,0.647619,0.624741,3.58
0,BIC,0.660849,0.369792,0.630208,0.276894,0.808989,0.475728,0.571429,0.669767,0.642358,4.153
0,deviance,0.67023,0.385417,0.614583,0.241269,0.741573,0.504854,0.564103,0.640777,0.623214,3.499


In [None]:
def add_lags(df, column_name, max_lag):
    """Add ``<column_name>_lag_<k>`` columns for k = 1..max_lag to ``df`` in place.

    Each new column holds ``df[column_name]`` shifted down by k rows, so its
    first k entries are missing. Nothing is returned.
    """
    for offset in range(max_lag):
        periods = offset + 1
        df['{}_lag_{}'.format(column_name, periods)] = df[column_name].shift(periods)

df = pd.read_csv("final_data.csv")

# Convert 'date' to datetime and filter based on date range.
df['date'] = pd.to_datetime(df['date'])
start_date = '1990-01-01'
end_date = '2019-12-31'
# .copy() makes filtered_df an independent frame: add_lags and the label
# assignment below write new columns into it, which on a plain slice of df
# triggers SettingWithCopyWarning.
filtered_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)].copy()

# Add lags 1..3 for every column except 'date', 'vix_classifier' and 'VIX_cboe'.
# 'vix_classifier' is only created further down, so naming it here matters
# only if the CSV already contains such a column. Index.difference also
# returns the names sorted.
columns_to_lag = filtered_df.columns.difference(['date', 'vix_classifier', 'VIX_cboe'])
for column in columns_to_lag:
    add_lags(filtered_df, column, 3)

# Label: 1 when VIX_cboe is above the previous row's value, else 0.
filtered_df['vix_classifier'] = (filtered_df['VIX_cboe'] > filtered_df['VIX_cboe'].shift(1)).astype(int)

# Train on 1991-01-01 .. 2003-12-31, test on everything after 2003-12-31.
train_df = filtered_df[(filtered_df['date'] >= '1991-01-01') & (filtered_df['date'] <= '2003-12-31')]
test_df = filtered_df[filtered_df['date'] > '2003-12-31']

X_train = train_df.drop(columns=['vix_classifier', 'date', 'VIX_cboe'])
y_train = train_df['vix_classifier']
X_test = test_df.drop(columns=['vix_classifier', 'date', 'VIX_cboe'])
y_test = test_df['vix_classifier']

# Table 2

In [None]:
import statsmodels.api as sm
import sklearn.metrics

def devianceCalc(X, y, model):
    """Return twice the summed (un-normalised) log loss of ``model`` on ``(X, y)``."""
    class_probabilities = model.predict_proba(X)
    total_log_loss = sklearn.metrics.log_loss(y, class_probabilities, normalize=False)
    return 2*total_log_loss

def deviance_forward_selection(X, y, deviance_threshold=1):
    """Forward selection for an elastic-net logistic model, driven by deviance.

    Each round fits one elastic-net ``LogisticRegression`` per remaining column
    added to the current selection and picks the column with the lowest
    deviance on the fitting data (see ``devianceCalc``). The column is kept
    only if it lowers the deviance by more than ``deviance_threshold``;
    otherwise selection stops.

    NOTE: an earlier cell defines a function with this same name but a
    different signature (``family``, ``num_features``). Once this cell has
    run, that earlier version is no longer reachable by name.

    Parameters
    ----------
    X : DataFrame of candidate features.
    y : binary labels, any iterable.
    deviance_threshold : minimum deviance reduction required to keep a column.

    Returns
    -------
    list
        Selected column names in the order they were added.
    """
    initial_features = X.columns.tolist()
    best_features = []
    current_deviance = float('inf')
    # Converted once up front; previously this was redone for every candidate fit.
    y = list(y)
    while initial_features:
        deviance_with_candidates = []
        for candidate in initial_features:
            X_with_candidate = X[best_features + [candidate]]
            model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, max_iter=1000).fit(X_with_candidate, y)
            deviance = devianceCalc(X_with_candidate, y, model)
            deviance_with_candidates.append((deviance, candidate))

        deviance_with_candidates.sort()
        best_new_deviance, best_candidate = deviance_with_candidates[0]

        # The first round always passes, because current_deviance starts at infinity.
        if current_deviance - best_new_deviance > deviance_threshold:
            best_features.append(best_candidate)
            initial_features.remove(best_candidate)
            current_deviance = best_new_deviance
        else:
            break

    return best_features

#features_selected = deviance_forward_selection(X_train, y_train)
#print("Selected features based on deviance threshold:", features_selected)


In [None]:
# Table 2: count how often each feature keeps a non-zero ElasticNet
# coefficient while the training window grows by one test observation per step.
# 193 iterations takes too long to run, this is our demo
N_STEPS_TAB2 = min(4, len(X_test))

X_train_tab2 = X_train.copy()
y_train_tab2 = y_train.copy()
# Counts are keyed by feature name. The old code indexed a fixed-length list
# by position within the selected subset, which credited the wrong features.
d_tab2 = dict.fromkeys(X_train.columns, 0)

for step in range(N_STEPS_TAB2):
    best_features = deviance_forward_selection(X_train_tab2, y_train_tab2)
    elastic_net_model_tab2 = ElasticNet(alpha=0.5, max_iter=100)
    elastic_net_model_tab2.fit(X_train_tab2[best_features], y_train_tab2)
    for feature, coef in zip(best_features, elastic_net_model_tab2.coef_):
        if coef != 0:
            d_tab2[feature] += 1
    # Grow the window with the matching test row and label. Previously a
    # training row was re-appended next to a test label, and test row 0 was
    # skipped. pd.concat replaces DataFrame/Series.append (removed in pandas 2.0).
    X_train_tab2 = pd.concat([X_train_tab2, X_test.iloc[[step]]])
    y_train_tab2 = pd.concat([y_train_tab2, y_test.iloc[[step]]])

# Same counts as a plain list, in X_train column order.
arr_tab2 = list(d_tab2.values())
d_tab2

NameError: ignored

In [None]:
# Hard-coded count per lagged feature name.
# NOTE(review): presumably pasted from a longer run of the Table 2 selection
# loop (how often each feature was kept); nothing in this notebook regenerates
# it, so confirm its provenance before relying on it.
in_results = {'BAB_aqr_lag_1': 165,
 'BAB_aqr_lag_2': 19,
 'BAB_aqr_lag_3': 0,
 'BB_Ratio_Aver_aaii_lag_1': 0,
 'BB_Ratio_Aver_aaii_lag_2': 15,
 'BB_Ratio_Aver_aaii_lag_3': 13,
 'Bearish_Aver_aaii_lag_1': 33,
 'Bearish_Aver_aaii_lag_2': 65,
 'Bearish_Aver_aaii_lag_3': 27,
 'Bullish_Aver_aaii_lag_1': 17,
 'Bullish_Aver_aaii_lag_2': 15,
 'Bullish_Aver_aaii_lag_3': 20,
 'CF_Leverage_fred_lag_1': 19,
 'CF_Leverage_fred_lag_2': 4,
 'CF_Leverage_fred_lag_3': 4,
 'Credit_Spread_fred_lag_1': 2,
 'Credit_Spread_fred_lag_2': 10,
 'Credit_Spread_fred_lag_3': 1,
 'Dollar_Index_fred_lag_1': 4,
 'Dollar_Index_fred_lag_2': 7,
 'Dollar_Index_fred_lag_3': 6,
 'ECU_epu_lag_1': 3,
 'ECU_epu_lag_2': 4,
 'ECU_epu_lag_3': 5,
 'HMLD_aqr_lag_1': 5,
 'HMLD_aqr_lag_2': 9,
 'HMLD_aqr_lag_3': 1,
 'HML_fama_lag_1': 4,
 'HML_fama_lag_2': 0,
 'HML_fama_lag_3': 0,
 'MACVOL_annualized_fred_lag_1': 0,
 'MACVOL_annualized_fred_lag_2': 0,
 'MACVOL_annualized_fred_lag_3': 0,
 'MACVOL_fred_lag_1': 0,
 'MACVOL_fred_lag_2': 0,
 'MACVOL_fred_lag_3': 0,
 'MAVB_8week_aaii_lag_1': 0,
 'MAVB_8week_aaii_lag_2': 0,
 'MAVB_8week_aaii_lag_3': 0,
 'MKT_fama_lag_1': 0,
 'MKT_fama_lag_2': 0,
 'MKT_fama_lag_3': 0,
 'MOM_CM_aqr_lag_1': 0,
 'MOM_CM_aqr_lag_2': 0,
 'MOM_CM_aqr_lag_3': 0,
 'MOM_EQ_aqr_lag_1': 0,
 'MOM_EQ_aqr_lag_2': 0,
 'MOM_EQ_aqr_lag_3': 0,
 'MOM_FI_aqr_lag_1': 0,
 'MOM_FI_aqr_lag_2': 0,
 'MOM_FI_aqr_lag_3': 0,
 'MOM_FX_aqr_lag_1': 0,
 'MOM_FX_aqr_lag_2': 0,
 'MOM_FX_aqr_lag_3': 0,
 'MOM_fama_lag_1': 0,
 'MOM_fama_lag_2': 0,
 'MOM_fama_lag_3': 0,
 'Neutral_Aver_aaii_lag_1': 0,
 'Neutral_Aver_aaii_lag_2': 0,
 'Neutral_Aver_aaii_lag_3': 0,
 'Oil_Price_fred_lag_1': 0,
 'Oil_Price_fred_lag_2': 0,
 'Oil_Price_fred_lag_3': 0,
 'QMJ_aqr_lag_1': 0,
 'QMJ_aqr_lag_2': 0,
 'QMJ_aqr_lag_3': 0,
 'RVAR_fama_lag_1': 0,
 'RVAR_fama_lag_2': 0,
 'RVAR_fama_lag_3': 0,
 'SMB_fama_lag_1': 0,
 'SMB_fama_lag_2': 0,
 'SMB_fama_lag_3': 0,
 'STR_fama_lag_1': 0,
 'STR_fama_lag_2': 0,
 'STR_fama_lag_3': 0,
 'Skew_cboe_lag_1': 0,
 'Skew_cboe_lag_2': 0,
 'Skew_cboe_lag_3': 0,
 'Spread_Aver_aaii_lag_1': 0,
 'Spread_Aver_aaii_lag_2': 0,
 'Spread_Aver_aaii_lag_3': 0,
 'TED_Spread_fred_lag_1': 0,
 'TED_Spread_fred_lag_2': 0,
 'TED_Spread_fred_lag_3': 0,
 'TS_MOM_aqr_lag_1': 0,
 'TS_MOM_aqr_lag_2': 0,
 'TS_MOM_aqr_lag_3': 0,
 'Tbill_3mo_fred_lag_1': 0,
 'Tbill_3mo_fred_lag_2': 0,
 'Tbill_3mo_fred_lag_3': 0,
 'Term_Spread_fred_lag_1': 0,
 'Term_Spread_fred_lag_2': 0,
 'Term_Spread_fred_lag_3': 0,
 'Treasury_10yr_fred_lag_1': 0,
 'Treasury_10yr_fred_lag_2': 0,
 'Treasury_10yr_fred_lag_3': 0,
 'vix_classifier_lag_1': 0,
 'vix_classifier_lag_2': 0,
 'vix_classifier_lag_3': 0}

# Table 3

In [None]:
# Table 3: same procedure as Table 2 with a stronger ElasticNet penalty
# (alpha = 0.75): count how often each feature keeps a non-zero coefficient
# while the training window grows by one test observation per step.
# 193 iterations takes too long to run, this is our demo
N_STEPS_TAB3 = min(34, len(X_test))

X_train_tab3 = X_train.copy()
y_train_tab3 = y_train.copy()
# Counts are keyed by feature name. The old code indexed a fixed-length list
# by position within the selected subset, which credited the wrong features.
d_tab3 = dict.fromkeys(X_train.columns, 0)

for step in range(N_STEPS_TAB3):
    best_features = deviance_forward_selection(X_train_tab3, y_train_tab3)
    elastic_net_model_tab3 = ElasticNet(alpha=0.75, max_iter=100)
    elastic_net_model_tab3.fit(X_train_tab3[best_features], y_train_tab3)
    for feature, coef in zip(best_features, elastic_net_model_tab3.coef_):
        if coef != 0:
            d_tab3[feature] += 1
    # Grow the window with the matching test row and label. Previously a
    # training row was re-appended next to a test label, and test row 0 was
    # skipped. pd.concat replaces DataFrame/Series.append (removed in pandas 2.0).
    X_train_tab3 = pd.concat([X_train_tab3, X_test.iloc[[step]]])
    y_train_tab3 = pd.concat([y_train_tab3, y_test.iloc[[step]]])

# Same counts as a plain list, in X_train column order.
arr_tab3 = list(d_tab3.values())
d_tab3

# Table 4

In [None]:
def all_models(X_train, X_test, y_train, y_test, column_names, results_df):
    """Fit the full model line-up on ``column_names`` and collect test results.

    Twelve classifiers are fitted on ``X_train[column_names]`` and scored on
    ``X_test[column_names]`` through ``calculate_metrics_and_update_df``.
    Four ensemble rows follow: the mean and median of the positive-class
    probabilities of two fixed model groups, thresholded at 0.5.

    Parameters
    ----------
    X_train, X_test : DataFrames that contain every name in ``column_names``.
    y_train, y_test : binary labels.
    column_names : feature columns to use.
    results_df : DataFrame that the metric rows are appended to.

    Returns
    -------
    tuple
        ``(results_df, all_peds, all_pobs)``.
        ``all_peds`` maps a display name to hard test predictions.
        ``all_pobs`` maps the same names to scores: the two-column
        ``predict_proba`` output for individual models, the 1-D
        ``decision_function`` output for 'Grad. Boosting', and the 1-D
        averaged positive-class probability for the Mean/Median entries.
        'RDA' is reported in ``results_df`` only.
    """
    X_fit = X_train[column_names]
    X_eval = X_test[column_names]

    # (results_df label, estimator), in the order the rows are reported.
    estimators = [
        ('Ridge', LogisticRegression(penalty='l2', solver='liblinear')),
        ('Lasso', LogisticRegression(penalty='l1', solver='saga', max_iter=10000)),
        ('Elastic Net', LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, max_iter=10000)),
        ('LDA', LinearDiscriminantAnalysis()),
        # RDA is approximated by LDA with automatic covariance shrinkage.
        ('RDA', LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')),
        ('Naive Bayes', GaussianNB()),
        ('Bagging', BaggingClassifier()),
        ('Random Forest', RandomForestClassifier()),
        ('AdaBoost', AdaBoostClassifier()),
        ('Gradient Boosting', GradientBoostingClassifier()),
        ('knn', KNeighborsClassifier()),
        ('rpart', DecisionTreeClassifier()),
    ]

    hard = {}  # label -> hard class predictions on the test set
    soft = {}  # label -> scores on the test set (see Returns)
    for label, estimator in estimators:
        estimator.fit(X_fit, y_train)
        hard[label] = estimator.predict(X_eval)
        if label == 'Gradient Boosting':
            # Gradient boosting is ranked by its raw decision function.
            soft[label] = estimator.decision_function(X_eval)
            ranking_score = soft[label]
        else:
            soft[label] = estimator.predict_proba(X_eval)
            ranking_score = soft[label][:, 1]
        results_df = calculate_metrics_and_update_df(
            label, y_test, hard[label], ranking_score, results_df
        )

    # Ensembles: average the positive-class probabilities of each model group.
    ensemble_members = {
        'k = 1': ('Ridge', 'knn', 'LDA', 'Naive Bayes'),
        'k = 2': ('Ridge', 'AdaBoost', 'Bagging', 'LDA'),
    }
    for k_label, members in ensemble_members.items():
        stacked = [soft[member][:, 1] for member in members]
        for stat_name, reducer in (('Mean', np.mean), ('Median', np.median)):
            label = f'{stat_name} ({k_label})'
            soft[label] = reducer(stacked, axis=0)
            hard[label] = (soft[label] >= 0.5).astype(int)
            results_df = calculate_metrics_and_update_df(
                label, y_test, hard[label], soft[label], results_df
            )

    # (exported key, internal label). Each model appears exactly once; the
    # old dict literals listed 'Lasso' twice, so the Lasso outputs were
    # silently overwritten by LDA's.
    exported = [
        ('Ridge', 'Ridge'),
        ('Lasso', 'Lasso'),
        ('Elastic Net', 'Elastic Net'),
        ('KNN', 'knn'),
        ('RPART', 'rpart'),
        ('Mean (k = 1)', 'Mean (k = 1)'),
        ('Mean (k = 2)', 'Mean (k = 2)'),
        ('Median (k = 1)', 'Median (k = 1)'),
        ('Median (k = 2)', 'Median (k = 2)'),
        ('LDA', 'LDA'),
        ('Naive Bayes', 'Naive Bayes'),
        ('Bagging', 'Bagging'),
        ('Random Forest', 'Random Forest'),
        ('Ada. Boosting', 'AdaBoost'),
        ('Grad. Boosting', 'Gradient Boosting'),
    ]
    all_peds = {key: hard[label] for key, label in exported}
    all_pobs = {key: soft[label] for key, label in exported}

    return results_df, all_peds, all_pobs

In [None]:
# Name prefixes of the feature set used for the Table 4 models.
elastic_net_a05_names = [
    "MOM_EQ_aqr",
    "MKT_fama",
    "RVAR_fama",
    "Oil_Price_fred",
    "QMJ_aqr",
    "HMLD_aqr",
    "Skew_cboe",
    "MOM_FI_aqr",
    "TS_MOM_aqr",
    "MOM_FX_aqr",
    "ECU_epu",
    "Term_Spread_fred",
]

# Expand every prefix to all filtered_df columns that begin with it,
# grouped by prefix in the order listed above.
elastic_net_a05 = [
    col
    for name in elastic_net_a05_names
    for col in filtered_df.columns
    if col[:len(name)] == name
]

In [None]:
def calculate_metrics_and_update_df(model_name, y_test, y_pred, y_pred_prob, df):
    """Score one model's test predictions and append them as a row to ``df``.

    Parameters
    ----------
    model_name : label stored in the 'Model' column.
    y_test : true binary labels.
    y_pred : hard class predictions.
    y_pred_prob : continuous scores used for the AUC.
    df : results DataFrame so far (may be empty or None).

    Returns
    -------
    DataFrame
        A new frame with the extra row; ``df`` itself is not modified.
    """
    accuracy = accuracy_score(y_test, y_pred)
    metrics = {
        'Model': model_name,
        'AUC': roc_auc_score(y_test, y_pred_prob),
        'MCE': 1 - accuracy,
        'Accuracy': accuracy,
        'Kappa': cohen_kappa_score(y_test, y_pred),
        'Sensitivity': recall_score(y_test, y_pred),
        'Specificity': recall_score(y_test, y_pred, pos_label=0),
        'Precision': precision_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'Balanced_Accuracy': balanced_accuracy_score(y_test, y_pred),
        'Pesaran - Timmermann': round(pttest(y_test, y_pred)[1], 3)
    }
    row = pd.DataFrame([metrics])
    # DataFrame.append was removed in pandas 2.0. Starting from the bare row
    # when df is empty also keeps the numeric dtypes, so .round() still works.
    if df is None or df.empty:
        return row
    return pd.concat([df, row], ignore_index=True)

# Start from an empty frame. The column-only frame that used to be built at
# the top of this cell was overwritten here before it was ever used.
results_df = pd.DataFrame()
results_df, all_peds, all_pobs = all_models(X_train, X_test, y_train, y_test, elastic_net_a05, results_df)
results_df.round(2)

Unnamed: 0,Model,AUC,MCE,Accuracy,Kappa,Sensitivity,Specificity,Precision,F1 Score,Balanced_Accuracy,Pesaran - Timmermann
0,Ridge,0.71,0.36,0.64,0.28,0.66,0.62,0.6,0.63,0.64,3.94
1,Lasso,0.73,0.34,0.66,0.31,0.69,0.63,0.62,0.65,0.66,4.39
2,Elastic Net,0.72,0.34,0.66,0.32,0.67,0.65,0.62,0.65,0.66,4.5
3,LDA,0.68,0.36,0.64,0.27,0.55,0.72,0.63,0.59,0.63,3.8
4,RDA,0.75,0.31,0.69,0.36,0.49,0.85,0.75,0.59,0.67,5.24
5,Naive Bayes,0.63,0.4,0.6,0.19,0.4,0.78,0.61,0.49,0.59,2.72
6,Bagging,0.7,0.38,0.62,0.23,0.51,0.72,0.61,0.55,0.61,3.19
7,Random Forest,0.73,0.33,0.67,0.32,0.52,0.8,0.69,0.59,0.66,4.55
8,AdaBoost,0.77,0.29,0.71,0.42,0.78,0.65,0.66,0.71,0.71,5.92
9,Gradient Boosting,0.73,0.32,0.68,0.36,0.65,0.71,0.66,0.66,0.68,5.01


# Table 5

In [None]:
# Name prefixes of the feature set used for the Table 5 models.
elastic_net_a075_names = [
    "MOM_EQ_aqr", "MKT_fama", "RVAR_fama", "Oil_Price_fred",
    "QMJ_aqr", "Skew_cboe", "HMLD_aqr", "TS_MOM_aqr", "MOM_FX_aqr",
    "MOM_FI_aqr", "Term_Spread_fred", "ECU_epu"
]

# Expand every prefix to all filtered_df columns that begin with it.
# This now iterates elastic_net_a075_names; the old line reused
# elastic_net_a05_names, so the list defined just above was never read.
elastic_net_a075 = [col for name in elastic_net_a075_names for col in filtered_df.columns if col.startswith(name)]

In [None]:
# Table 5: rerun the full model line-up on the elastic_net_a075 feature set.
results_df, all_peds, all_pobs = all_models(
    X_train, X_test, y_train, y_test, elastic_net_a075, pd.DataFrame()
)
results_df.round(2)

Unnamed: 0,Model,AUC,MCE,Accuracy,Kappa,Sensitivity,Specificity,Precision,F1 Score,Balanced_Accuracy,Pesaran - Timmermann
0,Ridge,0.71,0.36,0.64,0.28,0.66,0.62,0.6,0.63,0.64,3.94
1,Lasso,0.73,0.34,0.66,0.31,0.69,0.63,0.62,0.65,0.66,4.39
2,Elastic Net,0.72,0.34,0.66,0.32,0.67,0.65,0.62,0.65,0.66,4.5
3,LDA,0.68,0.36,0.64,0.27,0.55,0.72,0.63,0.59,0.63,3.8
4,RDA,0.75,0.31,0.69,0.36,0.49,0.85,0.75,0.59,0.67,5.24
5,Naive Bayes,0.63,0.4,0.6,0.19,0.4,0.78,0.61,0.49,0.59,2.72
6,Bagging,0.69,0.34,0.66,0.32,0.71,0.61,0.61,0.66,0.66,4.44
7,Random Forest,0.73,0.34,0.66,0.31,0.62,0.69,0.63,0.62,0.65,4.28
8,AdaBoost,0.77,0.29,0.71,0.42,0.78,0.65,0.66,0.71,0.71,5.92
9,Gradient Boosting,0.72,0.33,0.67,0.34,0.64,0.7,0.65,0.64,0.67,4.72


# Table 6

## For Table 6

In [None]:
# Rebuild the un-lagged feature set (plus three VIX lags) for Table 6.
df = pd.read_csv("final_data.csv")
df['date'] = pd.to_datetime(df['date'])
start_date = '1990-01-01'
end_date = '2019-12-31'
# .copy() makes filtered_df an independent frame, so the column assignments
# below modify it directly instead of a slice of df (no SettingWithCopyWarning).
filtered_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)].copy()

# Lagged VIX levels (1 to 3 rows back) used as predictors.
for lag in range(1, 4):
    filtered_df[f'cboe_vix_lag_{lag}'] = filtered_df['VIX_cboe'].shift(lag)

# Label: 1 when VIX_cboe is above the previous row's value, else 0.
filtered_df['vix_classifier'] = (filtered_df['VIX_cboe'] > filtered_df['VIX_cboe'].shift(1)).astype(int)

# Train on 1991-01-01 .. 2003-12-31, test on everything after 2003-12-31.
train_df = filtered_df[(filtered_df['date'] >= '1991-01-01') & (filtered_df['date'] <= '2003-12-31')]
test_df = filtered_df[filtered_df['date'] > '2003-12-31']

X_train = train_df.drop(columns=['vix_classifier', 'date', 'VIX_cboe'])
y_train = train_df['vix_classifier']
X_test = test_df.drop(columns=['vix_classifier', 'date', 'VIX_cboe'])
y_test = test_df['vix_classifier']
X_test.columns

Index(['SMB_fama', 'HML_fama', 'MKT_fama', 'MOM_fama', 'STR_fama', 'RVAR_fama',
       'BAB_aqr', 'QMJ_aqr', 'Term_Spread_fred', 'Credit_Spread_fred',
       'Treasury_10yr_fred', 'Tbill_3mo_fred', 'Oil_Price_fred',
       'Dollar_Index_fred', 'TED_Spread_fred', 'Skew_cboe', 'CF_Leverage_fred',
       'TS_MOM_aqr', 'MOM_CM_aqr', 'MOM_EQ_aqr', 'MOM_FI_aqr', 'MOM_FX_aqr',
       'HMLD_aqr', 'ECU_epu', 'MACVOL_fred', 'Bullish_Aver_aaii',
       'Neutral_Aver_aaii', 'Bearish_Aver_aaii', 'Spread_Aver_aaii',
       'BB_Ratio_Aver_aaii', 'MAVB_8week_aaii', 'MACVOL_annualized_fred',
       'cboe_vix_lag_1', 'cboe_vix_lag_2', 'cboe_vix_lag_3'],
      dtype='object')

In [None]:
# S&P 500 closes reduced to one row per calendar month (the first row found
# in each month), with the VIX level attached.
start_date = '1990-01'
end_date = '2019-12'

spx = pd.read_csv('SPX.csv')[['Date', 'Close']]
spx['Date'] = pd.to_datetime(spx['Date']).dt.to_period('M')

in_window = (spx['Date'] >= start_date) & (spx['Date'] <= end_date)
spx = spx[in_window]

month_keys = [spx['Date'].dt.year, spx['Date'].dt.month]
spx = spx.groupby(month_keys).first().reset_index(drop=True)

# NOTE(review): this assignment aligns on index labels, so it assumes
# filtered_df is indexed 0, 1, 2, ... in the same month order as spx - confirm.
spx['VIX_cboe'] = filtered_df['VIX_cboe']

# Drop the first 168 rows (presumably the 14 years 1990-2003; the displayed
# output starts at 2004-01) and renumber from 0.
spx = spx.iloc[168:].reset_index(drop=True)
spx.round(2)

Unnamed: 0,Date,Close,VIX_cboe
0,2004-01,1108.48,16.77
1,2004-02,1135.26,18.22
2,2004-03,1155.97,17.11
3,2004-04,1132.17,14.44
4,2004-05,1117.49,16.65
...,...,...,...
187,2019-08,2953.56,14.06
188,2019-09,2906.27,17.87
189,2019-10,2940.25,19.66
190,2019-11,3066.91,18.56


In [None]:
def incremental_training(X_train_initial, y_train_initial, X_test, y_test):
    """Walk-forward evaluation of a logistic regression.

    The model is fitted on the initial training data. Then, for every row of
    ``X_test`` in order, the row is predicted with the current model, after
    which the row and its true label are added to the training set and the
    model is refitted from scratch.

    Parameters
    ----------
    X_train_initial : pandas.DataFrame
        Initial training features.
    y_train_initial : pandas.Series
        Initial training labels.
    X_test : pandas.DataFrame
        Rows to predict one at a time, in iteration order.
    y_test : pandas.Series
        True labels, looked up by the index label of each ``X_test`` row.

    Returns
    -------
    tuple[list, list]
        ``(y_pred, y_proba)``: the predicted label for each test row and the
        corresponding ``predict_proba`` row (one array per test row).
    """
    model = LogisticRegression()
    model.fit(X_train_initial, y_train_initial)

    y_pred = []
    y_proba = []  # one predict_proba row per test observation
    X_train = X_train_initial.copy()
    y_train = y_train_initial.copy()

    for index, row in X_test.iterrows():
        features = row.values.reshape(1, -1)

        # Predict before the true label is revealed to the model.
        y_pred.append(model.predict(features)[0])
        y_proba.append(model.predict_proba(features)[0])

        # DataFrame.append / Series.append were removed in pandas 2.0;
        # pd.concat is the supported equivalent on every pandas version.
        new_features = row.to_frame().T.infer_objects()
        new_label = pd.Series([y_test.loc[index]], index=[index])
        X_train = pd.concat([X_train, new_features])
        y_train = pd.concat([y_train, new_label])

        model.fit(X_train, y_train)

    return y_pred, y_proba

# Column-name prefixes; every column of X_train starting with a prefix forms
# the feature set of one walk-forward model.
names = ['cboe_vix_lag', 'MKT_fama', 'HMLD_aqr', 'MOM_EQ_aqr']
# Fresh result frames: one column of predictions / predict_proba rows per prefix.
df_pred = pd.DataFrame()
df_prob = pd.DataFrame()
for i in names:
    # Both selections iterate X_train.columns so train and test share the
    # same columns in the same order.
    X_tr = X_train[[col for col in X_train.columns if col.startswith(i)]]
    X_te = X_test[[col for col in X_train.columns if col.startswith(i)]]
    df_pred[i], df_prob[i] = incremental_training(X_tr, y_train, X_te, y_test)

df_pred

Unnamed: 0,cboe_vix_lag,MKT_fama,HMLD_aqr,MOM_EQ_aqr
0,1,0,0,0
1,1,0,0,0
2,1,0,0,1
3,1,0,0,0
4,1,0,1,0
...,...,...,...,...
187,0,1,0,0
188,1,1,0,1
189,0,1,0,1
190,0,0,0,0


In [None]:
import statsmodels.api as sm

def BIC_stepwise_regression(X, y, num_features, threshold=0.5):
    """Greedy forward selection of OLS regressors by BIC.

    In each round every remaining column is tried in turn by fitting an OLS
    model (with intercept) on the already selected columns plus the
    candidate. The candidate with the lowest BIC is kept if it strictly
    lowers the best BIC so far; otherwise the search stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate regressors.
    y : array-like
        Response passed to ``sm.OLS``.
    num_features : int
        Maximum number of columns to select.
    threshold : float, optional
        Unused; kept so existing calls keep working.

    Returns
    -------
    list
        Selected column names in the order they were added.
    """
    remaining = X.columns.tolist()
    best_features = []
    current_score = float('inf')

    while remaining and len(best_features) < num_features:
        scored_candidates = []
        for candidate in remaining:
            design = sm.add_constant(X[best_features + [candidate]])
            scored_candidates.append((sm.OLS(y, design).fit().bic, candidate))

        scored_candidates.sort()
        best_new_score, best_candidate = scored_candidates[0]

        # Stop as soon as no candidate strictly improves the BIC. The previous
        # loop condition kept iterating forever when the best candidate's BIC
        # was exactly equal to the current score.
        if not (current_score > best_new_score):
            break

        best_features.append(best_candidate)
        remaining.remove(best_candidate)
        current_score = best_new_score

    return best_features

def AIC_stepwise_regression(X, y, num_features, threshold_in=0.1, threshold_out=0.1, verbose=True):
    """Forward stepwise selection of OLS regressors driven by p-values.

    Despite its name, the criterion used here is the candidate's coefficient
    p-value, not AIC: in each round every not-yet-selected column is added to
    an OLS model (with intercept) together with the selected columns, and the
    column with the smallest p-value is kept if that p-value is below
    ``threshold_in``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate regressors.
    y : array-like
        Response passed to ``sm.OLS``.
    num_features : int
        Maximum number of columns to select.
    threshold_in : float, optional
        A candidate is added only if its p-value is below this value.
    threshold_out : float, optional
        Removal threshold for the backward step (see note in the body).
    verbose : bool, optional
        Print one line per added / dropped column.

    Returns
    -------
    list
        Selected column names in the order they were added.
    """
    initial_features = X.columns.tolist()
    best_features = []
    while len(best_features) < num_features:
        changed = False

        # forward step
        # Keep the candidates in column order so that ties are broken the same
        # way on every run (a set difference depends on string hashing).
        excluded = [feature for feature in initial_features if feature not in best_features]
        # Explicit float dtype: an index-only Series is object dtype on
        # current pandas, and idxmin() is not supported for object dtype.
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(X[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            best_features.append(best_feature)
            changed = True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        # NOTE(review): the forward step never grows best_features beyond
        # num_features, so this condition is never true and the backward step
        # never runs. Left as written; confirm the intended condition.
        if len(best_features) > num_features:
            model = sm.OLS(y, sm.add_constant(X[best_features])).fit()
            pvalues = model.pvalues.iloc[1:]
            worst_pval = pvalues.max()
            if worst_pval > threshold_out:
                changed = True
                worst_feature = pvalues.idxmax()
                best_features.remove(worst_feature)
                if verbose:
                    print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

        if not changed:
            break

    return best_features


def deviance_forward_selection(X, y, family=sm.families.Gaussian(), num_features=5):
    """Greedy forward selection of GLM regressors by deviance.

    In each round every remaining column is tried by fitting a GLM (with
    intercept) on the already selected columns plus the candidate. The
    candidate with the lowest deviance is kept if it strictly lowers the best
    deviance so far; otherwise the search stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate regressors.
    y : array-like
        Response passed to ``sm.GLM``.
    family : statsmodels family instance, optional
        Error distribution handed to ``sm.GLM``.
    num_features : int, optional
        Maximum number of columns to select.

    Returns
    -------
    list
        Selected column names in the order they were added.
    """
    remaining = X.columns.tolist()
    best_features = []
    current_deviance = float('inf')

    while remaining and len(best_features) < num_features:
        deviance_with_candidates = []
        for candidate in remaining:
            design = sm.add_constant(X[best_features + [candidate]])
            fitted = sm.GLM(y, design, family=family).fit()
            deviance_with_candidates.append((fitted.deviance, candidate))

        deviance_with_candidates.sort()
        best_new_deviance, best_candidate = deviance_with_candidates[0]

        # Without this exit the loop never terminated once no candidate
        # strictly lowered the deviance: nothing was added or removed, so the
        # loop condition stayed true forever.
        if not (current_deviance > best_new_deviance):
            break

        best_features.append(best_candidate)
        remaining.remove(best_candidate)
        current_deviance = best_new_deviance

    return best_features

In [None]:
def vix_long(prices, investment_decisions, risk_free_rate=0):
    """Period-over-period price changes of an always-long position.

    ``investment_decisions`` is only used for the length check and
    ``risk_free_rate`` is not used at all.

    Returns a list with ``len(prices) - 1`` entries, entry ``k`` being
    ``prices[k + 1] - prices[k]``.

    Raises
    ------
    ValueError
        If ``prices`` and ``investment_decisions`` differ in length.
    """
    n_obs = len(prices)
    if n_obs != len(investment_decisions):
        raise ValueError("Length of prices and investment_decisions arrays must be the same")
    return [prices[k] - prices[k - 1] for k in range(1, n_obs)]

def str_1(prices, investment_decisions):
    """Price changes scaled by the previous period's decision.

    Entry ``k`` of the result is
    ``(prices[k + 1] - prices[k]) * investment_decisions[k]``.
    Returns a NumPy array with ``len(prices) - 1`` entries.
    """
    scaled_changes = [
        (prices[k] - prices[k - 1]) * investment_decisions[k - 1]
        for k in range(1, len(prices))
    ]
    return np.array(scaled_changes)

def str_2(prices, investment_decisions):
    """Long/short price changes driven by the previous period's decision.

    For each step ``k >= 1`` the price change ``prices[k] - prices[k - 1]`` is
    kept as is when ``investment_decisions[k - 1] == 1`` and negated when it
    equals 0. Returns a list with ``len(prices) - 1`` entries.

    Raises
    ------
    ValueError
        If a decision used by the loop is neither 0 nor 1.
    """
    signed_changes = []
    for step in range(1, len(prices)):
        delta = prices[step] - prices[step - 1]
        signal = investment_decisions[step - 1]
        if signal == 1:
            # Long position
            signed_changes.append(delta)
            continue
        if signal == 0:
            # Short position
            signed_changes.append(-delta)
            continue
        raise ValueError("Investment decisions must be 0 or 1")
    return signed_changes

def str_3(prices, investment_decisions):
    """Price changes earned only while the previous decision is 0.

    Entry ``k`` of the result is ``prices[k + 1] - prices[k]`` when
    ``investment_decisions[k] == 0`` and 0 otherwise. Returns a list with
    ``len(prices) - 1`` entries.
    """
    return [
        # Hold the index only when the signal for the previous period is 0.
        (prices[k] - prices[k - 1]) if investment_decisions[k - 1] == 0 else 0
        for k in range(1, len(prices))
    ]

def str_4(prices, investment_decisions,probabilities):
    """Probability-gated long/short price changes.

    For each step ``k >= 1`` the change ``prices[k] - prices[k - 1]`` is kept
    when ``probabilities[k] > 0.55``, negated when ``probabilities[k] < 0.45``
    and replaced by 0 otherwise. ``investment_decisions`` is not used.
    Returns a list with ``len(prices) - 1`` entries.

    NOTE(review): the signal is read at index ``k`` here, whereas str_1,
    str_2 and str_3 read their decision at index ``k - 1`` — confirm which
    alignment is intended.
    """
    gated_changes = []
    for k in range(1, len(prices)):
        move = prices[k] - prices[k - 1]
        confidence = probabilities[k]
        # Long above 0.55, short below 0.45, flat in between.
        gated_changes.append(
            move if confidence > 0.55 else (-move if confidence < 0.45 else 0)
        )
    return gated_changes

def str_5(prices, investment_decisions, probabilities):
    """Probability-gated long/short price changes (same rule as str_4).

    Step ``k >= 1`` contributes ``prices[k] - prices[k - 1]`` when
    ``probabilities[k] > 0.55``, the negated change when
    ``probabilities[k] < 0.45`` and 0 otherwise. ``investment_decisions`` is
    not used. Returns a list with ``len(prices) - 1`` entries.

    NOTE(review): the signal is read at index ``k`` here, whereas str_1,
    str_2 and str_3 read their decision at index ``k - 1`` — confirm which
    alignment is intended.
    """
    def _position_return(change, probability):
        if probability > 0.55:
            # Long
            return change
        if probability < 0.45:
            # Short
            return -change
        # No position
        return 0

    return [
        _position_return(prices[k] - prices[k - 1], probabilities[k])
        for k in range(1, len(prices))
    ]


def generate_results(prices, investment_decision, p, monthly_returns, func_name, risk_free_rate=0):
    """Summarise a strategy's period returns into one result row.

    Parameters
    ----------
    prices : sequence of float
        Price series the returns were computed on.
    investment_decision, p, risk_free_rate :
        Unused; kept so existing calls keep working.
    monthly_returns : sequence of float
        Per-period price changes of the strategy (``len(prices) - 1`` items).
    func_name :
        Label inserted into the ``"Strategy"`` field.

    Returns
    -------
    dict
        Keys ``"Strategy"``, ``"Annual Return"``, ``"Annual Risk"``,
        ``"Sharpe Ratio"``, ``"Sortino Ratio"`` and ``"Average Drowdown"``.
        The ratios are ``inf`` / ``nan`` when their denominator is zero.
    """
    # One value per complete 12-period block: the block's summed price change
    # divided by the price at the block's last period. (A first computation
    # that divided by the block's mean price was immediately overwritten by
    # this one and has been removed.)
    annual_percent_returns = []
    for i in range(12, len(prices), 12):
        block_change = sum(monthly_returns[ii] for ii in range(i - 12, i))
        annual_percent_returns.append(block_change / prices[i - 1])

    annual_returns = np.array(annual_percent_returns)
    annual_return = np.mean(annual_returns)
    # NOTE(review): the standard deviation is raised to the power 252/360
    # rather than scaled by a factor — confirm this is intended.
    annual_risk = np.std(annual_returns)**(252/360)
    downside_returns = [min(0, iv) for iv in annual_returns]
    annual_downside_risk = np.sqrt(np.mean(np.square(downside_returns)))

    # A zero denominator (e.g. no negative annual return) yields inf/nan;
    # computing the ratios under errstate keeps those values but avoids the
    # RuntimeWarning that otherwise leaks into the notebook output.
    with np.errstate(divide='ignore', invalid='ignore'):
        sharpe_ratio = (annual_return) / annual_risk
        sortino_ratio = (annual_return) / annual_downside_risk

    # NOTE(review): the "drawdown" is measured against the running maximum of
    # the individual period returns, not of a cumulative value — confirm.
    drawdowns = []
    peak = -np.inf
    for r in monthly_returns:
        peak = max(peak, r)
        drawdowns.append(r - peak)
    average_drawdown = np.mean(drawdowns)

    results = {
        "Strategy": f'Strategy {str(func_name)}',
        "Annual Return": annual_return * 100,
        "Annual Risk": annual_risk * 100,
        "Sharpe Ratio": sharpe_ratio,
        "Sortino Ratio": sortino_ratio,
        # Key spelling kept byte-for-byte: it becomes a column name in the result tables.
        "Average Drowdown": average_drawdown
    }
    return results

In [None]:
# Select up to 20 predictors with each of the three stepwise procedures.
features_aic = AIC_stepwise_regression(X_train, y_train, num_features=20, threshold_in=0.5, threshold_out=0.5)
features_bic =  BIC_stepwise_regression(X_train, y_train, num_features=20)
features_deviance = deviance_forward_selection(X_train, y_train, family=sm.families.Gaussian(), num_features=20)

# Walk-forward logistic regression on each selected feature set; the results
# are stored as extra columns next to the single-predictor models.
df_pred["AIC"], df_prob["AIC"] = incremental_training(X_train[features_aic], y_train, X_test[features_aic], y_test)
df_pred["BIC"], df_prob["BIC"] = incremental_training(X_train[features_bic], y_train, X_test[features_bic], y_test)
df_pred["Deviance"], df_prob["Deviance"] = incremental_training(X_train[features_deviance], y_train, X_test[features_deviance], y_test)

# Second name for the same DataFrame object (this is not a copy).
df_predd = df_pred

Add  cboe_vix_lag_1                 with p-value 0.0116303
Add  MACVOL_fred                    with p-value 0.000861532
Add  Dollar_Index_fred              with p-value 0.0385274
Add  Oil_Price_fred                 with p-value 0.0261316
Add  TED_Spread_fred                with p-value 0.0709975
Add  Credit_Spread_fred             with p-value 0.0233913
Add  ECU_epu                        with p-value 0.0630172
Add  HML_fama                       with p-value 0.123275
Add  MOM_fama                       with p-value 0.182549
Add  QMJ_aqr                        with p-value 0.214282
Add  MACVOL_annualized_fred         with p-value 0.216612
Add  TS_MOM_aqr                     with p-value 0.285215
Add  cboe_vix_lag_3                 with p-value 0.308118
Add  Tbill_3mo_fred                 with p-value 0.454553
Add  BB_Ratio_Aver_aaii             with p-value 0.476687
Add  MAVB_8week_aaii                with p-value 0.365176
Add  MOM_FI_aqr                     with p-value 0.499219
Add  

In [None]:
# Repeats the per-prefix walk-forward fits. df_pred / df_prob are not
# re-created here, so columns they already hold (e.g. "AIC", "BIC",
# "Deviance") are kept and only the four prefix columns are overwritten.
names = ['cboe_vix_lag', 'MKT_fama', 'HMLD_aqr', 'MOM_EQ_aqr']
for i in names:
    # Both selections iterate X_train.columns so train and test share the
    # same columns in the same order.
    X_tr = X_train[[col for col in X_train.columns if col.startswith(i)]]
    X_te = X_test[[col for col in X_train.columns if col.startswith(i)]]

    df_pred[i], df_prob[i] = incremental_training(X_tr, y_train, X_te, y_test)


## Table itself

In [None]:
def table_six(full_name, table_6_df):
    """Append the five strategy result rows of one model to the Table 6 frame.

    Reads the notebook-level frames ``spx`` (prices), ``df_pred``
    (predictions) and ``df_prob`` (``predict_proba`` rows).

    Parameters
    ----------
    full_name : str
        Column of ``df_pred`` / ``df_prob`` holding the model's output.
    table_6_df : pandas.DataFrame
        Rows accumulated so far.

    Returns
    -------
    pandas.DataFrame
        ``table_6_df`` followed by five new rows (Strategy 1-5). The new rows
        keep their own 0..4 index labels.
    """
    vix_prices = spx['VIX_cboe'].values
    spx_prices = spx['Close'].values
    decisions = df_pred[full_name].values
    # First entry of every predict_proba row, extracted once.
    # NOTE(review): presumably the probability of class 0 — confirm against model.classes_.
    first_col_probs = [item[0] for item in df_prob[full_name]]

    # (strategy number, prices, decisions, probabilities, per-period returns)
    runs = [
        (1, vix_prices, decisions, [], str_1(vix_prices, decisions)),
        (2, vix_prices, decisions, [], str_2(vix_prices, decisions)),
        (3, spx_prices, decisions, [], str_3(spx_prices, decisions)),
        (4, vix_prices, decisions, first_col_probs, str_4(vix_prices, decisions, first_col_probs)),
        (5, spx_prices, [], first_col_probs, str_5(spx_prices, [], first_col_probs)),
    ]

    # Display name: fixed label for the VIX lags, otherwise the column name
    # without its last "_suffix".
    if full_name == 'cboe_vix_lag':
        method = 'VIX Lagged'
    elif full_name.rfind('_') != -1:
        method = full_name[:full_name.rfind('_')]
    else:
        method = full_name

    strategies = [
        {'method': method, **generate_results(prices, decs, probs, returns, number)}
        for number, prices, decs, probs, returns in runs
    ]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    return pd.concat([table_6_df, pd.DataFrame(strategies)])

# Baseline row: always-long VIX. The prediction column is passed to vix_long
# only because that function checks that both inputs have the same length.
vix_long_res = generate_results(spx['VIX_cboe'].values,[], [], vix_long(spx['VIX_cboe'].values, df_pred['cboe_vix_lag'].values), 0)
table_6_df = pd.DataFrame([{'method': 'VIX long', **vix_long_res}])
# Five strategy rows per model column of df_pred.
for i in df_pred.columns:
    table_6_df = table_six(i, table_6_df)
# reset_index keeps the old per-model row labels as an 'index' column.
table_6_df = table_6_df.reset_index(); table_6_df.round(2)

  sortino_ratio = (annual_return) / annual_downside_risk


Unnamed: 0,index,method,Strategy,Annual Return,Annual Risk,Sharpe Ratio,Sortino Ratio,Average Drowdown
0,0,VIX long,Strategy 0,-8.04,62.97,-0.13,-0.18,-14.17
1,0,VIX Lagged,Strategy 1,5.04,49.03,0.1,0.25,-11.23
2,1,VIX Lagged,Strategy 2,18.11,75.67,0.24,0.56,-21.25
3,2,VIX Lagged,Strategy 3,-1.9,27.69,-0.07,-0.12,-88.67
4,3,VIX Lagged,Strategy 4,-63.25,79.78,-0.79,-0.66,-11.94
5,4,VIX Lagged,Strategy 5,-1.11,17.52,-0.06,-0.17,-110.36
6,0,MKT,Strategy 1,41.35,46.38,0.89,23.69,-12.83
7,1,MKT,Strategy 2,90.74,83.17,1.09,inf,-19.96
8,2,MKT,Strategy 3,2.81,18.34,0.15,0.46,-93.45
9,3,MKT,Strategy 4,-26.59,46.43,-0.57,-0.64,-11.67


# Table 7

In [None]:
'''Load data and adjust to the required format'''
import pandas as pd
import numpy as np

def add_lags(df, column_name, max_lag):
    """Add lagged copies of one column to ``df`` in place.

    For every ``k`` in ``1..max_lag`` a column named
    ``f'{column_name}_lag_{k}'`` is written, holding ``df[column_name]``
    shifted down by ``k`` rows. Returns ``None``.
    """
    source = df[column_name]
    for offset in range(1, max_lag + 1):
        df[f'{column_name}_lag_{offset}'] = source.shift(offset)

df = pd.read_csv("final_data.csv")

# Convert 'date' to datetime and filter based on date range
df['date'] = pd.to_datetime(df['date'])
start_date = '1990-01-01'
end_date = '2019-12-31'
# Take an explicit copy: columns are added to filtered_df below, and writing
# to a filtered slice of df triggers SettingWithCopyWarning.
filtered_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)].copy()

# Add 3 lags for each column except 'date', 'vix_classifier', and 'VIX_cboe'.
# NOTE(review): VIX_cboe itself is excluded, so no VIX lag features are built
# in this cell — confirm that is intended.
columns_to_lag = filtered_df.columns.difference(['date', 'vix_classifier', 'VIX_cboe'])
for column in columns_to_lag:
    add_lags(filtered_df, column, 3)

# Label: 1 when VIX is above its previous row's value, else 0.
filtered_df['vix_classifier'] = (filtered_df['VIX_cboe'] > filtered_df['VIX_cboe'].shift(1)).astype(int)

# Train on 1991-2003, test on everything after 2003.
train_df = filtered_df[(filtered_df['date'] >= '1991-01-01') & (filtered_df['date'] <= '2003-12-31')]
test_df = filtered_df[filtered_df['date'] > '2003-12-31']

X_train = train_df.drop(columns=['vix_classifier', 'date', 'VIX_cboe'])
y_train = train_df['vix_classifier']
X_test = test_df.drop(columns=['vix_classifier', 'date', 'VIX_cboe'])
y_test = test_df['vix_classifier']


In [None]:
# Feature names used for the Table 7 walk-forward model.
elastic_net_a05_names = [
    "MOM_EQ_aqr", "MKT_fama", "RVAR_fama", "Oil_Price_fred",
    "QMJ_aqr", "HMLD_aqr", "Skew_cboe", "MOM_FI_aqr", "TS_MOM_aqr",
    "MOM_FX_aqr", "ECU_epu", "Term_Spread_fred"
]

# Start from empty result frames; earlier model columns are discarded.
df_pred = pd.DataFrame()
df_prob = pd.DataFrame()
# Both selections iterate X_train.columns so train and test share column order.
X_tr = X_train[[col for col in X_train.columns if col in elastic_net_a05_names]]
X_te = X_test[[col for col in X_train.columns if col in elastic_net_a05_names]]

df_pred['for_7'], df_prob['for_7'] = incremental_training(X_tr, y_train, X_te, y_test)

# Seed Table 7 with row 0 of Table 6. DataFrame.append was removed in
# pandas 2.0, so the one-row frame is taken by label instead.
table_7_df = table_6_df.loc[[0]].copy()

In [None]:
pd.set_option('display.max_rows', 66)


def table_78(strat_name, full_name, table_df, preds, probs):
    """Append one model's strategy result rows to a Table 7/8 frame.

    Reads the notebook-level ``spx`` frame for the VIX and S&P 500 prices.

    Parameters
    ----------
    strat_name : str
        Model name written to the ``method`` column. Names starting with
        'Mean', 'Median' or 'Grad' only get Strategies 1-3.
    full_name :
        Unused; kept so existing calls keep working.
    table_df : pandas.DataFrame
        Rows accumulated so far; returned unchanged in front of the new rows.
    preds : sequence
        Predicted labels; they are logically inverted before use.
    probs : sequence
        ``predict_proba`` rows; only the first entry of each row is used.

    Returns
    -------
    pandas.DataFrame
        ``table_df`` followed by the new rows.
    """
    preds = [not x for x in preds]
    vix_prices = spx['VIX_cboe'].values
    spx_prices = spx['Close'].values

    # (strategy number, prices, probabilities, per-period returns)
    runs = [
        (1, vix_prices, [], str_1(vix_prices, preds)),
        (2, vix_prices, [], str_2(vix_prices, preds)),
        # Strategy 3 is the S&P 500 rule (str_3), as in table_six; it was
        # previously computed with str_2.
        (3, spx_prices, [], str_3(spx_prices, preds)),
    ]

    if not strat_name.startswith(('Mean', 'Median', 'Grad')):
        first_col_probs = [item[0] for item in probs]
        runs += [
            (4, vix_prices, first_col_probs, str_4(vix_prices, preds, first_col_probs)),
            # Strategy 5 trades the S&P 500: its returns must be computed on the
            # same Close prices they are scaled by (VIX prices were used before).
            (5, spx_prices, first_col_probs, str_5(spx_prices, preds, first_col_probs)),
        ]

    new_rows = pd.DataFrame([
        {'method': strat_name, **generate_results(prices, preds, row_probs, returns, number)}
        for number, prices, row_probs, returns in runs
    ])

    # Negate the return of the rows added by this call only. Negating the
    # whole accumulated table flipped the sign of earlier rows again on every
    # later call, so a row's sign depended on how many models followed it.
    # NOTE(review): after negation 'Annual Return' has the opposite sign of the
    # Sharpe/Sortino ratios in the same row — confirm the negation is wanted.
    new_rows['Annual Return'] = new_rows['Annual Return'] * -1

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    return pd.concat([table_df, new_rows])

# Baseline row: row 0 of Table 6 without the helper 'index' column.
table_c = table_6_df.drop(columns='index')
table_c = table_c.loc[0].copy()
# NOTE(review): the baseline row is relabelled 'Ridge' — confirm this is intended.
table_c['method'] = 'Ridge'

# One-row frame built from the Series. DataFrame.append was removed in
# pandas 2.0; infer_objects restores numeric dtypes so that rounding and
# arithmetic on the columns keep working.
table_7_df = table_c.to_frame().T.infer_objects()

results_df = pd.DataFrame()
results_df, all_peds, all_pobs = all_models(X_train, X_test, y_train, y_test, elastic_net_a05, results_df)

# Append the strategy rows of every fitted model.
for i in all_peds.keys():
    table_7_df = table_78(i, 'for_7', table_7_df, all_peds[i], all_pobs[i])

table_7_df.reset_index().round(2)

Unnamed: 0,index,method,Strategy,Annual Return,Annual Risk,Sharpe Ratio,Sortino Ratio,Average Drowdown
0,0,Ridge,Strategy 0,8.04,62.97,-0.13,-0.18,-14.17
1,0,Ridge,Strategy 1,-25.7,56.17,0.46,1.06,-11.44
2,1,Ridge,Strategy 2,-59.44,95.58,0.62,1.24,-20.48
3,2,Ridge,Strategy 3,-0.2,28.51,0.01,0.02,-158.11
4,3,Ridge,Strategy 4,86.82,81.82,-1.06,-0.76,-14.26
5,4,Ridge,Strategy 5,0.99,4.75,-0.21,-0.67,-14.26
6,0,Lasso,Strategy 1,20.75,57.24,0.36,0.84,-11.54
7,1,Lasso,Strategy 2,49.55,92.61,0.54,1.01,-20.67
8,2,Lasso,Strategy 3,2.92,35.74,0.08,0.3,-151.36
9,3,Lasso,Strategy 4,-85.75,75.72,-1.13,-0.79,-11.59


# Table 8

In [None]:
# Feature names used for the Table 8 walk-forward model.
elastic_net_a075_names = [
    "MOM_EQ_aqr", "MKT_fama", "RVAR_fama", "Oil_Price_fred",
    "QMJ_aqr", "Skew_cboe", "HMLD_aqr", "TS_MOM_aqr", "MOM_FX_aqr",
    "MOM_FI_aqr", "Term_Spread_fred", "ECU_epu"
]

# Start from empty result frames; earlier model columns are discarded.
df_pred = pd.DataFrame()
df_prob = pd.DataFrame()
# Both selections iterate X_train.columns so train and test share column order.
X_tr = X_train[[col for col in X_train.columns if col in elastic_net_a075_names]]
X_te = X_test[[col for col in X_train.columns if col in elastic_net_a075_names]]

df_pred['for_8'], df_prob['for_8'] = incremental_training(X_tr, y_train, X_te, y_test)
# Seed Table 8 with row 0 of Table 6. DataFrame.append was removed in
# pandas 2.0, so the one-row frame is taken by label instead.
table_8_df = table_6_df.loc[[0]].copy()

In [None]:
# One-row baseline frame built from the table_c Series. DataFrame.append was
# removed in pandas 2.0; infer_objects restores numeric dtypes so that
# rounding and arithmetic on the columns keep working.
table_8_df = table_c.to_frame().T.infer_objects()

results_df = pd.DataFrame()
# NOTE(review): this Table 8 cell passes `elastic_net_a05` and the label
# 'for_7', exactly like the Table 7 cell, and the displayed rows of both
# tables are identical — confirm whether the alpha = 0.75 inputs were meant.
results_df, all_peds, all_pobs = all_models(X_train, X_test, y_train, y_test, elastic_net_a05, results_df)

# Append the strategy rows of every fitted model.
for i in all_peds.keys():
    table_8_df = table_78(i, 'for_7', table_8_df, all_peds[i], all_pobs[i])

table_8_df.reset_index().round(2)

Unnamed: 0,index,method,Strategy,Annual Return,Annual Risk,Sharpe Ratio,Sortino Ratio,Average Drowdown
0,0,Ridge,Strategy 0,8.04,62.97,-0.13,-0.18,-14.17
1,0,Ridge,Strategy 1,-25.7,56.17,0.46,1.06,-11.44
2,1,Ridge,Strategy 2,-59.44,95.58,0.62,1.24,-20.48
3,2,Ridge,Strategy 3,-0.2,28.51,0.01,0.02,-158.11
4,3,Ridge,Strategy 4,86.82,81.82,-1.06,-0.76,-14.26
5,4,Ridge,Strategy 5,0.99,4.75,-0.21,-0.67,-14.26
6,0,Lasso,Strategy 1,20.75,57.24,0.36,0.84,-11.54
7,1,Lasso,Strategy 2,49.55,92.61,0.54,1.01,-20.67
8,2,Lasso,Strategy 3,2.92,35.74,0.08,0.3,-151.36
9,3,Lasso,Strategy 4,-85.75,75.72,-1.13,-0.79,-11.59
