<font size="6">**Model Building: Linear**</font>

In [1]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, roc_auc_score, confusion_matrix)
from sklearn.model_selection import (TimeSeriesSplit, GridSearchCV)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import KBinsDiscretizer

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%run ../nb_config.py

running notebook configuration


In [4]:
from src.load_data import market_data
from src.mle import time_series as mle_ts
from src import utils

ModuleNotFoundError: No module named 'quandl'

# Parameters

In [None]:
TAU_TARGET = 5  # passed as `tau` to mle_ts.get_targets when building the target
KFOLDS = 3  # number of TimeSeriesSplit folds used in cross-validation
RND_SEED = 123  # random_state for the LogisticRegression models
SPLIT_DT = '2019-12-31'  # date label separating the train and test slices
DATA_END = '2020-06-30'  # last date label kept when loading the market data

# Load Data

In [None]:
# Brent and WTI columns of the market data, truncated at DATA_END.
mkt_raw = market_data.read_mkt_data()
comm_df = mkt_raw.loc[:DATA_END, ['brent', 'wti']]

# Target derived from the Brent column with tau=TAU_TARGET; the resulting
# 'brent' column is exposed under the name 'target'.
brent_hist = comm_df.loc[:DATA_END, ['brent']]
target = mle_ts.get_targets(y=brent_hist, tau=TAU_TARGET)
target = target.rename(columns={'brent': 'target'})

# Inner join: keep only the dates present in both the prices and the target.
comm_df = comm_df.join(target, how='inner')

In [None]:
# Binary label: 1 when the raw target is non-negative, else 0.
# NOTE(review): NaN targets (if get_targets leaves any) compare False and
# therefore become class 0 — confirm this is intended.
target['target_bin'] = (target['target'] >= 0).astype(int)

# Rolling 60-row z-score of the latest target value within its own window.
# raw=True hands the lambda a plain ndarray, so mean/std are the numpy ones
# (std with ddof=0). A window with zero spread evaluates 0/0 -> NaN; those
# NaNs are then back-filled from the next valid value.
# `.bfill()` replaces `fillna(method='bfill')`, which is deprecated in
# pandas >= 2.1; the result is the same.
target['target_w'] = (
    target['target']
    .rolling(60, min_periods=1)
    .apply(lambda x: (x[-1] - x.mean()) / x.std(), raw=True)
    .bfill()
)

In [None]:
# Calendar flag: 1 on rows whose index date is a month end, else 0.
comm_df['x_is_eom'] = comm_df.index.is_month_end.astype(int)

In [None]:
# Yearly seasonality: one harmonic of the ISO week number, stored as two columns.
# NOTE(review): the period is 54 while ISO week numbers run 1..53 — confirm.
# NOTE(review): the cos/sin column order is assumed from get_wave_features — confirm.
yr_cols = ['x_cos_yr', 'x_sin_yr']
iso_week = comm_df.index.isocalendar().week
yr_waves = mle_ts.get_wave_features(iso_week, periods=[54], n_harmonics=1)
seas_yr_feat = pd.DataFrame(data=yr_waves.values, index=comm_df.index, columns=yr_cols)
comm_df[yr_cols] = seas_yr_feat

In [None]:
# Weekly seasonality: one harmonic of the day-of-week number with period 5,
# stored as two columns.
# NOTE(review): the cos/sin column order is assumed from get_wave_features — confirm.
wk_cols = ['x_cos_wk', 'x_sin_wk']
weekday = comm_df.index.dayofweek
wk_waves = mle_ts.get_wave_features(weekday, periods=[5], n_harmonics=1)
seas_wk_feat = pd.DataFrame(data=wk_waves.values, index=comm_df.index, columns=wk_cols)
comm_df[wk_cols] = seas_wk_feat

In [None]:
# Brent-WTI spread. Assign with a scalar column key: with a list key
# (`comm_df[['brent_vs_wti']] = <Series>`) current pandas compares the Series
# length against the number of keys and raises
# "Columns must be same length as key".
comm_df['brent_vs_wti'] = comm_df['brent'] - comm_df['wti']

# Rolling z-score of the spread over a 60-row window (helper from
# src.mle.time_series). The list key is kept here because the helper receives
# a one-column DataFrame — presumably it returns one as well; TODO confirm.
comm_df[['x_brent_vs_wti_zscored']] = mle_ts.z_score(x=comm_df[['brent_vs_wti']], win_size=60, min_periods=1, fillna=True)
 


# Data Split

In [None]:
# Candidate feature columns kept in the train/test frames.
raw_features = ['x_is_eom', 'x_cos_yr', 'x_sin_yr', 'x_cos_wk', 'x_sin_wk',
                'x_brent_vs_wti_zscored',
                'brent']

# Label-based slicing is inclusive on both ends, so `.loc[:SPLIT_DT]` and
# `.loc[SPLIT_DT:]` both contain the SPLIT_DT row whenever that date exists in
# the index, which puts the same observation in train and test.
# The test slices stay as `.loc[SPLIT_DT:]` (other cells slice the test period
# the same way), and train is defined as "everything not in the test slice".
# NOTE(review): the target is built with tau=TAU_TARGET; if it looks forward,
# the last train rows still overlap the test period — consider purging them.
is_train_X = ~comm_df.index.isin(comm_df.loc[SPLIT_DT:].index)
is_train_y = ~target.index.isin(target.loc[SPLIT_DT:].index)

X_train = comm_df.loc[is_train_X, raw_features]
X_test = comm_df.loc[SPLIT_DT:, raw_features]
y_train = target.loc[is_train_y, 'target_bin']
y_test = target.loc[SPLIT_DT:, 'target_bin']
w_train = target.loc[is_train_y, 'target_w']
w_test = target.loc[SPLIT_DT:, 'target_w']

In [None]:
y_train.value_counts(normalize=True)

In [None]:
y_test.value_counts(normalize=True)

In [None]:
# Time-ordered CV splitter with KFOLDS folds; passed as `cv=` to the grid search.
tscv = TimeSeriesSplit(n_splits=KFOLDS)
# One-shot generator of (train_idx, test_idx) pairs over y_train; not consumed
# by any of the visible cells.
sp_tscv  = tscv.split(y_train)


# Feature Engineering

In [None]:
# Indicator transformers from src.mle.time_series, each applied to the Brent
# price column only.
fte_macd = mle_ts.MACD(short_tau=3, long_tau=20, zscore_tau=90)
fte_mom1 = mle_ts.Momentum(tau=10, zscore_tau=30, degree=1)
fte_mom2 = mle_ts.Momentum(tau=10, zscore_tau=30, degree=2)
fte_diff = mle_ts.Diff(tau=3, zscore_tau=30)
fte_volat = mle_ts.Volatility(tau=10, zscore_tau=60)

# (step name, transformer) pairs in output-column order.
brent_indicators = (
    ('macd', fte_macd),
    ('mom1', fte_mom1),
    ('mom2', fte_mom2),
    ('diff', fte_diff),
    ('volat', fte_volat),
)

# Any input column other than 'brent' is passed through untouched.
fte_ct = ColumnTransformer(
    [(step_name, step, ['brent']) for step_name, step in brent_indicators],
    remainder='passthrough',
)


In [None]:
# Step names of the ColumnTransformer, reused as labels for its output columns.
# NOTE(review): assumes each transformer emits exactly one column — confirm.
feature_names = [step_name for step_name, *_ in fte_ct.transformers]

In [None]:
# Spearman rank correlation between the engineered Brent features, computed on
# the training slice only.
fte_train = pd.DataFrame(fte_ct.fit_transform(X_train[['brent']]))
features_corr = fte_train.corr(method='spearman')
features_corr.index = feature_names
features_corr.columns = feature_names

sns.heatmap(features_corr, annot=True)
plt.title('Multicollinearity: Spearman Corr')
plt.show()

# Model Building

In [None]:
# Pipeline: engineered features -> uniform-width ordinal bins -> L2 logistic regression.
discr = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
model = LogisticRegression(fit_intercept=True, penalty='l2', random_state=RND_SEED)
pl = Pipeline([('fte', fte_ct), ('discr', discr), ('model', model)])

# Search grid over the number of bins and the inverse regularisation strength C.
grid_hparams = dict(
    discr__n_bins=[3, 5, 10, 20],
    model__C=[0.0001, 0.001, 0.01, 0.1, 1.],
)
cv_metrics = ('f1', 'roc_auc', 'accuracy')

# All three metrics are recorded; the best-F1 configuration is refit on the
# full training slice.
pl_cv = GridSearchCV(
    estimator=pl,
    param_grid=grid_hparams,
    scoring=cv_metrics,
    cv=tscv,
    refit='f1',
    n_jobs=-1,
)
pl_cv.fit(X_train[['brent']], y_train)

In [None]:
from sklearn import set_config
set_config(display='diagram')

In [None]:
pl

In [None]:
# Grid-search results as a table, best F1 rank first; show the top ten rows.
cv_table = pd.DataFrame(pl_cv.cv_results_)
pl_cv_res = cv_table.sort_values(by='rank_test_f1')
pl_cv_res.head(10)

In [None]:
# Mean and standard deviation of the CV F1 score for every grid point,
# ordered by F1 rank (two side-by-side panels sharing the y axis).
f1_ranked = pl_cv_res.sort_values(by='rank_test_f1')
f1_ranked.plot.barh(
    x='params',
    y=['mean_test_f1', 'std_test_f1'],
    subplots=True,
    layout=(1, 2),
    sharey=True,
)
plt.show()

In [None]:
# Mean and standard deviation of the CV accuracy for the ten best grid points
# by accuracy rank (two side-by-side panels sharing the y axis).
acc_top10 = pl_cv_res.sort_values(by='rank_test_accuracy').iloc[:10]
acc_top10.plot.barh(
    x='params',
    y=['mean_test_accuracy', 'std_test_accuracy'],
    subplots=True,
    layout=(1, 2),
    sharey=True,
)
plt.show()

In [None]:
pl_cv_res.loc[pl_cv_res['rank_test_f1']==1].filter(regex=r'(mean|std)_test_')

The best-F1 model also achieves a good accuracy score; in addition, its standard deviations are the lowest.

## Fit Champion Model

In [None]:
pl_cv.best_estimator_[-1].coef_

In [None]:
# Baseline: same pipeline steps with C=1.
# NOTE(review): `fte_ct` and `discr` are the same objects used by the other
# pipelines in this notebook, so every `.fit` call refits them in place.
model = LogisticRegression(C=1, penalty='l2', fit_intercept=True, random_state=RND_SEED)

base_steps = [('fte', fte_ct), ('discr', discr), ('model', model)]
base_mod = Pipeline(base_steps)

base_mod.fit(X_train[['brent']], y_train)

In [None]:
# Champion: same pipeline steps with stronger regularisation (C=0.01).
# NOTE(review): C and the discretiser's n_bins are hard-coded here rather than
# read from the grid-search result — confirm they match the selected model.
model = LogisticRegression(C=0.01, penalty='l2', fit_intercept=True, random_state=RND_SEED)

champ_steps = [('fte', fte_ct), ('discr', discr), ('model', model)]
champ_mod = Pipeline(champ_steps)

champ_mod.fit(X_train[['brent']], y_train)

# Evaluation

In [None]:
def get_preds(X: pd.DataFrame, model, name='preds'):
    """Return the second `predict_proba` column as a Series aligned with `X`.

    Parameters
    ----------
    X : pd.DataFrame
        Feature rows to score; its index becomes the index of the result.
    model
        Any fitted object exposing `predict_proba(X)` that returns a 2-D array.
    name : str, default 'preds'
        Name given to the returned Series.

    Returns
    -------
    pd.Series
        Column 1 of `model.predict_proba(X)`, indexed like `X`.
    """
    second_class_proba = model.predict_proba(X)[:, 1]
    return pd.Series(second_class_proba, index=X.index, name=name)

In [None]:
# Champion-model probabilities (second predict_proba column) for both splits;
# only the 'brent' column is fed to the pipeline.
p_train = get_preds(X_train[['brent']], champ_mod)
p_test = get_preds(X_test[['brent']], champ_mod)


In [None]:
(p_train>=0.5).value_counts(normalize=True)

In [None]:
(p_test>=0.5).value_counts(normalize=True)

In [None]:
accuracy_score(y_test, p_test>=0.5)

In [None]:
confusion_matrix(y_train, p_train>=0.5, normalize='all')

In [None]:
confusion_matrix(y_test, p_test>=0.5, normalize='all')

In [None]:
roc_auc_score(y_train, p_train)

In [None]:
roc_auc_score(y_test, p_test)

## Calibration

In [None]:
# Quintile bucket (integer codes 0-4) of the rolling target z-score, with bin
# edges computed separately for each split.
q_train = pd.qcut(w_train, 5, labels=False)
q_test = pd.qcut(w_test, 5, labels=False)

In [None]:
# Column-wise concat aligned on the index. Resulting columns:
# 'target_w' (z-score bucket), 'preds' (probability), 'target_bin' (label).
y_train_df = pd.concat([q_train, p_train, y_train], axis=1)
y_test_df = pd.concat([q_test, p_test, y_test], axis=1)

In [None]:
# Quintile bucket (integer codes 0-4) of the predicted probability, with bin
# edges computed separately for each split.
y_train_df['q_preds'] = pd.qcut(p_train, 5, labels=False)
y_test_df['q_preds'] = pd.qcut(p_test, 5, labels=False)

In [None]:
# Attach the raw (continuous) target; the assignment aligns on each frame's index.
y_train_df['target'] = target.loc[:SPLIT_DT, 'target']
y_test_df['target'] = target.loc[SPLIT_DT:, 'target']

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(16, 10))
train_by_bucket = y_train_df.groupby('q_preds')

# Top panel: median raw target per predicted-probability quantile.
ax = axs[0]
train_by_bucket['target'].median().plot.bar(ax=ax)
ax.set_title("Model Post-Mortem Analysis: Train")
ax.set_ylabel("%")

# Bottom panel: mean of the binary label (share of positives) per quantile.
ax = axs[1]
train_by_bucket['target_bin'].mean().plot.bar(ax=ax)
ax.set_xlabel("Predicted Probability Quantiles")
ax.set_ylabel("event prop")
plt.show()

In [None]:
# Overlaid density histograms of the raw target: train vs. test.
hist_style = dict(density=True, alpha=0.5)
ax = y_train_df['target'].plot.hist(label='train', **hist_style)
y_test_df['target'].plot.hist(label='test', ax=ax, **hist_style)
plt.title('target (raw)')
plt.legend()
plt.show()

In [None]:
mle_ts.run_adf_test(y_train_df[['target']])

In [None]:
# Row count and median raw target per binary label (train).
# String aggregator names replace np.size / np.median: passing the numpy
# callables to `agg` is deprecated in pandas >= 2.1, and once pandas calls them
# directly np.median would no longer skip NaN. Output column names are unchanged.
y_train_df.groupby('target_bin')['target'].agg(['size', 'median'])

In [None]:
# Row count and median raw target per binary label (test).
# String aggregator names replace np.size / np.median: passing the numpy
# callables to `agg` is deprecated in pandas >= 2.1, and once pandas calls them
# directly np.median would no longer skip NaN. Output column names are unchanged.
y_test_df.groupby('target_bin')['target'].agg(['size', 'median'])

In [None]:
# Overlaid density histograms of the predicted probabilities: train vs. test.
hist_style = dict(density=True, alpha=0.5)
ax = y_train_df['preds'].plot.hist(label='train', **hist_style)
y_test_df['preds'].plot.hist(label='test', ax=ax, **hist_style)
plt.title('predicted probabilites')
plt.legend()
plt.show()

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(16, 10))
test_by_bucket = y_test_df.groupby('q_preds')

# Top panel: median raw target per predicted-probability quantile.
ax = axs[0]
test_by_bucket['target'].median().plot.bar(ax=ax)
ax.set_title("Model Post-Mortem Analysis: Test")
ax.set_ylabel("target (%)")

# Bottom panel: mean of the binary label (share of positives) per quantile.
ax = axs[1]
test_by_bucket['target_bin'].mean().plot.bar(ax=ax)
ax.set_xlabel("Predicted Probability Quantiles")
ax.set_ylabel("event prop")

plt.show()

In [None]:
y_test_df.loc[y_test_df['q_preds']==0, 'target_bin'].value_counts()

In [None]:
y_test_df.head()

In [None]:
y_test_df['q_preds'].value_counts()

In [None]:
# Probe of test-period Brent prices filtered on the binary label.
# `brent_test` was otherwise first assigned in a later cell, so this cell
# raised NameError on a fresh Restart & Run All; build the series here.
# NOTE(review): 'target_bin' only takes the values 0/1, so `== 3` selects no
# rows — possibly 'q_preds' was intended; confirm or drop this cell.
brent_test = comm_df.loc[SPLIT_DT:, 'brent']
brent_test.loc[y_test_df['target_bin']==3]

In [None]:
# Test-period Brent price with markers on the dates falling in the lowest
# (short, 'v') and highest (long, '^') predicted-probability quantiles.
brent_test = comm_df.loc[SPLIT_DT:, 'brent']
mask_short = y_test_df['q_preds']==0
mask_long = y_test_df['q_preds']==4

ax = brent_test.plot()
brent_test.loc[mask_short].plot(ax=ax, style='v')
brent_test.loc[mask_long].plot(ax=ax, style='^')

# One vertical line per trade: green when the realised label agrees with the
# trade direction, red otherwise. Shorts are drawn first, then longs.
trade_rules = (
    (mask_short, lambda label: label <= 0),  # short wins when the label is 0
    (mask_long, lambda label: label > 0),    # long wins when the label is 1
)
for trade_mask, is_win in trade_rules:
    for idx, row in y_test_df.loc[trade_mask].iterrows():
        _color = 'green' if is_win(row['target_bin']) else 'red'
        ax.axvline(x=idx, linewidth=1, alpha=0.5, color=_color)

ax.set_ylabel('USD/bbl')
ax.set_title('Brent Trades on Predicted Prob Quantiles = {0, 4}')
plt.show()

# Interpretation

In [None]:
# Champion-model logistic-regression coefficients, labelled with the feature
# step names and sorted ascending.
champ_coefs = champ_mod[-1].coef_.ravel()
features_imp = pd.Series(champ_coefs, index=feature_names).sort_values()

In [None]:
features_imp

In [None]:
# Base-model logistic-regression coefficients, labelled with the feature step
# names and sorted ascending.
base_coefs = base_mod[-1].coef_.ravel()
base_mod_feat_imp = pd.Series(base_coefs, index=feature_names).sort_values()
base_mod_feat_imp

In [None]:
# Side-by-side coefficient bar charts: champion (left) vs. base model (right).
fig, axs = plt.subplots(1, 2, sharex=True)

plt.suptitle('Feature Importance')
ax = axs[0]
ax.set_title('Champion Model (C=0.01)')
features_imp.plot.barh(ax=ax)

ax = axs[1]
base_mod_feat_imp.plot.barh(ax=ax)
ax.set_title('Base Model (C=1)')

# Label every panel through its own Axes. `plt.xlabel` only targets the
# current axes, so calling it in the loop labelled the last panel twice and
# left the first one unlabelled.
for ax in axs:
    ax.set_xlabel('Coefficient Value')
plt.show()