###### <img src="Electronic_Brain.png" width="200" style="float:left">
<h1> Spring 2021 ML Course.</h1>
<h2> Exercise 10: Gradient Boosting Regression<br>Tools: CatBoost</h2>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import linalg as LA

from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoost, CatBoostRegressor
from icecream import ic

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 10px;">
<h1> &nbsp; Section A: Signal Generation.</h1>

In [None]:
def create_signal(noise_sigma=0.25, phase=np.pi/4, num_cycles=10, max_lookback=1000, feature_jump=29, look_forward=500,
                  seed=None):
    """Generate a noisy sine signal plus two candidate feature sets and a regression target.

    The clean signal is sin(X), sampled at 1000*num_cycles evenly spaced points on [0, num_cycles*pi].
    Zero-mean Gaussian noise with standard deviation noise_sigma*|sin(X + phase)| is added on top.
    Both the clean and the noisy series are shifted up by 10 before any logarithm is taken.

    Parameters
    ----------
    noise_sigma : float
        Scale of the noise standard deviation.
    phase : float
        Phase offset (radians) of the noise envelope relative to the signal.
    num_cycles : int
        The time axis spans num_cycles*pi radians and holds 1000*num_cycles samples.
    max_lookback : int
        Log-gain ("Ronny") features use lags strictly smaller than max_lookback + 1.
    feature_jump : int
        Spacing between consecutive log-gain lags (lags are 1, 1+feature_jump, 1+2*feature_jump, ...).
    look_forward : int
        Length of the forward-looking window used for 'future_mean' / 'future_std'.
    seed : int or None
        If given, the noise is drawn from a dedicated np.random.default_rng(seed) generator, which makes
        the returned frame reproducible. If None (default), the global np.random state is used, exactly
        as before this parameter existed.

    Returns
    -------
    (sig_df, features)
        sig_df is a DataFrame with the columns 'time', 'signal', 'true_signal', every feature column,
        'ewm_mean_20', 'ewm_std_20', 'future_mean', 'future_std', 'future_gain' and 'target'.
        features is a dict with the keys 'ronny' and 'motti', each mapping to a list of column names.
    """
    # Dedicated generator when a seed is requested; otherwise fall back to the legacy global stream.
    rng = np.random if seed is None else np.random.default_rng(seed)

    X = np.linspace(0, num_cycles*np.pi, 1000*num_cycles)
    N = rng.normal(0, noise_sigma*np.abs(np.sin(X + phase)), len(X))
    Y = np.sin(X) + N

    # Create the "signal" dataframe.
    # We add 10 so we only have to deal with strictly positive samples.
    sig_df = pd.DataFrame({'time':X, 'signal':Y+10, 'true_signal':np.sin(X)+10})

    # Put together a big bunch of Ronny-Roshbakir style features.
    # Ronny samples the signals' log-gains at feature_jumps intervals, up to max_lookback steps back.
    # These features do a poor job of capturing the underlying dynamics of the problem.
    features = {'ronny':[], 'motti':[]}
    # The logarithm is taken once; shifting the log series equals taking the log of the shifted series.
    log_signal = np.log(sig_df['signal'])
    for i in range(max_lookback):
        if i%feature_jump==0:
            curr_feature = 'sig_gain_' + str(i+1)
            features['ronny'].append(curr_feature)
            # The log-gain feature is invariant to the signal's magnitude, and therefore makes sense when trading.
            sig_df[curr_feature] = 100*(log_signal - log_signal.shift(periods=(i+1)))

    # Motti's features are better: they extract the signal's exponentially decaying mean & standard deviation.
    # window_lengths = [200, 400, 600, 800, 1000] # Try an evenly spaced grid (best results so far for Category B).
    # window_lengths = [50, 100, 200, 500, 1000]  # Or an almost-geometrically spaced grid (almost similar results on Category B).
    window_lengths = [25, 80, 200, 500, 1000]  # Or an almost-geometrically spaced grid (almost similar results on Category B).
    # All windows share the same warm-up requirement: the longest window length.
    min_observations = max(window_lengths)
    for win_len in window_lengths:
        # Pandas' ewm() provides exponentially weighted functions (here we use mean & std()).
        # The win_len parameter (see below) controls the "center-of-mass" ("COM", see documentation) of the moving average.
        # Note that the moving average considers ALL previous data, but after ~3 COMs the contribution is negligible.
        # This also means that the first ~3 COMs of data will be less reliable than the subsequent signal.
        # HENCE... We restrict the outputs s.t. only windows with at least the maximal window length observations are avail.
        weighted = sig_df['signal'].ewm(com=win_len, min_periods=min_observations)
        sig_df['ewm_mean_'+str(win_len)] = weighted.mean()
        sig_df['ewm_std_'+str(win_len)] = weighted.std()
        features['motti'].append('ewm_mean_'+str(win_len))
        features['motti'].append('ewm_std_'+str(win_len))

    # The 20-day exponential moving averages and STDs are used (only) in the graphs below.
    sig_df['ewm_mean_20'] = sig_df['signal'].ewm(20).mean()
    sig_df['ewm_std_20'] = sig_df['signal'].ewm(20).std()

    # Extract a few "future features" (will be used for constructing the target).
    # We're interested in looking (no more than) look_forward steps into the future.
    # NOTICE: again, we restrict the outputs s.t. only windows with at least look_forward observations are avail.
    indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=look_forward)
    forward_window = sig_df['signal'].rolling(window=indexer, min_periods=look_forward)
    sig_df['future_mean'] = forward_window.mean()
    sig_df['future_std'] = forward_window.std()

    # Option 1: assume access to the "true" signal when calculating the future_gain (in practice of course we wouldn't).
    sig_df['future_gain'] = 100*(np.log(sig_df['future_mean']) - np.log(sig_df['true_signal']))
    # Option 2: in a slightly more realistic scenario we would compare against an observable moving average, e.g.:
    # sig_df['future_gain'] = 100*(np.log(sig_df['future_mean']) - np.log(sig_df['ewm_mean_20']))

    # Our target divides the future gain by the future standard deviation + 1 (we don't like volatility!).
    sig_df['target'] = sig_df['future_gain'].divide(sig_df['future_std'] + 1)

    return sig_df, features

In [None]:
# Preview of today's signal family: one realisation with the noise envelope at phase pi/4.
plt.rcParams.update({'figure.figsize': [15, 8]})
sig_df, _ = create_signal(phase=np.pi/4)
fig, axes = plt.subplots()
# Draw through the Axes object rather than the implicit pyplot "current axes".
axes.scatter(sig_df['time'], sig_df['signal'], color='b', s=1.5, label='phase=pi/4')
axes.set_title("Today's signal: sine waves with constant phase random noise")
axes.legend();

In [None]:
# Draw a fresh sample signal; sig_df / features are reused by the dataset-preparation cell below.
sig_df, features = create_signal()
fig, ax = plt.subplots()
ax.set_title("Means, STDs and target values for a sample signal")
ax.scatter(sig_df['time'], sig_df['signal'], color='b', s=1.5)
# Shaded bands around the true signal: +/-0.5 (red) and +/-1.5 (green) short-term EWM standard deviations.
for half_width, band_color in ((0.5, 'r'), (1.5, 'g')):
    band_offset = half_width*sig_df['ewm_std_20']
    ax.fill_between(sig_df['time'], sig_df['true_signal']-band_offset, sig_df['true_signal']+band_offset,
                    color=band_color, alpha=0.25)
ax.plot(sig_df['time'], sig_df['future_mean'], color='m', label='future_mean')
ax.legend(loc="lower right")
# The target lives on a different scale, so it gets its own (right-hand) y axis.
ax2 = ax.twinx()
ax2.plot(sig_df['time'], sig_df['target'], color='brown', label='target')
ax2.legend();

In [None]:
# Build one clean dataset per feature family: keep only the family's columns plus the target,
# then drop every row in which any of them is missing.

# Ronny's log-gain features.
ronny_clean_df = sig_df.loc[:, features['ronny'] + ['target']].dropna()
X_ronny_df = ronny_clean_df.loc[:, features['ronny']]
y_ronny = ronny_clean_df.loc[:, 'target']

# Motti's exponentially weighted mean / std features.
motti_clean_df = sig_df.loc[:, features['motti'] + ['target']].dropna()
X_motti_df = motti_clean_df.loc[:, features['motti']]
y_motti = motti_clean_df.loc[:, 'target']

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 10px;">
<h1> &nbsp; Section B: Regression Training.</h1>
Unfortunately, my Windows machine (and probably all others) is not able to display the learning curve (below).<br>
10 extra points to anyone who manages to display this chart.

In [None]:
# Learning ("training") curve for Ronny's features: hold out 25% of the rows as an evaluation set
# and let CatBoost plot the train / eval metric over the boosting iterations.
X_train, X_test, y_train, y_test = train_test_split(
    X_ronny_df,
    y_ronny,
    test_size=0.25,
    random_state=42,
)
cat_regress = CatBoostRegressor(iterations=1000)
cat_regress.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    plot=True,
    silent=True,
);

In [None]:
# Train one regressor per (feature family, loss function) pair on the continuous target.
# Feature family -> (feature matrix, target) that the family's regressors are fitted on.
training_sets = {
    'ronny': (X_ronny_df, y_ronny),
    'motti': (X_motti_df, y_motti),
}
# Name suffix -> CatBoost loss function string ('Lq:q=1' is the L1 loss, 'RMSE' the L2 loss).
loss_by_suffix = {'L1': 'Lq:q=1', 'L2': 'RMSE'}

regressors = {}
for family, (X_family, y_family) in training_sets.items():
    for suffix, loss_name in loss_by_suffix.items():
        regressor_key = family + '_' + suffix
        regressors[regressor_key] = CatBoostRegressor(loss_function=loss_name)
        regressors[regressor_key].fit(X_family, y_family, silent=True)

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 10px;">
Exercise: look up other (regression) loss functions available in CatBoost:<br>
https://catboost.ai/en/docs/concepts/loss-functions-regression

Please repeat the experiment for the L_8 loss function and explain the results.<br>

In [None]:
# Evaluate every trained regressor on freshly generated signals and collect per-sample losses.
predictions = {}
prediction_errors = {}

# One list of per-trial losses for each regressor (the keys match those of `regressors`).
regressor_names = ['ronny_L2', 'ronny_L1', 'motti_L2', 'motti_L1']
l1_losses = {name: [] for name in regressor_names}
l2_losses = {name: [] for name in regressor_names}

num_trials = 100
for _ in range(num_trials):
    for regressor in l1_losses.keys():
        # NOTICE: activate the first line (constant phase) for Category A of the competition.
        # NOTICE: activate the second line (random phase) for Category B of the competition.
        sig_df, features = create_signal()
        # sig_df, features = create_signal(phase=np.random.rand()*2*np.pi)

        # Pick the feature family from the regressor's name. Fail loudly on an unknown name rather
        # than silently reusing the frame left over from the previous iteration.
        if 'ronny' in regressor:
            feature_family = 'ronny'
        elif 'motti' in regressor:
            feature_family = 'motti'
        else:
            raise ValueError("Cannot infer the feature family of regressor '" + regressor + "'")

        # Extract the family's features + target, dropping rows with missing values.
        clean_df = sig_df[features[feature_family] + ['target']].dropna()
        X_df = clean_df[features[feature_family]]

        y = clean_df['target']
        pred = pd.Series(regressors[regressor].predict(X_df), index=X_df.index)

        # Add the L1 & L2 losses *per sample*. This way we can compare regressors with different support sets.
        # L1: ||e||_1 / n is the mean absolute error.
        # L2: ||e||_2 / sqrt(n) is the root-mean-square error. Dividing the 2-norm by n instead would
        #     yield RMSE / sqrt(n), which shrinks with the number of samples and is therefore NOT
        #     comparable across support sets of different sizes.
        residual = y - pred
        l1_loss = LA.norm(residual, ord=1) / len(y)
        l2_loss = LA.norm(residual, ord=2) / np.sqrt(len(y))
        l1_losses[regressor].append(l1_loss)
        l2_losses[regressor].append(l2_loss)

        # Add a single predicted series per regressor (for the graphs below).
        if regressor not in predictions:
            predictions[regressor] = pred
            prediction_errors[regressor] = pred - y

In [None]:
# One panel per regressor: the stored prediction against the target it was predicting.
plt.rcParams['figure.figsize'] = [15, 15]
fig = plt.figure()
gs = fig.add_gridspec(4, hspace=0.3)
axs = gs.subplots(sharex=True, sharey=True)
for i, regressor in enumerate(sorted(l1_losses.keys(), reverse=True)):
    # Every regressor was evaluated on its own randomly generated signal, so the matching target
    # must be recovered per regressor. The stored error is (prediction - target), hence
    # target = prediction - error. Plotting a leftover global `y` would show the target of a
    # different signal than the one the prediction belongs to.
    matching_target = predictions[regressor] - prediction_errors[regressor]
    axs[i].plot(predictions[regressor], label=regressor+' prediction', color='green')
    axs[i].plot(matching_target, label='target', color='brown')
    axs[i].set(xlabel='Day Seq. Number', title=regressor + " regressor. Mean L1 loss per sample: " + \
        str(np.round(np.mean(l1_losses[regressor]), 5)) + " +/- " + str(np.round(np.std(l1_losses[regressor]), 5)) + \
        ". Mean L2 loss per sample: " + str(np.round(np.mean(l2_losses[regressor]), 5)) + " +/- " + str(np.round(np.std(l2_losses[regressor]), 5)))
    # The curves carry labels, so actually display them.
    axs[i].legend(loc='upper right')

In [None]:
# One panel per regressor showing the stored prediction error (prediction - target) over time.
plt.rcParams.update({'figure.figsize': [15, 15]})
fig = plt.figure()
gs = fig.add_gridspec(4, hspace=0.3)
axs = gs.subplots(sharex=True)


def _mean_pm_std(loss_values):
    """Format a list of losses as '<mean> +/- <std>', each rounded to 5 decimals."""
    return str(np.round(np.mean(loss_values), 5)) + " +/- " + str(np.round(np.std(loss_values), 5))


for error_ax, regressor in zip(axs, sorted(l1_losses, reverse=True)):
    error_ax.plot(prediction_errors[regressor])
    l1_summary = _mean_pm_std(l1_losses[regressor])
    l2_summary = _mean_pm_std(l2_losses[regressor])
    error_ax.set_title(
        f"{regressor} regressor. Mean L1 loss per sample: {l1_summary}. Mean L2 loss per sample: {l2_summary}"
    )
# Only the bottom panel needs an x label because the x axis is shared.
axs[-1].set_xlabel('Day Seq. Number');

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 10px;">
<h1> &nbsp; Section C: Variance Estimation.</h1>
The standard model optimized with the RMSE loss can only predict the mean of the target y as a function of the input features.
What if we want to estimate the variance of y, i.e., data uncertainty?<br>
To estimate data uncertainty, we need to use probabilistic regression models that predict both mean and variance.<br>

For this purpose, we use the *RMSEWithUncertainty* loss function in CatBoost.<br>
With this loss function, CatBoost estimates the mean and variance of the normal distribution optimizing the negative log-likelihood.<br>
For each example, the CatBoost model returns two values: the estimated mean and the estimated variance.

In [None]:
# Train an L2 regressor with uncertainty on one signal and evaluate it on a second, independent one.
# Two separate draws: one signal for training and an independent one for testing.
train_sig_df, features = create_signal()
test_sig_df, features = create_signal()

# The regression target in this section is the noise-free 'true_signal' column.
train_df = train_sig_df[features['ronny'] + ['true_signal']].dropna()
X_train_df = train_df[features['ronny']]
# Alternative: use Motti's features instead.
# train_df = train_sig_df[features['motti'] + ['true_signal']].dropna()
# X_train_df = train_df[features['motti']]
y_train = train_df['true_signal']

# The test set must come from the *test* signal. Building it from train_sig_df / train_df would
# make the test pool identical to the training pool and leave test_sig_df unused.
test_df = test_sig_df[features['ronny'] + ['true_signal']].dropna()
X_test_df = test_df[features['ronny']]
# Alternative: use Motti's features instead.
# test_df = test_sig_df[features['motti'] + ['true_signal']].dropna()
# X_test_df = test_df[features['motti']]

y_test = test_df['true_signal']

# A pool is just a convenient CatBoost "container" class for features and labels.
train_pool = Pool(X_train_df, y_train)
test_pool = Pool(X_test_df, y_test)

uncert_regressor = CatBoostRegressor(loss_function='RMSEWithUncertainty')
uncert_regressor.fit(train_pool, silent=True)

# Notice that for the 'RMSEWithUncertainty' loss, preds now contains two columns: one for mean & one for variance.
preds = uncert_regressor.predict(test_pool)
ic(preds.shape)

In [None]:
# Plot the estimated mean of the test signal with a +/- 3 standard deviations uncertainty band.
plt.rcParams.update({'figure.figsize': [15, 8]})
fig, axes = plt.subplots()
x_axis = y_test.index

# Reminder: the first column of preds is the *estimated* mean...
estimated_mean = preds[:, 0]
# ...while the second column is the *estimated* variance, so its square root is the estimated std.
estimated_std = np.sqrt(preds[:, 1])

axes.plot(x_axis, estimated_mean, color='brown', lw=0.5)
axes.fill_between(x_axis, estimated_mean - 3*estimated_std, estimated_mean + 3*estimated_std,
                  color='green', alpha=0.3);

###### <img src="Electronic_Brain.png" width="100" style="float:left">
Q: What is the correlation coefficient between the predicted variance and the actual variance?