###### <img src="Electronic_Brain.png" width="200" style="float:left">
<h1> Spring 2021 ML Course.</h1>
<h2> Exercise 6: Feature Extraction, Linear Regression<br> Tools: Numpy, Pandas, Scikit-Learn, Optuna</h2>

In [None]:
!python --version
import math
import random
from random import gauss, randint

import matplotlib as plt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from icecream import ic
from sklearn.linear_model import LinearRegression

In [None]:
# Home-grown scripts & libraries.
from stock_utils import rollout, generate_stock_prices

In [None]:
# Set a few defaults.
rand_seed = 100
# Seed BOTH random number generators used by this notebook:
#  * Python's `random` module -- the regime lengths are drawn with random.randint,
#    which np.random.seed() does not affect;
#  * NumPy's global generator.
random.seed(rand_seed)
np.random.seed(rand_seed)
pd.set_option('display.precision', 3)
plt.rcParams['figure.figsize'] = [15, 5]

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 10px;">
<h1> &nbsp; Section A: Signal Generation.</h1>

In [None]:
# Build the regime schedule shared by both stocks: 10 UP/DOWN pairs, always starting with an UP leg.
# All regimes share the same daily STD; each one lasts a random 20 to 40 days.
daily_up_gain = 1.007    # Daily gain on UP legs is 0.7% on average.
daily_down_gain = 0.996  # Daily loss on DOWN legs is 0.4% on average.
daily_std = 0.01         # Daily STD is 1%.
return_params = [
    [leg_gain, daily_std, randint(20, 40)]
    for _ in range(10)
    for leg_gain in (daily_up_gain, daily_down_gain)
]
# ic(return_params)

# Cumulative sum of the regime lengths: the day on which each regime ends
# (i.e. the day on which the following regime starts).
regime_lengths = np.array([length for _gain, _std, length in return_params])
regime_start_days = regime_lengths.cumsum()
# ic(regime_start_days)

# AAPL is generated with a 5-day delay; AUX_0 with no delay, so it is in phase
# with the market (and therefore 5 days ahead of AAPL).
aapl_df = generate_stock_prices(return_params, 5, 'AAPL', seed=111)
aux_df = generate_stock_prices(return_params, 0, 'AUX_0', seed=222)

# Put both price series side by side, aligned on the index.
prices_df = pd.merge(aapl_df, aux_df['AUX_0'], how='outer', left_index=True, right_index=True)
prices_df.head() # Show a few rows.

In [None]:
# Take a look at the set of all variables.
# (%whos is an IPython line magic that lists the names in the interactive namespace.)
%whos

In [None]:
# Take a look (only) at the Numpy arrays.
# (Passing a type name to %whos restricts the listing to variables of that type.)
%whos ndarray

In [None]:
# TRY both linear and semi-logarithmic plots!
fig, ax = plt.subplots()
for ticker in ('AAPL', 'AUX_0'):
    ax.plot(prices_df[ticker], label=ticker)
    # A semi-logarithmic axis is more informative: shows percent rather than absolute price moves.
    # To try it, replace the line above with:
    # ax.semilogy(prices_df[ticker], label=ticker)

# Mark the actual regime transition days with dotted vertical lines.
for transition_day in regime_start_days:
    ax.axvline(transition_day, linestyle=':')

ax.set(title='Stock Prices (note how AUX_0 coincides with the regimes while AAPL lags)',
       xlabel='Day Seq. Number')
ax.legend();

In [None]:
# Create "oracle" predictions: buy at the beginning of each UP leg, sell when it ends.
# The predictions are zeros everywhere, with 1's on BUY days and -1's on SELL days.
# Since we are trading AAPL, the "oracle" is delayed by 5 days.
aapl_delay = 5
oracle_preds = [0 for _ in range(len(prices_df))]  # One (initially neutral) prediction per row.
oracle_preds[0] = 1                                # Buy AAPL on the first day.

# The first leg is UP, so the regime boundaries alternate SELL, BUY, SELL, ...
for leg_no, boundary_day in enumerate(regime_start_days):
    oracle_preds[boundary_day + aapl_delay] = -1 if leg_no % 2 == 0 else 1

# Simulate our oracle predictions.
aapl_prices = prices_df['AAPL'].to_list()
trades, days_in_market, reward = rollout(aapl_prices, oracle_preds, buy_thresh=0.5, sell_thresh=-0.5)
reward_per_day = reward / days_in_market
ic(reward, days_in_market, reward_per_day)  # Use ic() to print out vars.
ic(trades);

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 10px;">
<h1> &nbsp; Section B: Feature & Target Extraction and Visualization</h1>

In [None]:
# Generate a few features per stock:
#   <ticker>_back_mean_10: mean price over the trailing 10 rows (NaN until 10 rows are available);
#   <ticker>_back_gain_10: current price divided by that trailing mean.
for ticker in ('AAPL', 'AUX_0'):
    mean_col = f'{ticker}_back_mean_10'
    gain_col = f'{ticker}_back_gain_10'
    prices_df[mean_col] = prices_df[ticker].rolling(10, min_periods=10).mean()
    prices_df[gain_col] = prices_df[ticker] / prices_df[mean_col]

In [None]:
# Generate a "target" score via a rolling forward mean.
# A forward-looking window of 10 rows; rows without a full window ahead stay NaN.
forward_window = pd.api.indexers.FixedForwardWindowIndexer(window_size=10)
aapl_price = prices_df['AAPL']
prices_df['AAPL_forw_mean_10'] = aapl_price.rolling(window=forward_window, min_periods=10).mean()
# Our target (score) is the ratio between the forward mean and the current price.
prices_df['AAPL_forw_score_10'] = prices_df['AAPL_forw_mean_10'] / aapl_price

In [None]:
# Plot AAPL's price together with its backward and forward 10-day rolling means.
fig, ax = plt.subplots()
ax.plot(prices_df['AAPL'], label='AAPL')
ax.plot(prices_df['AAPL_back_mean_10'], label='AAPL 10-day (backward) rolling mean')
# This series is the forward rolling MEAN (a price), not the forward score
# (the score is the ratio stored in 'AAPL_forw_score_10'), so label it as a mean.
ax.plot(prices_df['AAPL_forw_mean_10'], label='AAPL 10-day (forward) rolling mean')
ax.set(xlabel='Day Seq. Number', ylabel='Price', title='AAPL Stock Prices')
ax.legend();

In [None]:
# Correlation matrix of the two stocks' backward gains and AAPL's forward score.
corr_columns = ['AUX_0_back_gain_10', 'AAPL_back_gain_10', 'AAPL_forw_score_10']
prices_df[corr_columns].corr()

In [None]:
# Scatter each backward-gain feature against AAPL's forward score, one panel per feature.
plt.rcParams['figure.figsize'] = [15, 6]
fig, axes = plt.subplots(nrows=1, ncols=2)
# (feature, panel title) -- only the first panel carries a title.
scatter_panels = [("AAPL_back_gain_10", 'Scatter plots'), ("AUX_0_back_gain_10", None)]
for panel_ax, (feature, panel_title) in zip(axes, scatter_panels):
    prices_df.plot.scatter(x=feature, y="AAPL_forw_score_10", ax=panel_ax, title=panel_title)

In [None]:
# Same two views as hexbin (2D histogram) plots, one panel per feature.
fig, axes = plt.subplots(nrows=1, ncols=2)
# (feature, panel title) -- only the first panel carries a title.
hexbin_panels = [("AAPL_back_gain_10", 'Hexbin plots'), ("AUX_0_back_gain_10", None)]
for panel_ax, (feature, panel_title) in zip(axes, hexbin_panels):
    prices_df.plot.hexbin(x=feature, y="AAPL_forw_score_10", gridsize=25, ax=panel_ax, title=panel_title)

<img src="Electronic_Brain.png" width="140" style="float:left; margin-right: 1px;">
<h1>Section C: Semi-Optimal Controller: Calculation & Simulation.</h1><br><br><br>
In this section we will create a <em>baseline</em> buy / sell rule.<br>
Using AAPL / AUX_0 historical data, find the points in time for which, with probability >95%, a regime change has occurred, using (only) 10-day histories.  
In the calculations we ignore the fact that percent changes are not additive (negligible for small numbers).  
We also ignore the alternative hypothesis (namely of a DOWN regime producing a positive sum).

<p> Assume we have access to (only) the 10-day gains.
Construct a "95%" buy / sell rule, based on our knowledge of the underlying process and params. The controller should switch from its current (LONG / NEUT) position when it calculates a 95% probability of having observed a regime change.</p>

**Solution**:
We present the calculation for an UP leg.  
The sum of 10 $\bf{independent}$ $(\mu, \sqrt{\sigma^2})$-distributed random vars is a $(10 \mu, \sqrt{10 \sigma^2})$-distributed normal variable.  
Calling it $X$, we can calculate the probability of it obtaining a negative value on any given day via the error function:  
$\Pr[X \le 0] = 1/2 + 1/2 \cdot \text{erf}\left( \frac{-10\mu}{\sqrt{2\times10\:}\:\times\sigma} \right) =
1/2 + 1/2 \cdot \text{erf}\left( \frac{-0.07}{\sqrt{20\:} \: \times \: 0.01} \right) \approx 1.3\%$.

For the 95% probability calculation we have, similarly:  
$\Pr[X \le L] = 1/2 + 1/2 \cdot \text{erf}\left( \frac{L - 10\mu}{\sqrt{2\times10 \:} \: \times\sigma} \right)$.  

Setting the right-hand side of the equation to 0.05, we want the error function to evaluate to -0.9, which via lookup table sets its argument at $\approx -0.8$. Solving, we find a 95% probability for a 10-day return above approx. 4%.  A similar calculation for DOWN regimes places the 95% probability at a loss of approx. 1%.

In [None]:
# Thresholds for the baseline controller; they are compared against the
# *_back_gain_10 columns (price divided by its trailing 10-day mean).
entry_threshold = 1.04  # A gain above this value is marked as a BUY (+1) signal.
exit_threshold = 0.99   # A gain below this value is marked as a SELL (-1) signal.

In [None]:
# Implement the 95%-optimal controller using only AAPL's historical gains.
# Its output is later fed to rollout() to measure simulated profit.
# Again: predictions are zeros everywhere, with 1's on BUY days, -1's on SELL days.
aapl_gain = prices_df['AAPL_back_gain_10']
entry_mask = aapl_gain.gt(entry_threshold)
exit_mask = aapl_gain.lt(exit_threshold)
# Start neutral, then stamp the BUY days followed by the SELL days.
prices_df['semi_opt_aapl'] = 0
for day_mask, signal in ((entry_mask, 1), (exit_mask, -1)):
    prices_df.loc[day_mask, 'semi_opt_aapl'] = signal

In [None]:
# Simulate the semi-optimal (AAPL-based) predictions on AAPL's prices.
aapl_prices = prices_df['AAPL'].to_list()
aapl_signals = prices_df['semi_opt_aapl'].to_list()
trades, days_in_market, reward = rollout(aapl_prices, aapl_signals, buy_thresh=0.5, sell_thresh=-0.5)
reward_per_day = reward / days_in_market
ic(reward, days_in_market, reward_per_day);

In [None]:
# Implement the 95%-optimal controller using only AUX_0's historical gains.
# Again: predictions are zeros everywhere, with 1's on BUY days, -1's on SELL days.
aux0_gain = prices_df['AUX_0_back_gain_10']
entry_mask = aux0_gain.gt(entry_threshold)
exit_mask = aux0_gain.lt(exit_threshold)
# Start neutral, then stamp the BUY days followed by the SELL days.
prices_df['semi_opt_aux0'] = 0
for day_mask, signal in ((entry_mask, 1), (exit_mask, -1)):
    prices_df.loc[day_mask, 'semi_opt_aux0'] = signal

# Simulate the semi-optimal predictions: signals come from AUX_0, trades are made on AAPL.
aapl_prices = prices_df['AAPL'].to_list()
aux0_signals = prices_df['semi_opt_aux0'].to_list()
trades, days_in_market, reward = rollout(aapl_prices, aux0_signals, buy_thresh=0.5, sell_thresh=-0.5)
reward_per_day = reward / days_in_market
ic(reward, days_in_market, reward_per_day);

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 10px;">
<h1> &nbsp; Section D: Linear Regression Using Scikit-Learn</h1><br><br>
We proceed to model the rolling 10-day forward gain via linear regression, using <em>both</em> AAPL's and AUX_0's historical prices in a single model.

In [None]:
# Prepare a "clean" dataset: keep the two features and the target, and drop every row
# in which any of them is missing.
# NOTICE: dropna() returns a new frame, so prices_df itself is left untouched.
feature_cols = ['AUX_0_back_gain_10', 'AAPL_back_gain_10']
target_col = 'AAPL_forw_score_10'
clean_df = prices_df[feature_cols + [target_col]].dropna()
X = clean_df[feature_cols]

# Create continuous and discrete target variables.
y_cont = clean_df[target_col]
y_disc = y_cont.gt(1)  # True where the forward mean is above the current price.

In [None]:
# Train a linear regressor on the continuous target (fit() returns the fitted estimator).
reg = LinearRegression().fit(X, y_cont)
# NOTICE: the in-sample predictions are wrapped in a Series that keeps X's index,
# to avoid confusion when aligning them with prices_df later!
in_sample_preds = reg.predict(X)
y_cont_pred = pd.Series(in_sample_preds, index=X.index)

In [None]:
# Show the linear regression's output on top of AAPL's price.
sell_thresh = 0.995
buy_thresh = 1.005
fig, ax = plt.subplots()
ax2 = ax.twinx() # Instantiate a second axes sharing the same x-axis.
ax.plot(prices_df['AAPL'], label='AAPL')
ax2.plot(y_cont_pred, label='linear regression (10-day MA score)', color='orange')

# Reference levels on the score axis: 1 (solid) and the buy / sell thresholds (dashed).
x_min, x_max = y_cont_pred.index.min(), y_cont_pred.index.max()
ax2.hlines(1, x_min, x_max, color='red')
ax2.hlines(buy_thresh, x_min, x_max, color='red', linestyles='dashed')
ax2.hlines(sell_thresh, x_min, x_max, color='red', linestyles='dashed')

ax.set(xlabel='Day Seq. Number', ylabel='AAPL price', title='AAPL Stock Prices')
ax2.set_ylabel('Predicted forward score')

# After twinx() the twin axes is the current one, so a bare plt.legend() would list only
# the regression line. Merge the labelled artists of both axes into a single legend.
price_handles, price_labels = ax.get_legend_handles_labels()
score_handles, score_labels = ax2.get_legend_handles_labels()
ax2.legend(price_handles + score_handles, price_labels + score_labels);

In [None]:
# Align the regression output with AAPL's prices on the index.
# (For the merge, the predictions Series must carry a name; it becomes the column name.)
lin_reg_signal = y_cont_pred.rename('lin_reg')
roll_df = pd.merge(prices_df['AAPL'], lin_reg_signal, how='outer', left_index=True, right_index=True)

# Simulate trading AAPL with the regression output as the signal.
trades, days_in_market, reward = rollout(
    roll_df['AAPL'].to_list(),
    roll_df['lin_reg'].to_list(),
    buy_thresh,
    sell_thresh,
)
reward_per_day = reward / days_in_market
ic(reward, days_in_market, reward_per_day);

In [None]:
# Show the linear regression's output on the 2D plane defined by the two features.
# The model was trained on X, whose columns are ordered (AUX_0 gain, AAPL gain), and the
# scatter below puts the AUX_0 gain on x and the AAPL gain on y. The grid must follow the
# same convention, so each axis takes its bounds from the feature it actually represents.
x_feature, y_feature = 'AUX_0_back_gain_10', 'AAPL_back_gain_10'

# Define bounds of the domain (padded by 0.01 on each side).
min1, max1 = clean_df[x_feature].min() - 0.01, clean_df[x_feature].max() + 0.01
min2, max2 = clean_df[y_feature].min() - 0.01, clean_df[y_feature].max() + 0.01

# Define the x and y grid.
x1grid = np.linspace(min1, max1, 100)
x2grid = np.linspace(min2, max2, 100)

# Create the 2D grid.
xx, yy = np.meshgrid(x1grid, x2grid)

# Flatten each grid to a vector and stack them as two columns: (x_feature, y_feature).
grid = np.column_stack((xx.ravel(), yy.ravel()))

# Use the model to make predictions on the grid. The columns are named explicitly and
# re-ordered to match the training frame, so the feature order cannot silently drift.
grid_df = pd.DataFrame(grid, columns=[x_feature, y_feature])
y_hat = reg.predict(grid_df[list(X.columns)])

# Reshape the predictions back into a grid.
zz = y_hat.reshape(xx.shape)

# Plot the grid of x, y and z values as filled contours.
plt.rcParams['figure.figsize'] = [14,10]
fig, ax = plt.subplots()
ax.contourf(xx, yy, zz, cmap='gray')

# Add the actual training set as a scatter plot, coloured by the true target.
c = ax.scatter(x=clean_df[x_feature], y=clean_df[y_feature], c=clean_df["AAPL_forw_score_10"], cmap='jet')
ax.set(xlabel=x_feature, ylabel=y_feature,
       title='Linear regression output (gray contours) vs. actual AAPL_forw_score_10 (colored points)')
# Add a color bar for the scatter colours.
fig.colorbar(c, ax=ax, label='AAPL_forw_score_10');

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 15px;">
<h1> &nbsp; Section E: Parameter Optimization via Optuna</h1><br><br>

In [None]:
import sys
import logging
import optuna

# Add stream handler of stdout to show the messages.
# Guarded so that re-running this cell does not stack duplicate handlers
# (which would print every message several times).
optuna_logger = optuna.logging.get_logger("optuna")
stdout_already_attached = any(
    isinstance(handler, logging.StreamHandler) and getattr(handler, "stream", None) is sys.stdout
    for handler in optuna_logger.handlers
)
if not stdout_already_attached:
    optuna_logger.addHandler(logging.StreamHandler(sys.stdout))
# We typically don't really want to see everything that's going on:
optuna.logging.set_verbosity(optuna.logging.WARNING)

# If we want to set a name for the study (but then we can't re-initialize the study):
# study_name = "make_money"  # Unique identifier of the study.
# If we're really serious about storing the study somewhere:
# storage_name = "sqlite:///{}.db".format(study_name)

# Here we can control the sampling algorithm.
# The sampler is seeded with the notebook-wide rand_seed so that the sequence of sampled
# thresholds (and hence the best params found) is the same on every Restart & Run All.
study = optuna.create_study(study_name=None, storage=None, direction="maximize",
                            sampler=optuna.samplers.CmaEsSampler(seed=rand_seed))

In [None]:
def trading_objective(trial):
    """Optuna objective: the rollout reward obtained with a sampled pair of thresholds.

    Args:
        trial: the Optuna trial used to sample "sell_thresh" from [0.99, 1]
            and "buy_thresh" from [1, 1.01].

    Returns:
        The reward reported by rollout() when trading roll_df['AAPL'] with
        roll_df['lin_reg'] as the signal and the sampled thresholds.
    """
    # Sampling ranges for the two thresholds (sell is sampled first, then buy).
    sell_thresh = trial.suggest_float("sell_thresh", 0.99, 1)
    buy_thresh = trial.suggest_float("buy_thresh", 1, 1.01)

    prices = roll_df['AAPL'].to_list()
    signal = roll_df['lin_reg'].to_list()
    _trades, _days_in_market, reward = rollout(prices, signal, buy_thresh, sell_thresh)

    # Alternative objective (reward per day in the market):
    # 0 if _days_in_market == 0 else reward / _days_in_market
    return reward

In [None]:
# Run 500 trials of trading_objective; the study was created with direction="maximize".
study.optimize(trading_objective, n_trials=500)

In [None]:
# Print the best thresholds found and the objective value they achieved.
ic(study.best_params)
ic(study.best_value)
# Show the first few trials as a dataframe.
study.trials_dataframe().head()

In [None]:
from optuna.visualization import plot_optimization_history

In [None]:
# Plot the optimization history of the study's trials.
plot_optimization_history(study)

In [None]:
# Contour plot of the study over the two sampled thresholds.
# WARNING: this takes a LONG time (~2 minutes)!
optuna.visualization.plot_contour(study, params=["buy_thresh", "sell_thresh"])

<img src="Electronic_Brain.png" width="140" style="float:left; margin-right: 1px;">
Is it possible (theoretically) for the optimization above to beat the oracle's performance (defined above)?

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 25px;"><br>
* Fixing the optimization results obtained above (i.e., buy_thresh and sell_thresh), simulate the performance on data obtained from 10 new seeds. What do we see?
* Repeat the process with a different sampler (i.e., instead of CmaEsSampler()). How does this affect the contour plot?
* Propose and code a new objective function, which makes more "sense" in terms of trading.