###### <img src="Electronic_Brain.png" width="200" style="float:left">
<h1> Spring 2021 ML Course.</h1>
<h2> Exercise 11: Dimensionality Reduction<br>Tools: CatBoost, PCA, UMAP</h2>

In [None]:
import numpy as np
import pandas as pd

import umap
from catboost import CatBoostClassifier
from sklearn.decomposition import PCA

import icecream as ic
from tqdm import tqdm

import matplotlib as plt
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [None]:
# Number of lagged log-return feature columns built per price series
# (generate_stock_prices loops over range(num_delays), i.e. lags 0 .. num_delays-1).
num_delays = 20

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 10px;">
<h1> &nbsp; Section A: Signal Generation.</h1>

In [None]:
"""
# Generate daily stock prices.
# This function generates a sequence of segments, each comprising Brownian geometric "motion"
# with a given mean & variance and num. of steps.
"""
def generate_stock_prices(params, seed, add_tail=False, n_delays=None):
    """Simulate a daily price path made of consecutive regimes, plus lagged log-return features.

    The path starts at a price of 100. Within a regime the price is multiplied
    each day by ``mu + N(0, sigma)``.

    Parameters
    ----------
    params : iterable of [mu, sigma, steps]
        One entry per regime: mean daily factor, standard deviation of the
        daily noise, and number of days in the regime.
    seed : int
        Seed passed to ``np.random.default_rng``.
    add_tail : bool, default False
        If true, append ``5 * steps`` extra days whose daily factor is
        ``1 + N(0, 0.1 * sigma)``, where ``steps`` and ``sigma`` are taken from
        the last regime in ``params``. Tail days are labelled 3.
    n_delays : int, optional
        Number of lagged log-return columns to build (lags ``0 .. n_delays - 1``).
        When omitted, the notebook-level ``num_delays`` is used.

    Returns
    -------
    prices_df : pandas.DataFrame
        Indexed by ``day_seqno``; columns ``AAPL`` (price) and ``colors`` (label).
    log_rets_df : pandas.DataFrame
        Column ``colors`` plus one ``AAPL_log_ret_<lag>`` column per lag. Rows
        containing a NaN (the leading days for which some lag is undefined)
        are dropped.

    Raises
    ------
    ValueError
        If ``add_tail`` is requested but ``params`` contains no regime.

    Notes
    -----
    Regime labels are ``np.sign(1 - mu)``: -1 when mu > 1, +1 when mu < 1.
    The initial price is labelled 0.
    """
    if n_delays is None:
        n_delays = num_delays

    rng = np.random.default_rng(seed)
    curr_price = 100
    price_seq = [curr_price]
    color_seq = [0]

    # Remember the last regime's settings explicitly; the optional tail is derived from them.
    sigma = steps = None

    # Outer loop over regimes:
    for [mu, sigma, steps] in params:
        regime_label = np.sign(1 - mu)

        # Inner loop over days in the regime.
        # WARNING: absolutely inefficient, only for demonstration!
        for _ in range(steps):
            curr_price = curr_price * (mu + rng.normal(0, sigma))
            price_seq.append(curr_price)
            color_seq.append(regime_label)

    # Maybe we wish to add a "tail" which behaves completely differently.
    if add_tail:
        if steps is None:
            raise ValueError("add_tail=True requires at least one regime in params")
        for _ in range(5 * steps):
            curr_price = curr_price * (1 + rng.normal(0, 0.1 * sigma))
            price_seq.append(curr_price)
            color_seq.append(3)

    # Turn the whole thing into a Pandas dataframe.
    prices_df = pd.DataFrame()
    prices_df['AAPL'] = price_seq
    prices_df['day_seqno'] = range(len(price_seq))
    prices_df['colors'] = np.array(color_seq)
    # set_index returns a NEW frame; the result must be kept for the index to take effect.
    # Indexing by the day sequence number enables merges between stocks.
    prices_df = prices_df.set_index('day_seqno')

    # Form log-return "features" (n_delays of them), composed of the log-diff prices.
    # NOTE: lag 0 is the difference of a series with itself, so that column is identically zero.
    log_prices = np.log(prices_df['AAPL'])
    log_rets_df = pd.DataFrame()
    log_rets_df['colors'] = prices_df['colors']
    for delay in range(n_delays):
        log_rets_df['AAPL_log_ret_' + str(delay)] = log_prices - log_prices.shift(periods=delay)
    log_rets_df = log_rets_df.dropna()

    return prices_df, log_rets_df

In [None]:
# Every simulated stock shares the same regime schedule: ten repetitions of an
# UP leg (mean daily factor 1.007) followed by a DOWN leg (0.996), each leg
# lasting a fixed 100 days. The schedule therefore always opens with an UP leg.
noise_sigma = 0.0125
return_params = [
    [mean_factor, noise_sigma, 100]
    for _ in range(10)
    for mean_factor in (1.007, 0.996)
]

# Cumulative regime lengths = the day numbers at which one regime hands over to the next.
regime_lengths = np.array([n_days for _mu, _sigma, n_days in return_params])
regime_start_days = regime_lengths.cumsum()

In [None]:
# Simulate one price path and display it.
prices_df, _ = generate_stock_prices(return_params, seed=1701)

plt.rcParams['figure.figsize'] = [20, 8]
fig, ax = plt.subplots()
ax.plot(prices_df['AAPL'], label='AAPL')
ax.set(xlabel='Day Seq. Number', title='AAPL Prices')

# Dotted vertical markers at the regime transition days.
for regime_start_day in regime_start_days:
    ax.axvline(regime_start_day, linestyle=':')

ax.legend();

In [None]:
# Draw a new path (different seed); only the log-return feature table is kept this time.
_, log_rets_df = generate_stock_prices(return_params, seed=10)

# 2-D "phase space" view of AAPL: one lagged log-return plotted against another,
# with each point colored by its regime label.
plt.rcParams['figure.figsize'] = [12, 10]
fig, ax = plt.subplots()
log_rets_df.plot(
    kind='scatter',
    x="AAPL_log_ret_5",
    y="AAPL_log_ret_15",
    c=log_rets_df['colors'],
    cmap='viridis',
    title='Phase space via a scatter plot\n Note the binary colors, denoting the current (UP or DOWN) regime',
    ax=ax,
);

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 10px;">
<h1> &nbsp; Section B: Feature Selection.</h1>
Please "optimize" the feature pairs below (manually) to try to obtain a "good" pair.<br>
Then try to use the complete set of features (commented out in the code) and see what happens.

In [None]:
model_cb = CatBoostClassifier(iterations=400)

# Take any specific selection of features.
used_features = ['AAPL_log_ret_2', 'AAPL_log_ret_5']
# Or put together a list of all features.
# used_features = ['AAPL_log_ret_' + str(delay) for delay in range(num_delays)]

# Re-generate the features independently; this frame is the training set.
# (The name must be log_rets_df: a misspelt target would leave the previous cell's frame in use.)
_, log_rets_df = generate_stock_prices(return_params, seed=1)

# Form the inputs to the CatBoost classifier (labels as a flat 1-D vector).
X = log_rets_df[used_features].to_numpy()
y = log_rets_df['colors'].to_numpy()
model_cb.fit(X, y, silent=True)

# Validate on independently generated prices. Test data lives in its own
# variables so the training frame above is not overwritten.
accuracies = []
for i in range(10):
    _, test_log_rets_df = generate_stock_prices(return_params, seed=i*666+777)
    X_test = test_log_rets_df[used_features].to_numpy()
    y_test = test_log_rets_df['colors'].to_numpy()

    # Flatten the predictions so the comparison is element-wise whatever
    # shape predict() returns, then score the fraction of exact label matches.
    preds = np.asarray(model_cb.predict(X_test), dtype=float).ravel()
    accuracies.append(np.mean(preds == y_test))

print("Accuracy with", len(used_features), "features:", round(np.mean(accuracies), 5), "+/-", round(np.std(accuracies), 5))

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 10px;">
<h1> &nbsp; Section C: Principal Components Analysis.</h1>
Please use the Sklearn PCA routine to check how well the classifier performs using any number of components.

In [None]:
# We now start our dimensionality reduction quest with the old-school workhorse: PCA.
# NOTE: the scatter plot below reads the first two components, so keep pca_components >= 2 for this cell.
pca_components = 2

# Fresh training data. (The name must be log_rets_df: a misspelt target would
# leave an earlier cell's frame in use.)
_, log_rets_df = generate_stock_prices(return_params, seed=2)

# The to-be-embedded data is the original log-returns dataframe, with the "colors" (i.e., labels) removed.
X_df = log_rets_df.drop(columns='colors')
y = log_rets_df[['colors']]

pca = PCA(n_components=pca_components)
pca.fit(X_df)

# Project the same data onto the fitted principal components.
X_reduced = pca.transform(X_df)

# Check outcome (graphically).
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(X_reduced[:,0], X_reduced[:,1], s=30, cmap='viridis', alpha=1.0, c=log_rets_df['colors'])
ax.set_title('Log-returns embedded via PCA\n Notice the unequal axes');

In [None]:
# Construct a nice classifier on the lower dimensional unsupervised space.
pca_model = CatBoostClassifier(iterations=400, silent=True)
pca_model.fit(X_reduced, y)

# Validate on independently generated prices. Test data uses its own variable
# names so X_reduced / y (this cell's training inputs) are not overwritten and
# the cell can be re-run safely.
accuracies = []
for i in range(10):
    _, test_log_rets_df = generate_stock_prices(return_params, seed=i*111+222)

    # Features are the log-returns with the "colors" (i.e., labels) removed.
    X_test_df = test_log_rets_df.drop(columns='colors')
    y_test = test_log_rets_df['colors'].to_numpy()

    # Project with the already-fitted PCA, then classify.
    X_test_reduced = pca.transform(X_test_df)

    # Flatten the predictions so the comparison is element-wise whatever
    # shape predict() returns, then score the fraction of exact label matches.
    preds = np.asarray(pca_model.predict(X_test_reduced), dtype=float).ravel()
    accuracies.append(np.mean(preds == y_test))

print("PCA accuracy with", pca_components,"components:", round(np.mean(accuracies), 3), "+/-", round(np.std(accuracies), 5))

<img src="panicked_cat.png" width="90" style="float:left; margin-right: 10px;">
<h1> &nbsp; Section D: UMAP.</h1>
Please use the UMAP package to perform classification via dimensionality reduction.<br>
Please check the classification performance and explain the results.

In [None]:
# Re-generate the time series (this time we're only interested in the features).
# (The name must be log_rets_df: a misspelt target would leave an earlier cell's frame in use.)
_, log_rets_df = generate_stock_prices(return_params, seed=2000)

# The to-be-embedded data is the original log-returns dataframe, with the "colors" (i.e., labels) removed.
X_df = log_rets_df.drop(columns='colors')
# Labels as a 1-D Series, matching how the supervised reducer is fitted in the outlier-detection section.
y = log_rets_df['colors']

# Create both unsupervised & supervised dimensionality reducers.
unsup_reducer = umap.UMAP()
unsup_reducer.fit(X_df)
sup_reducer = umap.UMAP()
sup_reducer.fit(X_df, y=y)

# Embed the very data the reducers were just fitted on.
X_unsup = unsup_reducer.transform(X_df)
X_sup = sup_reducer.transform(X_df)

# Check outcome (graphically).
fig, axs = plt.subplots(1, 2, figsize=(20, 8))
axs[0].scatter(X_unsup[:,0], X_unsup[:,1], s=30, cmap='viridis', alpha=1.0, c=log_rets_df['colors'])
axs[1].scatter(X_sup[:,0], X_sup[:,1], s=30, cmap='viridis', alpha=1.0, c=log_rets_df['colors'])
axs[0].set_title('Log-returns embedded via unsupervised UMAP')
axs[1].set_title('Log-returns embedded via supervised UMAP');

In [None]:
# Independent data set: new seed, labels stripped to leave only the feature columns.
_, log_rets_df = generate_stock_prices(return_params, seed=3000)
X_df = log_rets_df.drop(columns='colors')

# Embed the new data with the reducers that were already fitted (no re-fitting here).
X_unsup = unsup_reducer.transform(X_df)
X_sup = sup_reducer.transform(X_df)

# Side-by-side view of the two embeddings, colored by regime label.
fig, axs = plt.subplots(1, 2, figsize=(20, 8))
panels = (
    (axs[0], X_unsup, 'Log-returns embedded via unsupervised UMAP'),
    (axs[1], X_sup, 'Log-returns embedded via supervised UMAP'),
)
for panel_ax, embedding, panel_title in panels:
    panel_ax.scatter(embedding[:,0], embedding[:,1], s=30, cmap='viridis', alpha=1.0, c=log_rets_df['colors'])
    panel_ax.set_title(panel_title)

In [None]:
# Independent data set: new seed; split into feature columns and the label column.
_, log_rets_df = generate_stock_prices(return_params, seed=4000)
X_df = log_rets_df.drop(columns='colors')
y = log_rets_df[['colors']]

# Embed the new data with the already-fitted reducers (no re-fitting here).
X_unsup = unsup_reducer.transform(X_df)
X_sup = sup_reducer.transform(X_df)

# One classifier per embedding: first on the unsupervised space, then on the supervised one.
unsup_model = CatBoostClassifier(iterations=400, silent=True)
unsup_model.fit(X_unsup, y)
sup_model = CatBoostClassifier(iterations=400, silent=True)
sup_model.fit(X_sup, y)

# Side-by-side view of the two embeddings the classifiers were trained on.
fig, axs = plt.subplots(1, 2, figsize=(20, 8))
panels = (
    (axs[0], X_unsup, 'Log-returns embedded via unsupervised UMAP'),
    (axs[1], X_sup, 'Log-returns embedded via supervised UMAP'),
)
for panel_ax, embedding, panel_title in panels:
    panel_ax.scatter(embedding[:,0], embedding[:,1], s=30, cmap='viridis', alpha=1.0, c=log_rets_df['colors'])
    panel_ax.set_title(panel_title)

In [None]:
# Cross-validate the results on independently-generated prices.
# Test data uses its own variable names so the frames/embeddings the
# classifiers were trained on are not overwritten by this loop.
sup_accuracies = []
unsup_accuracies = []
for i in tqdm(range(5)):

    _, test_log_rets_df = generate_stock_prices(return_params, seed=i*555+444)
    X_test_df = test_log_rets_df.drop(columns='colors')
    y_test = test_log_rets_df['colors'].to_numpy()

    # Use the existing ("trained") dimensionality reduction (UMAP) objects.
    X_test_sup = sup_reducer.transform(X_test_df)
    X_test_unsup = unsup_reducer.transform(X_test_df)

    # Flatten the predictions so the comparison is element-wise whatever
    # shape predict() returns.
    sup_preds = np.asarray(sup_model.predict(X_test_sup), dtype=float).ravel()
    unsup_preds = np.asarray(unsup_model.predict(X_test_unsup), dtype=float).ravel()

    # Accuracy = fraction of exact label matches.
    sup_accuracies.append(np.mean(sup_preds == y_test))
    unsup_accuracies.append(np.mean(unsup_preds == y_test))
print("Supervised dim. reduct. accuracy:", round(np.mean(sup_accuracies), 3), "+/-", round(np.std(sup_accuracies), 3))
print("Unsupervised dim. reduct. accuracy:", round(np.mean(unsup_accuracies), 3), "+/-", round(np.std(unsup_accuracies), 3))

<img src="desktop-computer-icon.png" width="90" style="float:left; margin-right: 10px;">
<h1> &nbsp; Section E: UMAP Outlier Detection.</h1>
Now use the UMAP package to perform outlier detection via dimensionality reduction.<br>
How would we go about using the results to detect outliers in real time?

In [None]:
# Simulate a price path that ends with the extra "tail" segment, and display it.
prices_df, _ = generate_stock_prices(return_params, seed=5001, add_tail=True)

plt.rcParams['figure.figsize'] = [20, 8]
fig, ax = plt.subplots()
ax.plot(prices_df['AAPL'], label='AAPL')
ax.set(xlabel='Day Seq. Number', title='AAPL Prices')

# Dotted vertical markers at the regime transition days.
for regime_start_day in regime_start_days:
    ax.axvline(regime_start_day, linestyle=':')

ax.legend();

In [None]:
# New path including the tail segment; keep the labels and the label-free feature columns.
prices_df, log_rets_df = generate_stock_prices(return_params, seed=2021, add_tail=True)
y = log_rets_df['colors']
X_df = log_rets_df.drop(columns='colors')

# Fit a fresh pair of reducers on this data: one without labels, one with.
unsup_reducer = umap.UMAP()
unsup_reducer.fit(X_df)
sup_reducer = umap.UMAP()
sup_reducer.fit(X_df, y=y)

# Embed the very data the reducers were just fitted on.
X_unsup = unsup_reducer.transform(X_df)
X_sup = sup_reducer.transform(X_df)

# Side-by-side view of the two embeddings, colored by label.
fig, axs = plt.subplots(1, 2, figsize=(20, 8))
panels = (
    (axs[0], X_unsup, 'Log-returns embedded via unsupervised UMAP'),
    (axs[1], X_sup, 'Log-returns embedded via supervised UMAP'),
)
for panel_ax, embedding, panel_title in panels:
    panel_ax.scatter(embedding[:,0], embedding[:,1], s=10, cmap='viridis', alpha=1.0, c=log_rets_df['colors'])
    panel_ax.set_title(panel_title)