<a href="https://colab.research.google.com/github/markoo26/thehappymountain/blob/main/Experimentation_(part_4_Contextual_Bandits).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title 🚚 Imports

import numpy as np
import scipy
import scipy.stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [1]:
#@title 🌱 Random seed setup

# Seed NumPy's global random number generator so later draws are repeatable.
# This needs `np` from the imports cell, so that cell has to be run first.
np.random.seed(42)

NameError: name 'np' is not defined

In [2]:
#@title 🔧 Functions

def measure_viewing_time(context, action_weights):
    """Simulate the viewing time for one context/action pairing.

    The score is the mean of the element-wise product of ``context`` and
    ``action_weights`` (i.e. their dot product divided by the number of
    features). A standard-normal draw scaled by 0.1 is added to the score and
    the sum is exponentiated, so the result is always positive.
    """
    weighted_features = context * action_weights
    score = weighted_features.mean()
    noise = 0.1 * np.random.normal()
    return np.exp(score + noise)


class Sample:
    """One logged interaction: the context seen, the action taken, the reward earned.

    Instances are plain records; the three constructor arguments are stored
    unchanged as the attributes ``context``, ``action`` and ``reward``.
    """

    def __init__(self, context, action, reward):
        self.context, self.action, self.reward = context, action, reward

def collect_logs_by_action(num_actions, logs):
    """Split logged samples into per-action reward and context lists.

    Returns ``(rewards_by_action, contexts_by_action)``. Each is a list of
    ``num_actions`` lists; entry ``a`` holds, in log order, the rewards
    (respectively contexts) of the samples whose ``action`` equals ``a``.
    Actions that never appear in ``logs`` keep an empty list.
    """
    rewards_by_action = []
    contexts_by_action = []
    for _ in range(num_actions):
        rewards_by_action.append([])
        contexts_by_action.append([])

    for entry in logs:
        rewards_by_action[entry.action].append(entry.reward)
        contexts_by_action[entry.action].append(entry.context)

    return rewards_by_action, contexts_by_action

def build_models(num_features, samples_y, samples_x):
    """Fit one least-squares linear model per action.

    Args:
        num_features: Length of the coefficient vector returned for an action
            that has no data.
        samples_y: Per-action sequences of observed rewards.
        samples_x: Per-action sequences of context vectors, aligned with
            ``samples_y``.

    Returns:
        A list with one coefficient vector per action. For an action with
        data the vector is ``pinv(X^T X) @ X^T @ y``; for an action without
        data it is a zero vector of length ``num_features``.
    """
    betas = []
    for rewards, contexts in zip(samples_y, samples_x):  # one pass per action
        if len(rewards) == 0:
            # Nothing logged for this action yet: fall back to all-zero coefficients.
            betas.append(np.zeros(shape=(num_features,)))
            continue
        y = np.asarray(rewards)
        x = np.asarray(contexts)
        # The pseudo-inverse keeps the fit defined even when X^T X is singular
        # (e.g. a feature that was never switched on for this action).
        betas.append(np.linalg.pinv(x.T @ x) @ x.T @ y)
    return betas

class RecommenderGreedy:
    """Greedy recommender: always plays the action with the highest predicted reward.

    Predictions come from one linear model (coefficient vector) per action.
    The models start out random (``reset``) and are replaced by least-squares
    fits on logged data (``fit_offline``).
    """

    def __init__(self, num_features, num_actions):
        """Store the context dimension and the number of available actions."""
        self._num_features = num_features
        self._num_actions = num_actions

    def reset(self):
        """Draw fresh standard-normal coefficients for every action."""
        # Use the sizes given to the constructor rather than notebook globals,
        # so the instance works regardless of what else is defined.
        self._betas = [
            np.random.normal(size=(self._num_features,))
            for _ in range(self._num_actions)
        ]

    def fit_offline(self, logs):
        """Refit the per-action linear models on the logged samples."""
        samples_y, samples_x = collect_logs_by_action(self._num_actions, logs)
        self._betas = build_models(self._num_features, samples_y, samples_x)

    def policy(self, context):
        """Return the index of the action with the largest predicted reward.

        Ties resolve to the lowest action index (``np.argmax`` returns the
        first maximum).
        """
        predictions = [
            context @ self._betas[action] for action in range(self._num_actions)
        ]
        return int(np.argmax(predictions))

def log_production_data(action_weights, recommender, num_decisions=100):
    """Simulate one batch of production traffic and log every interaction.

    Args:
        action_weights: 2-D array indexed as ``[feature, action]``; column
            ``a`` holds the true weights used to simulate rewards for action ``a``.
        recommender: Object exposing ``policy(context)`` that returns an
            action index.
        num_decisions: Number of simulated decisions (default 100, the value
            that used to be hard-coded).

    Returns:
        ``(avg_viewing_time, logs)`` where ``logs`` is a list of ``Sample``
        objects, one per decision.

    Raises:
        ValueError: If ``num_decisions`` is not positive (the average would
            be undefined).
    """
    if num_decisions < 1:
        raise ValueError("num_decisions must be a positive integer")

    num_features = len(action_weights)
    logs = []
    total_viewing_time = 0
    for _ in range(num_decisions):
        # Random binary features describing the user.
        context = np.random.randint(2, size=(num_features,))
        context[0] = 1  # first "feature" is a constant: the intercept term
        action = recommender.policy(context)
        viewing_time = measure_viewing_time(
            context=context, action_weights=action_weights[:, action]
        )
        logs.append(Sample(context, action, viewing_time))
        total_viewing_time += viewing_time
    avg_viewing_time = total_viewing_time / num_decisions
    return avg_viewing_time, logs

def run_experiment_sequence(action_weights, num_actions, recommender, num_days=14):
    """Simulate consecutive days in which the recommender retrains on all data so far.

    Each day the current policy generates a batch of logs via
    ``log_production_data``; the logs are appended to the running history and
    the recommender is refitted on the full history.

    Args:
        action_weights: True per-action weights passed to ``log_production_data``.
        num_actions: Accepted for interface compatibility; not used here.
        recommender: Object exposing ``reset()``, ``policy(context)`` and
            ``fit_offline(logs)``.
        num_days: Number of simulated days (default 14, the value that used
            to be hard-coded).

    Returns:
        1-D array with the average viewing time of each day.
    """
    avg_viewing_times = []
    all_logs = []
    recommender.reset()
    for _ in range(num_days):
        avg_viewing_time, logs = log_production_data(action_weights, recommender)
        avg_viewing_times.append(avg_viewing_time)
        all_logs.extend(logs)
        recommender.fit_offline(all_logs)  # all data from day one till now

    return np.array(avg_viewing_times)

def run_sequences(action_weights, num_actions, recommender, num_runs=10):
    """Repeat the multi-day experiment and summarise the per-day results.

    Args:
        action_weights: True per-action weights for the simulation.
        num_actions: Forwarded to ``run_experiment_sequence``.
        recommender: Recommender under test; the same object is reused for
            every run (``run_experiment_sequence`` resets it at the start).
        num_runs: Number of independent repetitions (default 10, the value
            that used to be hard-coded).

    Returns:
        ``(mean, se)``: per-day mean across runs, and per-day standard
        deviation across runs divided by ``sqrt(num_runs)``. The standard
        deviation uses NumPy's default ``ddof=0``.

    Raises:
        ValueError: If ``num_runs`` is not positive.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be a positive integer")

    avg_viewing_times = np.array([
        run_experiment_sequence(action_weights, num_actions, recommender)
        for _ in range(num_runs)
    ])
    mean = avg_viewing_times.mean(axis=0)
    se = avg_viewing_times.std(axis=0) / np.sqrt(num_runs)
    return mean, se

class RecommenderEpsilonGreedy:
    """Epsilon-greedy recommender.

    With probability ``eps`` a uniformly random action is played (exploration);
    otherwise the action with the highest predicted reward under the current
    per-action linear models is played (exploitation).
    """

    def __init__(self, num_features, num_actions, eps=0.1):
        """Store the context dimension, the number of actions and the exploration rate."""
        self._num_features = num_features
        self._num_actions = num_actions
        self._eps = eps

    def reset(self):
        """Draw fresh standard-normal coefficients for every action."""
        # Use the sizes given to the constructor rather than notebook globals.
        self._betas = [
            np.random.normal(size=(self._num_features,))
            for _ in range(self._num_actions)
        ]

    def fit_offline(self, logs):
        """Refit the per-action linear models on the logged samples."""
        samples_y, samples_x = collect_logs_by_action(self._num_actions, logs)
        self._betas = build_models(self._num_features, samples_y, samples_x)

    def policy(self, context):
        """Pick a random action with probability ``eps``, else the best predicted one.

        In the greedy branch ties resolve to the lowest action index.
        """
        if np.random.uniform(0, 1) < self._eps:
            return np.random.randint(0, self._num_actions)
        predictions = [
            context @ self._betas[action] for action in range(self._num_actions)
        ]
        return int(np.argmax(predictions))

class RecommenderThompsonSampling:
    """Thompson-sampling recommender backed by bootstrap-resampled linear models.

    ``num_bs_samples`` complete model sets (one coefficient vector per action)
    are kept. Each decision picks one model set uniformly at random and plays
    the action that set predicts to be best.
    """

    def __init__(
        self, num_features, num_actions,
        num_bs_samples
    ):
        """Store the context dimension, the number of actions and the number of bootstrap model sets."""
        self._num_features = num_features
        self._num_actions = num_actions
        self._num_bs_samples = num_bs_samples

    def reset(self):
        """Fill every bootstrap slot with random standard-normal coefficients."""
        # Use the sizes given to the constructor rather than notebook globals.
        self._betas = [
            [
                np.random.normal(size=(self._num_features,))
                for _ in range(self._num_actions)
            ]
            for _ in range(self._num_bs_samples)
        ]

    def _bs_sample(self, samples_y, samples_x):
        """Resample each action's data with replacement, keeping its size.

        Actions without data are passed through as empty arrays.
        """
        bs_samples_y = []
        bs_samples_x = []
        for action in range(self._num_actions):
            y = np.array(samples_y[action])
            x = np.array(samples_x[action])
            if len(y) > 0:
                # The same row indices are applied to rewards and contexts so
                # that each reward stays paired with its context.
                i = np.random.randint(0, len(y), size=(len(y),))
                y = y[i]
                x = x[i, :]
            bs_samples_y.append(y)
            bs_samples_x.append(x)
        return bs_samples_y, bs_samples_x

    def fit_offline(self, logs):
        """Fit one model set per bootstrap resample of the logged data."""
        samples_y, samples_x = collect_logs_by_action(self._num_actions, logs)
        self._betas = []
        for _ in range(self._num_bs_samples):
            bs_samples_y, bs_samples_x = self._bs_sample(samples_y, samples_x)
            self._betas.append(
                build_models(self._num_features, bs_samples_y, bs_samples_x)
            )

    def policy(self, context):
        """Sample one bootstrap model set and return its best action for ``context``.

        Ties within the sampled model set resolve to the lowest action index.
        """
        i_beta = np.random.randint(0, self._num_bs_samples)
        beta = self._betas[i_beta]
        predictions = [context @ beta[action] for action in range(self._num_actions)]
        return int(np.argmax(predictions))

class RecommenderThompsonSamplingInstrumented:
    """Thompson-sampling recommender that also tracks how confident each decision was.

    For every decision it records p(best): the fraction of bootstrap model
    sets that agree the chosen action is the best one for the given context.
    ``mean_vs_day`` collects one summary value per ``fit_offline`` call.
    """

    def __init__(
        self, num_features, num_actions,
        num_bs_samples
    ):
        """Store the context dimension, the number of actions and the number of bootstrap model sets."""
        self._num_features = num_features
        self._num_actions = num_actions
        self._num_bs_samples = num_bs_samples

    def reset(self):
        """Randomise all bootstrap model sets and clear the confidence tracking."""
        # Use the sizes given to the constructor rather than notebook globals.
        self._betas = [
            [
                np.random.normal(size=(self._num_features,))
                for _ in range(self._num_actions)
            ]
            for _ in range(self._num_bs_samples)
        ]
        self._p_best = []
        self.mean_vs_day = []

    def _bs_sample(self, samples_y, samples_x):
        """Resample each action's data with replacement, keeping its size.

        Actions without data are passed through as empty arrays.
        """
        bs_samples_y = []
        bs_samples_x = []
        for action in range(self._num_actions):
            y = np.array(samples_y[action])
            x = np.array(samples_x[action])
            if len(y) > 0:
                # The same row indices are applied to rewards and contexts so
                # that each reward stays paired with its context.
                i = np.random.randint(0, len(y), size=(len(y),))
                y = y[i]
                x = x[i, :]
            bs_samples_y.append(y)
            bs_samples_x.append(x)
        return bs_samples_y, bs_samples_x

    def fit_offline(self, logs):
        """Record the running mean p(best), then refit all bootstrap model sets.

        ``_p_best`` is only cleared by ``reset``, so the value appended to
        ``mean_vs_day`` is the mean over every decision made since the last
        reset, not just the most recent batch.
        NOTE(review): confirm the cumulative mean (rather than a per-day mean)
        is what the "Avg p(best)" plot is meant to show.
        """
        self.mean_vs_day.append(np.array(self._p_best).mean())

        samples_y, samples_x = collect_logs_by_action(self._num_actions, logs)
        self._betas = []
        for _ in range(self._num_bs_samples):
            bs_samples_y, bs_samples_x = self._bs_sample(samples_y, samples_x)
            self._betas.append(
                build_models(self._num_features, bs_samples_y, bs_samples_x)
            )

    def _best_post(self, context, beta):
        """Return the action that model set ``beta`` predicts to be best for ``context``.

        Ties resolve to the lowest action index.
        """
        predictions = [context @ beta[action] for action in range(self._num_actions)]
        return int(np.argmax(predictions))

    def policy(self, context):
        """Choose an action by Thompson sampling and log its p(best).

        The best action under every bootstrap model set is computed first; one
        model set is then drawn at random and its best action is played. The
        share of model sets that agree with that choice is appended to
        ``_p_best``.
        """
        best_posts = [
            self._best_post(context, self._betas[i_beta])
            for i_beta in range(self._num_bs_samples)
        ]

        i_beta = np.random.randint(self._num_bs_samples)
        action_best = best_posts[i_beta]

        p_post = best_posts.count(action_best) / self._num_bs_samples
        self._p_best.append(p_post)
        return action_best

In [3]:
#@title 📊 Histogram of action weights
# One fixed weight vector; 1000 simulated viewing times for random contexts.
action_weights = np.random.normal(size=(5,))

viewing_time = []
for _ in range(1000):
    user_context = np.random.normal(size=(5,))
    viewing_time.append(
        measure_viewing_time(context=user_context, action_weights=action_weights)
    )

df = pd.DataFrame({"viewing_time": viewing_time})

# Last expression of the cell, so the notebook renders the figure.
px.histogram(df)

In [4]:
#@title 🧑‍✈️ Run first RecommenderGreedy and plot
num_features = 5
num_actions = 30

np.random.seed(17)
# The data-generating process: one column of true weights per action, fixed for all runs.
action_weights = np.random.normal(size=(num_features, num_actions))
recommender = RecommenderGreedy(num_features, num_actions)
mean, se = run_sequences(action_weights, num_actions, recommender)
betas_g = recommender._betas

df = pd.DataFrame({
    "episode": list(range(len(mean))),
    "mean": mean,
    "std": se,  # standard error of the mean across runs
})

# The shaded band spans mean +/- half a standard error; the legend says so
# explicitly because this is much narrower than a 95% confidence interval.
df["mean_upper"] = df["mean"] + df["std"] / 2
df["mean_lower"] = df["mean"] - df["std"] / 2

df_means = df.melt(id_vars=['episode'], value_vars=['mean'])
# Plateau level: average of the daily means from day 4 onwards.
asymp = mean[4:].mean()
fig = px.line(df_means, x='episode', y='value', color='variable', markers=True)

# A single band for the single method in this plot. The upper edge is traced
# left-to-right and the lower edge right-to-left so 'toself' fills the area
# between them.
fig.add_traces([
    go.Scatter(
        x=np.concatenate([df["episode"], df["episode"][::-1]]),
        y=np.concatenate([df["mean_upper"], df["mean_lower"][::-1]]),
        fill='toself',
        fillcolor='rgba(0, 0, 255, 0.2)',
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=True,
        name="Greedy (mean +/- 0.5 SE)"
    )
])

fig.add_hline(y=asymp, line_dash='dot')

# Anchor the annotation on the computed plateau instead of a hard-coded value.
fig.add_annotation(
    x=7,
    ax=2,
    y=asymp,
    text=f"asymptote, mean viewing time = {asymp:.2f}",
    font=dict(size=12, color="black"),
    showarrow=True,
    arrowhead=3
)
fig.show()

In [5]:
#@title 💰 Epsilon-greedy exploration
# Worked example for action = 1: what a least-squares fit predicts when a
# feature has, or has not, been observed in the training data.
def _fit_least_squares(features, targets):
    """Return pinv(X^T X) @ (X^T y) for design matrix X and targets y."""
    return np.linalg.pinv(features.T @ features) @ (features.T @ targets)


# Case 1: every feature appears in the data (no missing data).
contexts = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
rewards = [.6, .9, 1.3]
x = np.array(contexts)
y = np.array(rewards)
beta_1 = _fit_least_squares(x, y)
print (beta_1)

# Predicted response of user a to action=1.
context_a = [0,0,1]
print (context_a @ beta_1)

# Predicted response of user b to action=1.
context_b = [1,0,1]
print (context_b @ beta_1)


# Case 2: no sample ever has feature #3 switched on.
contexts = [[1, 0, 0], [0, 1, 0]]
rewards = [0.6, 0.9]
x = np.array(contexts)
y = np.array(rewards)
beta_1m = _fit_least_squares(x, y)
print (beta_1m)

# Predicted response of user a to action=1.
print (context_a @ beta_1m)

# Predicted response of user b to action=1.
print (context_b @ beta_1m)

[0.6 0.9 1.3]
1.3
1.9
[0.6 0.9 0. ]
0.0
0.6


In [6]:
#@title 👍 Run first RecommenderEpsilonGreedy and plot
recommender = RecommenderEpsilonGreedy(num_features, num_actions, eps=0.1)
mean_eps, se_eps = run_sequences(action_weights, num_actions, recommender)
betas_eg = recommender._betas

# `mean` / `se` are the greedy results computed in the earlier greedy cell.
df = pd.DataFrame({
    "episode": list(range(len(mean))),
    "mean": mean,
    "std": se,
    "mean_eps": mean_eps,
    "std_eps": se_eps,
})

# Band edges for each method: mean +/- half a standard error.
for method in ['', '_eps']:
    df[f'mean{method}_upper'] = df[f'mean{method}'] + df[f'std{method}'] / 2
    df[f'mean{method}_lower'] = df[f'mean{method}'] - df[f'std{method}'] / 2

df_means = df.melt(id_vars=['episode'], value_vars=['mean', 'mean_eps'])
# Plateau level of the greedy curve: average of its daily means from day 4 onwards.
asymp = mean[4:].mean()
fig = px.line(df_means, x='episode', y='value', color='variable', markers=True)

# Exactly one shaded band per method, each labelled with the method it belongs to.
bands = [
    ('', 'rgba(0, 0, 255, 0.2)', "Greedy (mean +/- 0.5 SE)"),
    ('_eps', 'rgba(255, 0, 0, 0.2)', "Epsilon-greedy (mean +/- 0.5 SE)"),
]
fig.add_traces([
    go.Scatter(
        # Upper edge left-to-right, lower edge right-to-left, so 'toself'
        # fills the area between them.
        x=np.concatenate([df["episode"], df["episode"][::-1]]),
        y=np.concatenate([
            df[f"mean{suffix}_upper"], df[f"mean{suffix}_lower"][::-1]
        ]),
        fill='toself',
        fillcolor=color,
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=True,
        name=label
    )
    for suffix, color, label in bands
])

fig.add_hline(y=asymp, line_dash='dot')

# Anchor the annotation on the computed plateau instead of a hard-coded value.
fig.add_annotation(
    x=7,
    ax=2,
    y=asymp,
    text=f"asymptote, mean viewing time = {asymp:.2f}",
    font=dict(size=12, color="black"),
    showarrow=True,
    arrowhead=3
)
fig.show()

In [7]:
#@title 👩‍🏫 Thompson sampling, plot bootstrapped visits (bs_visits)

# 100 synthetic visit counts: 3 plus the integer part of a uniform draw on [0, 5).
visit_counts = []
for _ in range(100):
    visit_counts.append(3 + int(5 * np.random.uniform()))
visits = np.array(visit_counts)

# Bootstrap resample: draw as many indices as there are observations, with replacement.
i = np.random.randint(len(visits), size=len(visits))
bs_visits = visits[i]

# Mean and standard deviation of the original sample, then of the resample.
print (visits.mean(), visits.std())
print (bs_visits.mean(), bs_visits.std())

df = pd.DataFrame({"visits": visits, "bs_visits": bs_visits})
df = df.melt(value_vars=['visits', 'bs_visits'])

# Last expression of the cell, so the notebook renders the side-by-side histograms.
px.histogram(df, x='value', color='variable', facet_col='variable')

5.1 1.438749456993816
5.16 1.4402777509911067


In [8]:
#@title 🛹 Run first RecommenderThompsonSampling and plot
recommender = RecommenderThompsonSampling(num_features, num_actions, num_bs_samples=30)
mean_ts, se_ts = run_sequences(action_weights, num_actions, recommender)

# `mean` / `se` (greedy) and `mean_eps` / `se_eps` (epsilon-greedy) come from
# the earlier cells.
df = pd.DataFrame({
    "episode": list(range(len(mean))),
    "mean": mean,
    "std": se,
    "mean_eps": mean_eps,
    "std_eps": se_eps,
    "mean_ts": mean_ts,
    "std_ts": se_ts,
})

# Band edges for each method: mean +/- half a standard error.
for method in ['', '_eps', '_ts']:
    df[f'mean{method}_upper'] = df[f'mean{method}'] + df[f'std{method}'] / 2
    df[f'mean{method}_lower'] = df[f'mean{method}'] - df[f'std{method}'] / 2

df_means = df.melt(id_vars=['episode'], value_vars=['mean', 'mean_eps', 'mean_ts'])
fig = px.line(df_means, x='episode', y='value', color='variable', markers=True)

# Exactly one shaded band per method, each labelled with the method it belongs to.
bands = [
    ('', 'rgba(0, 0, 255, 0.2)', "Greedy (mean +/- 0.5 SE)"),
    ('_eps', 'rgba(255, 0, 0, 0.2)', "Epsilon-greedy (mean +/- 0.5 SE)"),
    ('_ts', 'rgba(0, 255, 0, 0.2)', "Thompson sampling (mean +/- 0.5 SE)"),
]
fig.add_traces([
    go.Scatter(
        # Upper edge left-to-right, lower edge right-to-left, so 'toself'
        # fills the area between them.
        x=np.concatenate([df["episode"], df["episode"][::-1]]),
        y=np.concatenate([
            df[f"mean{suffix}_upper"], df[f"mean{suffix}_lower"][::-1]
        ]),
        fill='toself',
        fillcolor=color,
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=True,
        name=label
    )
    for suffix, color, label in bands
])

fig.show()

In [9]:
#@title 🎰 Run and plot RecommenderThompsonSamplingInstrumented and show AVG p(best) action
recommender = RecommenderThompsonSamplingInstrumented(
    num_features,
    num_actions,
    num_bs_samples=30,
)
# Only the side effect matters here: the recommender fills `mean_vs_day` while
# it runs, so the returned (mean, se) pair is discarded.
run_sequences(action_weights, num_actions, recommender)

df = pd.DataFrame(recommender.mean_vs_day, columns=['mean_vs_day'])

# Last expression of the cell, so the notebook renders the figure.
px.line(df, markers=True, title='Avg p(best) action')