In [None]:
import lightning as pl
import logging
from torch.utils.data import DataLoader, Subset
import pandas as pd
from tqdm import tqdm
import numpy as np

from neural_bandits.bandits.linear_ts_bandit import LinearTSBandit
from neural_bandits.benchmark.datasets.statlog import StatlogDataset

from neural_bandits.benchmark.environment import BanditBenchmarkEnvironment
from neural_bandits.utils.selectors import ArgMaxSelector

In [None]:
# Instantiate the Statlog benchmark dataset; later cells read it through `dataset`.
dataset = StatlogDataset()
# Show `context_size` (passed to the bandit as its feature count further down)
# and the dataset length, one value per line.
print(dataset.context_size, len(dataset), sep="\n")

In [None]:
# Experiment configuration, gathered in one place so a run can be tuned and repeated.
SEED = 42
N_SAMPLES = 10000
BATCH_SIZE = 32
accelerator = "cpu"

# Seed the RNGs before anything random happens: the loader below shuffles (and the
# bandit presumably samples), so without a seed Restart & Run All gives different
# rewards/regrets on every run.
pl.seed_everything(SEED)

# Use the first N_SAMPLES samples, clamped to the dataset length so a smaller
# dataset does not produce out-of-range indices, served in shuffled mini-batches.
train_indices = range(min(N_SAMPLES, len(dataset)))
train_loader = DataLoader(
    Subset(dataset, train_indices),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

env = BanditBenchmarkEnvironment(train_loader, device=accelerator)

bandit_module = LinearTSBandit(
    n_features=dataset.context_size,
    selector=ArgMaxSelector(),
    lazy_uncertainty_update=True,
).to(accelerator)

# Raise Lightning's rank-zero logger to FATAL so lower-severity messages are dropped
# (presumably to keep the per-batch Trainer output out of the notebook).
logging.getLogger("lightning.pytorch.utilities.rank_zero").setLevel(logging.FATAL)

In [None]:
# Online bandit loop: choose actions for each batch, observe feedback, update the model.
#
# Per-batch results are collected in lists and concatenated once at the end.
# Calling np.append inside the loop would copy the whole history on every batch,
# and recomputing the mean over that history each step is equally wasteful, so a
# running sum/count drives the "avg_regret" display instead.
reward_chunks = []
regret_chunks = []
regret_total = 0.0
regret_count = 0

progress = tqdm(iter(env), total=len(env))
for contextualized_actions in progress:
    chosen_actions, _ = bandit_module.forward(contextualized_actions)

    # A fresh single-epoch Trainer is built for every batch.
    trainer = pl.Trainer(
        max_epochs=1,
        enable_progress_bar=False,
        enable_model_summary=False,
        accelerator=accelerator,
    )
    chosen_contextualized_actions, realized_rewards = env.get_feedback(chosen_actions)
    batch_regret = env.compute_regret(chosen_actions)

    # Flatten and store as float64 so the final arrays have the same layout and
    # dtype that appending to an empty float array would have produced.
    batch_rewards_np = realized_rewards.cpu().numpy().astype(np.float64).ravel()
    batch_regret_np = batch_regret.cpu().numpy().astype(np.float64).ravel()
    reward_chunks.append(batch_rewards_np)
    regret_chunks.append(batch_regret_np)
    regret_total += batch_regret_np.sum()
    regret_count += batch_regret_np.size

    progress.set_postfix({
        "reward": realized_rewards.mean().item(),
        "regret": batch_regret.mean().item(),
        # Guard against an empty first batch rather than dividing by zero.
        "avg_regret": regret_total / regret_count if regret_count else float("nan"),
    })

    # Hand the observed feedback to the bandit, then call trainer.fit on the module.
    bandit_module.record_feedback(chosen_contextualized_actions, realized_rewards)
    trainer.fit(bandit_module)
    # Because of this: https://github.com/Lightning-AI/pytorch-lightning/issues/10294,
    # we need to move the model to the desired device.
    bandit_module = bandit_module.to(accelerator)

# np.concatenate rejects an empty list, so fall back to empty arrays when the
# environment yielded no batches.
rewards = np.concatenate(reward_chunks) if reward_chunks else np.array([])
regrets = np.concatenate(regret_chunks) if regret_chunks else np.array([])

# One row per recorded sample, in the order the batches were processed.
metrics = pd.DataFrame({
    "reward": rewards,
    "regret": regrets,
})
metrics

In [None]:
# Cumulative reward and regret over the first PLOT_HORIZON rows of the in-memory
# `metrics` frame built by the training loop (nothing is read back from a logger).
# pandas/numpy are already imported in the notebook's import cell.
PLOT_HORIZON = 5000

# `.iloc` makes the positional slice explicit; NaN regrets are dropped before summing.
cumulative_reward = metrics["reward"].iloc[:PLOT_HORIZON].cumsum()
cumulative_regret = metrics["regret"].iloc[:PLOT_HORIZON].dropna().cumsum()

In [None]:
import matplotlib.pyplot as plt

# Draw both cumulative curves on a single set of axes using the explicit
# figure/axes interface, then label the axes and show the legend.
fig, ax = plt.subplots()
for curve, curve_label in (
    (cumulative_reward, "reward"),
    (cumulative_regret, "regret"),
):
    ax.plot(curve, label=curve_label)
ax.set_xlabel("steps")
ax.set_ylabel("cumulative reward/regret")
ax.legend()
plt.show()

In [None]:
# Mean reward, then mean regret, over the first 100 / 1000 / 10000 recorded rows.
#
# `.mean()` divides by the number of values actually present in the slice, so the
# figure stays a true average when fewer rows exist than the horizon or when NaN
# regrets have been dropped (a fixed divisor would bias it towards zero).
horizons = (100, 1000, 10000)

for horizon in horizons:
    # skipna=False: a NaN reward still surfaces as NaN rather than being hidden.
    print(metrics["reward"].iloc[:horizon].mean(skipna=False))

for horizon in horizons:
    print(metrics["regret"].iloc[:horizon].dropna().mean())