# Debugging Differences

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from collections import defaultdict
import random
import os
from linucb import linucb_all

In [None]:
# Pin both global random number generators (Python's stdlib `random` and
# NumPy's legacy global RNG) to the same seed so reruns are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Read the four logged-bandit arrays in one pass; the order of the paths
# matches the order of the names on the left-hand side.
X_context, actions, rewards, pscores = map(
    np.load,
    (
        "processed_top3/context_reduced.npy",
        "processed_top3/actions.npy",
        "processed_top3/rewards.npy",
        "processed_top3/pscores.npy",
    ),
)

# Plain-text side file; its first line is skipped as a header.
meta = np.genfromtxt("processed_top3/meta.txt", skip_header=1)

# One row per logged round, one column per context feature.
(n_rounds, context_dim) = X_context.shape

# Number of arms = number of distinct action labels that appear in the log.
n_arms = np.unique(actions).size

print(f"Context shape: {X_context.shape}, #Arms: {n_arms}")

Context shape: (53988, 12), #Arms: 3


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Doubly-robust (DR) estimate of the mean reward of a uniformly random policy,
# built from the logged data (`X_context`, `actions`, `rewards`, `pscores`):
#
#   dr_i = r_hat(x_i, a_sim_i)
#          + 1[a_sim_i == a_logged_i] * (r_i - r_hat(x_i, a_sim_i)) / p_i
#
# where r_hat is a random-forest reward model fit on (context, one-hot action).
# NOTE(review): `p` is presumably the logging policy's propensity for the
# logged action -- confirm against how pscores.npy was produced.

# Fix seed, logging data
np.random.seed(123)  # legacy global RNG; kept so global state matches earlier runs
rng = np.random.default_rng(123)  # generator used to draw the simulated actions
a_logged = actions
r_logged = rewards
X = X_context
p = pscores

# One-hot encode the logged actions and append them to the context features
onehot = OneHotEncoder(sparse_output=False)
a_logged_1hot = onehot.fit_transform(a_logged.reshape(-1, 1))
X_train = np.hstack([X, a_logged_1hot])

# Train reward model with fixed seed
reward_model = RandomForestRegressor(n_estimators=100, random_state=123)
reward_model.fit(X_train, r_logged)

# Simulated random policy (fixed random actions).
# Draws labels from 0..n_arms-1; assumes the logged action labels use the same
# coding, otherwise `onehot.transform` will reject the unseen labels.
a_sim = rng.integers(0, n_arms, size=len(X))
a_sim_1hot = onehot.transform(a_sim.reshape(-1, 1))
X_sim = np.hstack([X, a_sim_1hot])
r_hat_sim = reward_model.predict(X_sim)

# Importance-weighted residual, applied only where the simulated action equals
# the logged one (the only rounds for which the observed reward is informative).
matches = a_sim == a_logged
# The buffer must be floating point: np.zeros_like(r_logged) alone would inherit
# r_logged's dtype, and with integer rewards the masked assignment below would
# silently truncate every fractional correction toward zero.
correction = np.zeros_like(r_logged, dtype=np.float64)
correction[matches] = (r_logged[matches] - r_hat_sim[matches]) / p[matches]
dr_sim = r_hat_sim + correction

print("Fixed DR reward mean for random policy:", dr_sim.mean())

Fixed DR reward mean for random policy: 0.06645223966291249


In [2]:
# Reload the same four arrays from disk, this time under singular names
# (`context`, `action`, `reward`, `pscore`); path order matches name order.
context, action, reward, pscore = map(
    np.load,
    (
        "processed_top3/context_reduced.npy",
        "processed_top3/actions.npy",
        "processed_top3/rewards.npy",
        "processed_top3/pscores.npy",
    ),
)

In [8]:
# Problem dimensions: the arm count is hard-coded to 3 here (and exposed under
# both names); the feature count is the second axis of `context`.
n_arms = n_actions = 3
context_dim = np.shape(context)[1]

In [9]:
# Expose the reloaded data under the names the estimator cell reads.
# `X_context` is a plain alias; the other three go through np.array, which
# returns a fresh copy of each input by default.
X_context = context
a_logged, r_logged, pscore = (
    np.array(logged) for logged in (action, reward, pscore)
)
# Left disabled: `results` is not defined in the cells above.
#a_target = np.array(results["chosen_actions"])

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Doubly-robust (DR) estimate of the mean reward of a uniformly random policy,
# recomputed from the reloaded arrays (`X_context`, `action`, `reward`,
# `pscore`) so the printed mean can be compared with the earlier run:
#
#   dr_i = r_hat(x_i, a_sim_i)
#          + 1[a_sim_i == a_logged_i] * (r_i - r_hat(x_i, a_sim_i)) / p_i
#
# where r_hat is a random-forest reward model fit on (context, one-hot action).
# NOTE(review): `p` is presumably the logging policy's propensity for the
# logged action -- confirm against how pscores.npy was produced.

# Fix seed, logging data
np.random.seed(123)  # legacy global RNG; kept so global state matches earlier runs
rng = np.random.default_rng(123)  # generator used to draw the simulated actions
a_logged = action
r_logged = reward
X = X_context
p = pscore

# One-hot encode the logged actions and append them to the context features
onehot = OneHotEncoder(sparse_output=False)
a_logged_1hot = onehot.fit_transform(a_logged.reshape(-1, 1))
X_train = np.hstack([X, a_logged_1hot])

# Train reward model with fixed seed
reward_model = RandomForestRegressor(n_estimators=100, random_state=123)
reward_model.fit(X_train, r_logged)

# Simulated random policy (fixed random actions).
# Draws labels from 0..n_arms-1; assumes the logged action labels use the same
# coding, otherwise `onehot.transform` will reject the unseen labels.
a_sim = rng.integers(0, n_arms, size=len(X))
a_sim_1hot = onehot.transform(a_sim.reshape(-1, 1))
X_sim = np.hstack([X, a_sim_1hot])
r_hat_sim = reward_model.predict(X_sim)

# Importance-weighted residual, applied only where the simulated action equals
# the logged one (the only rounds for which the observed reward is informative).
matches = a_sim == a_logged
# The buffer must be floating point: np.zeros_like(r_logged) alone would inherit
# r_logged's dtype, and with integer rewards the masked assignment below would
# silently truncate every fractional correction toward zero.
correction = np.zeros_like(r_logged, dtype=np.float64)
correction[matches] = (r_logged[matches] - r_hat_sim[matches]) / p[matches]
dr_sim = r_hat_sim + correction

print("Fixed DR reward mean for random policy:", dr_sim.mean())

Fixed DR reward mean for random policy: 0.06645223966291249
