In [5]:
!pip install numpy matplotlib

Defaulting to user installation because normal site-packages is not writeable
Collecting numpy
  Downloading numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting matplotlib
  Downloading matplotlib-3.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.60.2-cp39-cp39-macosx_10_9_universal2.whl.metadata (113 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-11.3.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.3.1-py3-none-any.whl.metadata (5.6 kB)
Collecti

In [6]:
import numpy as np

class LinUCBAgent:
    def __init__(self, d, alpha, lambda_reg):
        self.d = d
        self.alpha = alpha 
        self.lambda_reg = lambda_reg

        #initialize : 
        #   theta_hat with all zeros of 1xd dimension
        #   Gt as lambda_reg*I (dxd)

        self.theta_hat = np.zeros(d)
        self.inv_G = (1.0 / lambda_reg) * np.identity(d)  

        self.S = np.zeros(d)

    def select(self, action_set_t):  
        # action_set_t - K*d
        # P dim : Kx1
        P = action_set_t @ self.theta_hat 
        # B dim : K x K
        uncertainty = np.sqrt(np.sum((action_set_t @ self.inv_G) * action_set_t, axis=1))
        bonus = self.alpha * uncertainty 

        ucb_val = P + bonus

        return np.argmax(ucb_val)  

    def update_all(self, a, reward):
        # a is At - action played at t
        v = self.inv_G @ a  
        # intermediate (d,) vector
        numerator = np.outer(v, v)
        denominator = 1.0 + np.dot(a, v) 
        
        # Sherman-Morrison Update
        self.inv_G -= numerator / denominator
        
        # Update S and theta_hat
        self.S += reward * a
        self.theta_hat = self.inv_G @ self.S


In [None]:
def test_linucb_simulation(d=3, K=4, T=500, alpha=1.0, lambda_reg=1.0, noise_std=0.1, seed=42):
    
    np.random.seed(seed)
    
    # true theta (unknown to the agent)
    true_theta = np.array([0.5, -0.3, 0.8])[:d]  # Use first d elements
    
    agent = LinUCBAgent(d=d, alpha=alpha, lambda_reg=lambda_reg)
    
    # Generate action set 
    action_set = np.random.randn(K, d)
    # Normalize feature vectors
    action_set = action_set / np.linalg.norm(action_set, axis=1, keepdims=True)
    
    # Storage for results
    rewards = []
    selected_arms = []
    cumulative_regret = []
    total_regret = 0.0
    
    print(f"Running LinUCB simulation with {K} arms, {T} time steps...")
    print(f"True theta: {true_theta}\n")
    
    for t in range(T):
        # Select arm
        arm_idx = agent.select(action_set)
        selected_arms.append(arm_idx)
        
        # Get feature vector of selected arm
        arm_features = action_set[arm_idx]
        
        # Compute true expected reward
        true_reward = np.dot(arm_features, true_theta)
        
        # Add noise
        observed_reward = true_reward + np.random.normal(0, noise_std)
        rewards.append(observed_reward)
        
        # Find best arm with true theta
        best_reward = max([np.dot(action_set[i], true_theta) for i in range(K)])
        regret_t = best_reward - observed_reward
        total_regret += regret_t
        cumulative_regret.append(total_regret)
        
        # Update agent
        agent.update_all(arm_features, observed_reward)
    
    # Print results
    print(f"Total regret: {total_regret:.2f}")
    print(f"Average reward: {np.mean(rewards):.4f}")
    print(f"Final theta_hat estimate: {agent.theta_hat}")
    print(f"True theta: {true_theta}")
    

In [19]:
# Run the test simulation
results = test_linucb_simulation(d=3, K=4, T=500, alpha=1.0, lambda_reg=1.0)

Running LinUCB simulation with 4 arms, 500 time steps...
True theta: [ 0.5 -0.3  0.8]

Total regret: 1.25
Average reward: 0.9735
Final theta_hat estimate: [ 0.55989419 -0.16019796  0.78092025]
True theta: [ 0.5 -0.3  0.8]
