## Project Brief: 
E-commerce Description Optimizer. You will simulate an e-commerce website where an LLM generates product descriptions for users, and the goal is to maximize conversion (purchase) while respecting cost. Three different LLMs (of varying cost and quality) are available. Users have different preferences. You'll build a contextual bandit agent to route and prompt the LLMs optimally.

In [9]:
# Per-call cost of each of the three LLMs. The same values are hard-coded as
# EcommerceEnv.costs, where they are labelled cheap (A), moderate (B) and
# expensive (C). Nothing in the visible cells reads this list.
cost_llms = [0.01, 0.02, 0.1]


#hidden reward function 

import numpy as np
import random

class EcommerceEnv:
    """Simulated storefront whose three LLMs act as contextual-bandit arms.

    A context is a length-5 0/1 vector: a 2-way one-hot user persona followed
    by a 3-way one-hot product category. All randomness comes from the global
    ``random`` and ``np.random`` generators, so seed both for reproducible runs.
    """

    def __init__(self):
        self.n_actions = 3
        self.context_dim = 5
        # Per-call cost indexed by action: cheap (A), moderate (B), expensive (C).
        self.costs = [0.01, 0.02, 0.1]

    def generate_user_context(self):
        """Draw a random persona and product category as one length-5 array."""
        # Persona is drawn before category; keep this order so that seeded
        # runs reproduce the same sequence of contexts.
        # presumably [1, 0] = budget seeker and [0, 1] = quality seeker -- confirm
        persona = random.choice([[1, 0], [0, 1]])
        category = random.choice([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
        return np.array([*persona, *category])

    def get_reward(self, context, action):
        """Sample the outcome of serving ``action`` to a user with ``context``.

        Purchase probability:
          * action 0: 0.15 if ``context[0] == 1`` else 0.05
          * action 2: 0.15 if ``context[2] == 1`` else 0.05
          * any other action: 0.10

        Returns:
            ``(actual_buy, proxy_score)`` -- a single Bernoulli purchase draw
            (1 = buy, 0 = no buy) and an "LLM judge" proxy: the purchase
            probability plus N(0, 0.02) noise, clipped to [0, 1].
        """
        # NOTE(review): action 2 is boosted by context[2], which is the first
        # *category* slot, not the second persona slot context[1]. Confirm this
        # is the intended hidden reward before relying on it.
        if action == 0:
            trigger = 0
        elif action == 2:
            trigger = 2
        else:
            trigger = None

        if trigger is None:
            p_buy = 0.1
        elif context[trigger] == 1:
            p_buy = 0.15
        else:
            p_buy = 0.05

        # Conversion is drawn first, then the judge noise (order matters for
        # reproducibility under a fixed seed).
        actual_buy = np.random.binomial(1, p_buy)
        noise = np.random.normal(0, 0.02)
        proxy_score = np.clip(p_buy + noise, 0, 1)

        return actual_buy, proxy_score


In [None]:
# Uniform-random logging policy.
#
# Every interaction is stored in `logged_data` together with the probability
# the logging policy assigned to the chosen action, so the data can be reused
# later (e.g. for off-policy evaluation). Previously the loop discarded its
# results.

num_interactions = 1000
env = EcommerceEnv()
d = env.context_dim  # Dimension of context

logged_data = []
# Each action is drawn uniformly, so its propensity is constant.
uniform_propensity = 1.0 / env.n_actions

for i in range(num_interactions):
    context = env.generate_user_context()
    assert context.shape == (d,)

    action = np.random.randint(0, env.n_actions)
    actual_reward, proxy_reward = env.get_reward(context, action)

    logged_data.append(
        {
            "step": i,
            "context": context,
            "action": action,
            "propensity": uniform_propensity,
            "actual_reward": actual_reward,
            "proxy_reward": proxy_reward,
        }
    )
    


In [None]:
import numpy as np

# Disjoint LinUCB simulation: one ridge-regression model per arm, learning
# online from the proxy (LLM judge) score while business metrics are tracked
# with the real conversion outcome and the per-call model cost.
#
# NOTE: `env` and `d` are created in the logging-policy cell; run it first.

# --- Setup ---
lam = 0.1   # Regularization (λ): each A starts as λ·I
alpha = 1.0 # Exploration weight on the UCB bonus (scaled from your 'beta')
k = 3       # Number of actions (Models A, B, C)
num_interactions = 1000
# Weight of the model cost in the learning signal. 0.0 keeps the bandit
# learning from the proxy score alone; 1.0 makes it learn proxy profit
# (proxy conversion value at $1.00 minus cost), matching the profit metric.
cost_weight = 0.0

# --- Initialize LinUCB Disjoint Matrices ---
# Each arm needs its own A (regularized design matrix) and b (reward vector)
A = [np.identity(d) * lam for _ in range(k)]
b = [np.zeros(d) for _ in range(k)]

# Metrics tracking
total_conversions = 0
total_profit = 0
history = []  # one record per interaction, appended inside the loop

print(f"🚀 Starting LinUCB simulation for {num_interactions} interactions...")

for i in range(num_interactions):
    # 1. Observe Context (x_t)
    context = env.generate_user_context()
    assert context.shape == (d,)

    ucbs = []
    for a in range(k):
        # 2. Calculate theta_hat and UCB for each arm
        A_inv = np.linalg.inv(A[a])
        theta_hat = A_inv @ b[a]

        # Expected reward + exploration bonus
        expected_reward = context @ theta_hat
        uncertainty = alpha * np.sqrt(context @ A_inv @ context)

        ucbs.append(expected_reward + uncertainty)

    # 3. Choose Action (argmax UCB; ties resolve to the lowest arm index)
    chosen_action = int(np.argmax(ucbs))

    # 4. Observe Outcomes
    # actual_reward = conversion (0 or 1), proxy = LLM Judge score (0.0 to 1.0)
    actual_reward, proxy_reward = env.get_reward(context, chosen_action)
    cost = env.costs[chosen_action]

    # 5. Update the selected arm using the PROXY reward
    # We use proxy_reward here to simulate real-time learning; with the
    # default cost_weight of 0.0 the signal is exactly proxy_reward.
    learning_signal = proxy_reward - cost_weight * cost
    A[chosen_action] += np.outer(context, context)
    b[chosen_action] += learning_signal * context

    # 6. Track Business Metrics (using actual conversion and costs)
    profit = (actual_reward * 1.0) - cost # Assume $1.00 revenue per conversion

    total_conversions += actual_reward
    total_profit += profit
    history.append(
        {
            "step": i,
            "action": chosen_action,
            "conversion": int(actual_reward),
            "proxy_reward": float(proxy_reward),
            "cost": cost,
            "profit": float(profit),
        }
    )

# --- Final Results ---
print("-" * 30)
print("Simulation Complete.")
print(f"Total Conversions: {total_conversions}")
print(f"Total Profit:      ${total_profit:.2f}")
print(f"Conversion Rate:   {(total_conversions/num_interactions)*100:.2f}%")

🚀 Starting LinUCB simulation for 1000 interactions...
------------------------------
Simulation Complete.
Total Conversions: 103
Total Profit:      $63.93
Conversion Rate:   10.30%


In [None]:
# OPE: off-policy evaluation (not implemented yet)

