In [1]:
import numpy as np
from MABModel import MABModel

In [2]:
import math

In [3]:
def exp3(n_arms, T, gamma = 0.2):
    """Play ``T`` rounds of Exp3 against a fresh ``MABModel`` and return the total reward.

    Parameters
    ----------
    n_arms : int
        Number of arms; also passed to ``MABModel`` to build the environment.
    T : int
        Number of rounds to play.
    gamma : float, optional
        Exploration rate handed to ``computeP``; it also scales the exponent
        of the weight update.

    Returns
    -------
    float
        Sum of the rewards returned by ``MABModel.getArm`` over the ``T`` rounds
        (``0.0`` when ``T`` is 0).
    """
    ws = [1.0] * n_arms

    ## instantiate model
    x = MABModel(n_arms)
    cum_reward = 0.0

    for _ in range(T):
        p = computeP(gamma, ws, n_arms)
        next_arm = np.random.choice(n_arms, 1, p=p)[0]
        reward = x.getArm(next_arm)

        # Importance-weighted reward estimate: only the played arm is observed,
        # so its reward is divided by the probability of having played it.
        estimated_reward = reward/p[next_arm]
        ws[next_arm] *= math.exp(gamma*estimated_reward/n_arms)

        # Rescale so the largest weight is 1. computeP only uses the ratios
        # w / sum(w), so the sampling distribution is unchanged, but the weights
        # can no longer grow to inf (which would turn p into nan and make
        # np.random.choice raise) on long horizons.
        peak = max(ws)
        ws = [w/peak for w in ws]

        cum_reward += reward

    return cum_reward

def computeP(gamma, weights, n_arms):
    """Turn arm weights into sampling probabilities.

    Each probability is a mixture of the arm's share of the total weight
    (scaled by ``1 - gamma``) and a uniform term ``gamma / n_arms``.

    Parameters
    ----------
    gamma : float
        Mixing coefficient given to the uniform term.
    weights : sequence of float
        Current weight of each arm.
    n_arms : int
        Divisor of the uniform term.

    Returns
    -------
    list
        One probability per entry of ``weights``, in the same order.
    """
    weight_sum = np.sum(weights)

    def mixed(w):
        # weight-proportional part first, then the uniform part
        exploit = ((1-gamma)*w)/(weight_sum)
        explore = gamma/float(n_arms)
        return exploit + explore

    return list(map(mixed, weights))

In [37]:
# Scratch check of np.sqrt (which exp3P calls); nothing later depends on this cell.
np.sqrt(4)

2.0

In [33]:
def exp3P(n_arms, T, gamma = 0.2, beta = 0.2):
    """Play ``T`` rounds of an Exp3.P-style strategy and return the total reward.

    Differs from ``exp3`` in two ways: the reward estimate is biased upwards by
    ``beta``, and the exponent of the weight update carries an extra bonus
    ``alpha / (p * sqrt(n_arms * T))`` for the played arm.

    NOTE(review): the Exp3.P pseudocode in Auer et al. (2002) appears to apply
    the bonus term to every arm each round and to start from non-unit weights;
    here only the played arm is updated and weights start at 1.0 — confirm
    which variant is intended.

    Parameters
    ----------
    n_arms : int
        Number of arms; also passed to ``MABModel`` to build the environment.
    T : int
        Number of rounds to play; also used inside the bonus term.
    gamma : float, optional
        Exploration rate handed to ``computeP``; also scales the update exponent.
    beta : float, optional
        Constant added to the observed reward before importance weighting.

    Returns
    -------
    float
        Sum of the rewards returned by ``MABModel.getArm`` over the ``T`` rounds
        (``0.0`` when ``T`` is 0).
    """
    # The weights must exist before the first computeP call; the original
    # initialisation was commented out, leaving `ws` undefined.
    ws = [1.0] * n_arms
    alpha = 0.5

    ## instantiate model
    x = MABModel(n_arms)
    cum_reward = 0.0

    for _ in range(T):
        p = computeP(gamma, ws, n_arms)
        next_arm = np.random.choice(n_arms, 1, p=p)[0]
        reward = x.getArm(next_arm)

        estimated_reward = (reward+beta)/p[next_arm]

        exp_term = (gamma/(3*n_arms))*(estimated_reward + (alpha/(p[next_arm]*np.sqrt(n_arms*T))))
        # The original call was math.pow(math.e, ) with the exponent missing.
        ws[next_arm] *= math.exp(exp_term)

        # Rescale so the largest weight is 1: computeP only uses w / sum(w), so
        # probabilities are unchanged while the weights stay within float range.
        peak = max(ws)
        ws = [w/peak for w in ws]

        cum_reward += reward

    return cum_reward

In [7]:
def Sexp3(n_arms, T, gamma = 0.2):
    """Contextual Exp3 where the context is the arm played in the previous round.

    One weight vector is kept per context; each round samples from, and
    updates, the vector belonging to the previously played arm. The first
    context is drawn uniformly at random.

    Parameters
    ----------
    n_arms : int
        Number of arms (and therefore of contexts); also passed to ``MABModel``.
    T : int
        Number of rounds to play.
    gamma : float, optional
        Exploration rate handed to ``computeP``; also scales the update exponent.

    Returns
    -------
    tuple
        ``(cum_reward, context_ws, arms)``: the total reward, the final
        per-context weight vectors, and the sequence of arms played.
    """
    # a context is just a previous arm: one independent weight row per context
    context_ws = [[1.0] * n_arms for _ in range(n_arms)]

    ## instantiate model
    model = MABModel(n_arms)
    cum_reward = 0.0
    prev_arm = np.random.randint(0, n_arms)

    arms = []

    for _ in range(T):
        # weights of the current context (alias: updates land in context_ws)
        active_ws = context_ws[prev_arm]

        p = computeP(gamma, active_ws, n_arms)
        next_arm = np.random.choice(n_arms, 1, p=p)[0]
        reward = model.getArm(next_arm)

        estimated_reward = reward/p[next_arm]

        # NOTE: weights are never rescaled, so they grow without bound over
        # very long horizons.
        active_ws[next_arm] *= math.pow(math.e, gamma*estimated_reward/n_arms)

        cum_reward += reward
        arms.append(next_arm)

        # the arm just played becomes the next round's context
        prev_arm = next_arm

    return (cum_reward, context_ws, arms)

In [30]:
# Exp3 sweep: for each exploration rate, print the cumulative reward of
# exp3 averaged over `num_samples` independent runs of `T` rounds.
num_samples, n_arms, T = 25, 5, 5000

for gamma in (0.01, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50):
    total_rew = 0.0
    for i in range(num_samples):
        total_rew += exp3(n_arms, T, gamma)

    print(total_rew/num_samples)

2923.78
3502.2
3250.86
3267.68
3326.22
3045.26
2973.66


In [31]:
# Sexp3 sweep: for each exploration rate, print the cumulative reward of
# Sexp3 averaged over `num_samples` independent runs of `T` rounds.
# `context_ws` and `arms` keep the values from the last run.
num_samples, n_arms, T = 25, 5, 5000

for gamma in (0.01, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50):
    total_rew = 0.0
    for i in range(num_samples):
        rew, context_ws, arms = Sexp3(n_arms, T, gamma)
        total_rew += rew

    print(total_rew/num_samples)

2660.96
3198.1
3369.98
3268.96
3159.2
3057.68
3082.82


In [36]:
# exp3P sweep: for each exploration rate, print the cumulative reward of
# exp3P (default beta) averaged over `num_samples` independent runs of `T` rounds.
num_samples, n_arms, T = 25, 5, 5000

for gamma in (0.01, 0.05, 0.10, 0.20, 0.30, 0.40, 0.50):
    total_rew = 0.0
    for i in range(num_samples):
        total_rew += exp3P(n_arms, T, gamma)

    print(total_rew/num_samples)

2818.32
3493.28
3374.52
3437.2
3097.64
3026.74
2968.9
