# Contextual Policies

Analyze the performance of our Whittle and Adaptive Policies

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import random 
import matplotlib.pyplot as plt
import json 
import argparse 
import sys
import secrets
from itertools import combinations
import gurobipy as gp
from gurobipy import GRB

In [3]:
import sys
sys.path.append('/usr0/home/naveenr/projects/food_rescue_preferences')

In [4]:
from rmab.simulator import run_multi_seed
from rmab.whittle_policies import *
from rmab.baseline_policies import *
from rmab.mcts_policies import *
from rmab.utils import get_save_path, delete_duplicate_results, restrict_resources

In [5]:
is_jupyter = 'ipykernel' in sys.modules

In [6]:
if is_jupyter: 
    seed        = 50
    n_arms      = 4
    volunteers_per_arm = 1
    budget      = 2
    discount    = 0.9
    alpha       = 3 
    n_episodes  = 105
    episode_len = 50 
    n_epochs    = 1
    save_with_date = False 
    lamb = 0.5
    prob_distro = 'uniform'
    reward_type = "linear"
    reward_parameters = {'universe_size': 20, 'arm_set_low': 0, 'arm_set_high': 1}
    out_folder = 'iterative'
    time_limit = 100
    context_dim = n_arms*volunteers_per_arm
else:
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_arms',         '-N', help='num beneficiaries (arms)', type=int, default=2)
    parser.add_argument('--volunteers_per_arm',         '-V', help='volunteers per arm', type=int, default=5)
    parser.add_argument('--episode_len',    '-H', help='episode length', type=int, default=50)
    parser.add_argument('--n_episodes',     '-T', help='num episodes', type=int, default=105)
    parser.add_argument('--budget',         '-B', help='budget', type=int, default=3)
    parser.add_argument('--n_epochs',       '-E', help='number of epochs (num_repeats)', type=int, default=1)
    parser.add_argument('--discount',       '-d', help='discount factor', type=float, default=0.9)
    parser.add_argument('--alpha',          '-a', help='alpha: for conf radius', type=float, default=3)
    parser.add_argument('--lamb',          '-l', help='lambda for matching-engagement tradeoff', type=float, default=0.5)
    parser.add_argument('--universe_size', help='For set cover, total num unvierse elems', type=int, default=10)
    parser.add_argument('--arm_set_low', help='Least size of arm set, for set cover', type=float, default=3)
    parser.add_argument('--arm_set_high', help='Largest size of arm set, for set cover', type=float, default=6)
    parser.add_argument('--reward_type',          '-r', help='Which type of custom reward', type=str, default='set_cover')
    parser.add_argument('--seed',           '-s', help='random seed', type=int, default=42)
    parser.add_argument('--prob_distro',           '-p', help='which prob distro [uniform,uniform_small,uniform_large,normal]', type=str, default='uniform')
    parser.add_argument('--out_folder', help='Which folder to write results to', type=str, default='iterative')
    parser.add_argument('--time_limit', help='Online time limit for computation', type=float, default=100)
    parser.add_argument('--use_date', action='store_true')

    args = parser.parse_args()

    n_arms      = args.n_arms
    volunteers_per_arm = args.volunteers_per_arm
    budget      = args.budget
    discount    = args.discount
    alpha       = args.alpha 
    seed        = args.seed
    n_episodes  = args.n_episodes
    episode_len = args.episode_len
    n_epochs    = args.n_epochs
    lamb = args.lamb
    save_with_date = args.use_date
    prob_distro = args.prob_distro
    out_folder = args.out_folder
    reward_type = args.reward_type
    reward_parameters = {'universe_size': args.universe_size,
                        'arm_set_low': args.arm_set_low, 
                        'arm_set_high': args.arm_set_high}
    time_limit = args.time_limit 
    context_dim = n_arms*volunteers_per_arm

save_name = secrets.token_hex(4)  

In [7]:
results = {}
results['parameters'] = {'seed'      : seed,
        'n_arms'    : n_arms,
        'volunteers_per_arm': volunteers_per_arm, 
        'budget'    : budget,
        'discount'  : discount, 
        'alpha'     : alpha, 
        'n_episodes': n_episodes, 
        'episode_len': episode_len, 
        'n_epochs'  : n_epochs, 
        'lamb': lamb,
        'prob_distro': prob_distro, 
        'reward_type': reward_type, 
        'universe_size': reward_parameters['universe_size'],
        'arm_set_low': reward_parameters['arm_set_low'], 
        'arm_set_high': reward_parameters['arm_set_high'],
        'time_limit': time_limit, 
        'context_dim': context_dim,
        } 

## Non-RMAB Max Problem

In [8]:
N = 10
K = 5
max_values = np.array([np.random.random() for i in range(N)])

In [9]:
upper_bounds = np.array([1 for i in range(N)])
T = 20
best_score = np.max(max_values)*T

scores = []
for i in range(T):
    sorted_indices = np.argsort(upper_bounds)[::-1]
    top_k_indices = sorted_indices[:K]
    value = max(max_values[top_k_indices])
    scores.append(value)
    for j in top_k_indices:
        upper_bounds[j] = min(upper_bounds[j],value)
best_score-np.sum(scores)

6.259550028200387

## Non-RMAB Subset Problem

In [13]:
N = 10
K = 3
context_size = 5
T = 100
hidden_contexts = [[random.randint(0,1) for i in range(context_size)] for _ in range(N)]
hidden_contexts = np.array(hidden_contexts)
arm_rewards = []

upper_bounds = np.array([[1 for i in range(context_size)] for _ in range(N)])

In [14]:
N = len(upper_bounds)
value_list = []
action_list = []
rand_context_list = []

max_values = [] 

for _ in range(T):
    rand_context = [random.randint(0,1) for i in range(context_size)]
    # Create a new model
    model = gp.Model("linear_program")

    # Add variables
    z = model.addVars(len(rand_context), lb=0, ub=1, name="z")
    x = model.addVars(N, lb=0, ub=1, name="x")
    A = model.addVars(N,len(rand_context), lb=0, ub=1, name="A")
    B = model.addVars(N,len(rand_context), lb=0, ub=1, name="B")

    # Set the objective function
    model.setObjective(gp.quicksum(z[i] * rand_context[i] for i in range(len(rand_context))), GRB.MAXIMIZE)

    # Add constraints
    for i in range(len(rand_context)):
        for j in range(N):
            model.addConstr(z[i] >= A[j,i], f"c1_{i}_{j}")
            model.addConstr(A[j,i] <= x[j], f"c2_{i}_{j}")
            model.addConstr(A[j,i] >= 0, f"c3_{i}_{j}")
            model.addConstr(A[j,i] <= B[j,i], f"c4_{i}_{j}")
            model.addConstr(A[j,i] >= B[j,i] + x[j] - 1, f"c5_{i}_{j}")
            model.addConstr(B[j,i] >= 0, f"c6_{i}_{j}")
            model.addConstr(B[j,i] <= 1, f"c7_{i}_{j}")

        model.addConstr(z[i] <= gp.quicksum(A[j,i] for j in range(N)), f"c8_{i}")
    # Add the additional constraint: sum z[i] <= K
    model.addConstr(gp.quicksum(x[i] for i in range(N)) == K, "sum_z_constraint")

    for constraint_num in range(len(value_list)):
        z_hat = model.addVars(len(rand_context), lb=0, ub=1, name="z_hat_{}".format(constraint_num))

        for i in range(len(rand_context)):
            for j in range(N):
                model.addConstr(z_hat[i] >= action_list[constraint_num][j]*B[j,i], f"c1_{i}_{j}")
            model.addConstr(z_hat[i] <= gp.quicksum(action_list[constraint_num][j]*B[j,i] for j in range(N)), f"c8_{i}")
        model.addConstr(gp.quicksum(z_hat[j]*rand_context_list[constraint_num][j] for j in range(len(rand_context))) <= value_list[constraint_num], f"c8_{i}")

    # Optimize the model
    model.optimize()

    if model.status == GRB.OPTIMAL:
        print("Optimal objective value:", model.objVal)
        print("z values:", [z[i].X for i in range(len(rand_context))])
        print("x values:", [x[j].X for j in range(N)])

    action = [int(x[j].X) for j in range(N)]

    max_coverage = np.max(hidden_contexts[np.array(action) == 1],axis=0)
    value = np.sum(max_coverage.dot(rand_context))

    value_list.append(value)
    action_list.append(action)
    rand_context_list.append(rand_context)
    max_values.append(sum(rand_context))

Gurobi Optimizer version 10.0.3 build v10.0.3rc0 (linux64)

CPU model: Intel(R) Core(TM) i7-7700K CPU @ 4.20GHz, instruction set [SSE2|AVX|AVX2]
Thread count: 4 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 356 rows, 115 columns and 665 nonzeros
Model fingerprint: 0xead24a46
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  Objective range  [1e+00, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 3e+00]
Presolve removed 250 rows and 50 columns
Presolve time: 0.00s
Presolved: 106 rows, 65 columns, 265 nonzeros

Iteration    Objective       Primal Inf.    Dual Inf.      Time
       0    4.0000000e+00   6.000000e+00   0.000000e+00      0s
       6    4.0000000e+00   0.000000e+00   0.000000e+00      0s

Solved in 6 iterations and 0.01 seconds (0.00 work units)
Optimal objective  4.000000000e+00
Optimal objective value: 4.0
z values: [1.0, 1.0, 1.0, 0.0, 1.0]
x values: [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Gurobi


CPU model: Intel(R) Core(TM) i7-7700K CPU @ 4.20GHz, instruction set [SSE2|AVX|AVX2]
Thread count: 4 physical cores, 8 logical processors, using up to 8 threads

Optimize a model with 636 rows, 140 columns and 1102 nonzeros
Model fingerprint: 0x7c1435f3
Coefficient statistics:
  Matrix range     [1e+00, 1e+00]
  Objective range  [1e+00, 1e+00]
  Bounds range     [1e+00, 1e+00]
  RHS range        [1e+00, 4e+00]
Presolve removed 329 rows and 0 columns
Presolve time: 0.00s
Presolved: 307 rows, 140 columns, 768 nonzeros

Iteration    Objective       Primal Inf.    Dual Inf.      Time
       0    2.0000000e+00   1.400000e+01   0.000000e+00      0s
      28    2.0000000e+00   0.000000e+00   0.000000e+00      0s

Solved in 28 iterations and 0.01 seconds (0.00 work units)
Optimal objective  2.000000000e+00
Optimal objective value: 2.0
z values: [0.0, 0.0, 1.0, 1.0, 0.0]
x values: [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Gurobi Optimizer version 10.0.3 build v10.0.3rc0 (linux64)

CPU

In [15]:
np.mean(value_list), np.mean(max_values)

(2.36, 2.44)