# Learning to Be Cautious

- This Notebook includes:

1- Loading datasets (MNIST, Fashion-MNIST, and E-MNIST) and converting them to a multi-armed bandit setting (convert labels $\to$ arms or actions)

2- Train a Deep Ensemble to act as a reward distribution that captures the epistemic uncertainty (train a number of Neural Networks on the same training data but with different initializations).

3- Approximate Percentile Optimization with the k-of-n game to get a robust policy.

4- Show the behavior of different robust policies on training and out-of-distribution data and compare it with normal RL.

## 1- Loading datasets and converting them to a multi-armed bandit setting

In [None]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from IPython import display
from torchvision import transforms as transforms

### Load and transform data

In [None]:
# Every image is passed through ToTensor() when it is read from a dataset.
transform = transforms.ToTensor()

# MNIST training split, stored under ./datasets (downloaded if missing).
mnist_train = torchvision.datasets.MNIST(
    root='datasets',
    train=True,
    transform=transform,
    download=True,
)

### Convert it to a multi-armed bandit setting

- This scenario represents a situation where the agent has the normal arms [0-9] plus one new arm that can be thought of as a help arm. We don't tell the agent that the new arm is a help arm; the agent should learn that by itself. We expect the cautious policy to choose the right arm when it is faced with MNIST images and to choose the new arm (the help arm) for any image that looks different from MNIST images.


$$
R(\text{arm})= \begin{cases}
    1, & \text{if arm = true label}\\
    0.25, & \text{if arm = help arm}\\
    0, & \text{otherwise}.
\end{cases}
$$

In [None]:
def action_to_reward(labels, n_arms=11, last_action_value=0.25):
    '''
    Convert label(s) into per-arm reward vectors: a one-hot encoding of the
    label over the first n_arms-1 arms, plus a fixed reward for the last arm.

    Args:
    labels: (int or 1-D tensor/sequence of ints) one label or a batch of labels;
            a label outside [0, n_arms-2] gives 0 reward on every label arm
    n_arms: (int) number of arms =11 arms [0-9] + last_arm
    last_action_value: (float) fixed reward of the last arm
    return:
    rewards: (tensor) float rewards of size (n_samples, n_arms);
             a single int label gives size (1, n_arms)
    '''
    # One label per row, so the comparison below broadcasts each label
    # against the indices of the label arms.
    labels = torch.as_tensor(labels).reshape(-1, 1)
    # Start with every arm at the last arm's reward, then overwrite all but
    # the last column with the one-hot label encoding.
    rewards = last_action_value * torch.ones((labels.shape[0], n_arms))
    rewards[:, :-1] = (labels == torch.arange(n_arms - 1)).float()
    return rewards
# Pack each MNIST training example into one row of a dense array:
# columns [0, 784) hold the flattened image, columns [784, 795) its 11 arm rewards.
mnist_training_set = np.zeros((len(mnist_train), 795))
for i, (img, label) in enumerate(mnist_train):
    row = mnist_training_set[i]  # a view, so writes land in mnist_training_set
    row[:784] = img.view(-1).numpy()
    row[784:] = action_to_reward(label).numpy()

## 2- Train a Deep Ensemble to act as a reward distribution that captures the epistemic uncertainty

In [None]:
# Neural Network hyper-parameters
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook also runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): when sampling without replacement, the k-of-n hyper-parameter
# check later in this notebook requires n_models >= max(ns) * n_itr.
n_models = 1 # number of models in the deep ensemble
n_epochs = 100 # number of epochs each model is trained for
batch_size = 512 # batch size
learning_rate = 1.6e-3 # learning rate
l2 = 0.0 # L2 regularization (passed to Adam as weight_decay)
loss_fun = nn.MSELoss() # loss function: Mean Square Error
training_loss = np.zeros((n_models, n_epochs)) # mean training loss per (model, epoch)
models = [] # list of all trained models in the deep ensemble
models_saving_dir = "models/last_action" # directory where each model and the training loss are saved

### Train neural networks

In [None]:
# Train n_models networks one after another; each starts from a freshly
# constructed (newly initialised) model and sees the same training array.

# torch.save / np.save do not create missing directories, so make sure the
# output directory exists before any training time is spent.
os.makedirs(models_saving_dir, exist_ok=True)

for m in range(n_models):
    # Small CNN mapping a 1x28x28 image to 11 outputs, one predicted reward per arm.
    model = nn.Sequential(
        nn.Conv2d(1, 64, (4, 4)),
        nn.MaxPool2d((2, 2)),
        nn.ReLU(),

        nn.Conv2d(64, 16, (4, 4)),
        nn.MaxPool2d((2, 2)),
        nn.ReLU(),

        # For 28x28 inputs the feature map here is 16 channels x 4 x 4 = 256 values.
        nn.Flatten(),

        nn.Linear(256, 50),
        nn.ReLU(),

        nn.Linear(50, 15),
        nn.ReLU(),

        nn.Linear(15, 11),
    ).to(device)
    opt = torch.optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=l2)

    for ep in range(n_epochs):
        # Shuffle the rows in place so every epoch sees the batches in a new order.
        np.random.shuffle(mnist_training_set)
        epoch_loss = 0
        n_batches = 0
        for batch in range(0, mnist_training_set.shape[0], batch_size):
            n_batches += 1
            rows = mnist_training_set[batch:batch + batch_size]
            # Columns [0, 784) are the pixels, the remaining columns the arm rewards.
            x = torch.tensor(rows[:, :784], device=device, dtype=torch.float)
            x = x.view(x.shape[0], 1, 28, 28)
            y = torch.tensor(rows[:, 784:], device=device, dtype=torch.float)
            loss = loss_fun(model(x), y)

            epoch_loss += loss.item()

            opt.zero_grad()
            loss.backward()
            opt.step()

        mean_loss = epoch_loss / n_batches
        print("[model]: %i , [EPOCH]: %i, [training LOSS]: %.5f" % (m, ep+1, mean_loss))
        # wait=True delays clearing until new output arrives, so the last
        # progress line stays visible.
        display.clear_output(wait=True)
        training_loss[m, ep] = mean_loss

    torch.save(model, "{}/ensemble_model_{}".format(models_saving_dir, m))
    models.append(model)
np.save("{}/training_loss".format(models_saving_dir), training_loss)

## 3- Construct robust policies with k-of-n game

In [None]:
# load k-of-n functions from k_of_n script
from k_of_n import sample_rewards_from_ensemble, sort_and_k_least, run_k_of_n

In [None]:
# ---- k-of-n game configuration ----

# "TEST" plays k-of-n on the full dataset; "test" plays it sample by sample.
method = "TEST"

# Policies are described by position: the i-th policy is ks[i]-of-ns[i].
ks = [10,5,1]
ns = [10, 10,10]

# Number of k-of-n iterations, and how many times each policy is repeated.
n_itr, n_runs = 10, 1

# Batch size used by the k-of-n game.
batch_size = 1024

# GPU setting (original note: "number of gpus -1 if cpu").
gpu = 0

# Number of actions (arms).
n_actions = 11

# Set to True to sample reward functions from the reward distribution with replacement.
replacment = False

# Directory where the k-of-n policies are saved.
output_policies_dir = "models/actions/{}/last_action".format(method)

In [None]:
# Validate the k-of-n hyper-parameters before starting the game.

# Every k needs a matching n.
if len(ks) != len(ns):
    raise ValueError ("ks's length is not equal ns's length")

# Without replacement, the ensemble must hold at least max(ns) * n_itr models.
if not replacment and (max(ns) * n_itr) > n_models:
    raise ValueError ("without replacment requreies more models")

# Each policy must satisfy k <= n.
for k_value, n_value in zip(ks, ns):
    if n_value < k_value:
        raise ValueError ("n value={} should be greater than or equal to k value={}".format(n_value, k_value))

### Run k-of-n game

In [None]:
# Play the k-of-n game on each dataset in turn with identical settings:
# MNIST first, then Fashion-MNIST ("MNIST-Fashion") and finally E-MNIST.
for dataset in ("MNIST", "MNIST-Fashion", "E-MNIST"):
    run_k_of_n(
        ks, ns, n_runs, n_itr, method, n_models, batch_size,
        models_saving_dir, output_policies_dir, device, dataset, n_actions,
    )

## 4- Show the behavior of different robust policies

### Mean and std of probabilities of correct arms and help arm of k-of-n policies for MNIST, Fashion-MNIST and E-MNIST

In [None]:
# Sub-directory of "models/actions/" that holds the saved policies. It mirrors
# the "<method>/last_action" directory configured as output_policies_dir for
# run_k_of_n. dir_path was referenced below but never assigned.
# NOTE(review): confirm this is where run_k_of_n writes its .npy files.
dir_path = "{}/last_action".format(method)

# True MNIST test labels for the accuracy computation. testing_set_y was
# referenced below but never assigned.
# NOTE(review): assumes the saved MNIST policies follow the order of the
# torchvision MNIST test split (10000 examples) - confirm against k_of_n.
testing_set_y = torchvision.datasets.MNIST(root='datasets', train=False, download=True).targets

# Saved policies: one probability per arm, for every run, policy and test example.
mnist_arg = np.zeros((n_runs,len(ks), 10000, 11))
emnist_arg = np.zeros((n_runs,len(ks), 20800, 11))
fashion_arg = np.zeros((n_runs,len(ks), 10000, 11))

# Per-policy summaries, stored as percentages.
mnist_m = np.zeros(len(ks))      # mean probability of the last (help) arm on MNIST
emnist_m = np.zeros(len(ks))     # ... on E-MNIST
fashion_m = np.zeros(len(ks))    # ... on Fashion-MNIST
mnist_acc_m = np.zeros(len(ks))  # mean MNIST accuracy of the arg-max arm

for i in range(n_runs):
    for j in range(len(ks)):
        mnist_arg[i,j] = np.load("models/actions/{}/run_{}_mnist_actions_{}-of-{}_n_itr_{}.npy".format(dir_path, i, ks[j], ns[j], n_itr))
        emnist_arg[i,j] = np.load("models/actions/{}/run_{}_emnist_actions_{}-of-{}_n_itr_{}.npy".format(dir_path, i, ks[j], ns[j], n_itr))
        fashion_arg[i,j] = np.load("models/actions/{}/run_{}_fashion_actions_{}-of-{}_n_itr_{}.npy".format(dir_path, i, ks[j], ns[j], n_itr))

true_labels = testing_set_y.numpy()
for j in range(len(ks)):
    # Column -1 is the last (help) arm; average over examples and runs.
    mnist_m[j] = 100*np.round(np.mean(mnist_arg[:, j, :, -1]), 4)
    emnist_m[j] = 100*np.round(np.mean(emnist_arg[:, j, :, -1]), 4)
    fashion_m[j] = 100*np.round(np.mean(fashion_arg[:, j, :, -1]), 4)
    # Accuracy per run: fraction of examples whose most probable arm equals the label.
    acc = np.zeros(n_runs)
    for i in range(n_runs):
        acc[i] = np.mean(np.argmax(mnist_arg[i, j], axis=1) == true_labels)
    mnist_acc_m[j] = 100*np.round(np.mean(acc, axis=0), 4)

In [None]:
# The summaries are already scaled to percentages (multiplied by 100) where
# they are computed, so they are printed as-is; multiplying by 100 again here
# would inflate every number by a factor of 100.
print("MNIST accuracy", mnist_acc_m)
print("MNIST P(help)", mnist_m)
print("Fashion-MNIST P(help)", fashion_m)
print("E-MNIST P(help)", emnist_m)

### heat-map for E-MNIST

In [None]:
# E-MNIST "letters" test split. `transform` must be passed by keyword: a
# positional argument after keyword arguments is a syntax error.
emnist_test = torchvision.datasets.EMNIST(
    root="datasets",
    split="letters",
    train=False,
    download=True,
    transform=transform,
    target_transform=None,
)

# Label of every test example, in dataset order. Reading the dataset's label
# tensor avoids decoding and transforming every image just to get its label;
# with target_transform=None these are the same values emnist_test[i][1] returns.
emnist_labels = emnist_test.targets.to(torch.int)

# Display names for the 26 letter classes.
emnist_labels_names = list("abcdefghijklmnopqrstuvwxyz")

### load saved policies

In [None]:
# Directory (relative to "models/actions/") holding the saved k-of-n policies
# used for the heat-maps.
# NOTE(review): this differs from the "<method>/last_action" directory that
# run_k_of_n was configured with earlier in the notebook - confirm that it is
# the intended source.
output_policies_dir = "TEST/risk_reward_balanced"
# The load path below (and the commented-out savefig path in the plotting cell)
# refer to dir_path, which was never assigned; point it at the directory above.
dir_path = output_policies_dir

# One probability per arm for every run, policy and E-MNIST test example.
emnist_policies = np.zeros((n_runs,len(ks), len(emnist_test), n_actions))

for i in range(n_runs):
    for j in range(len(ks)):
        emnist_policies[i,j] = np.load("models/actions/{}/run_{}_emnist_actions_{}-of-{}_n_itr_{}.npy".format(dir_path, i, ks[j], ns[j], n_itr))

In [None]:
# For every policy and every letter: the mean probability of each arm,
# averaged over that letter's examples and then over runs.
map_emnist_m = np.zeros((len(ks), len(emnist_labels_names), n_actions))

emnist_labels_np = emnist_labels.numpy()  # converted once, outside the loops
for k in range(len(ks)):
    for i in range(len(emnist_labels_names)):
        # The i-th letter name is matched against label value i+1.
        letter_rows = np.where(emnist_labels_np == (i+1))[0]
        map_emnist_m[k, i] = np.mean(np.mean(emnist_policies[:, k, letter_rows], 1), 0)

# Tick labels for the arms: "0", "1", ... for the label arms and "help" for the
# last arm. Built for any n_actions so the plotting cell never meets an
# undefined name; for n_actions == 11 this is "0".."9" plus "help".
labelss = [str(arm) for arm in range(n_actions - 1)] + ["help"]

### plot heat-map figures for each policy

In [None]:
# Draw one annotated heat-map per k-of-n policy: rows are E-MNIST letters,
# columns are arms, and each cell shows the mean arm probability rounded to 2 decimals.
# NOTE(review): plt and sns are used here but no matplotlib / seaborn import is
# visible in this notebook - confirm they are imported before running this cell.
letter_tick_positions = np.arange(len(emnist_labels_names))+0.5
action_tick_positions = np.arange(n_actions)+0.5

for i in range (len(ks)):
    plt.figure(figsize=(10,10))
    frame1 = plt.gca()
    rounded_probabilities = np.round(map_emnist_m[i], 2)
    sns.heatmap(rounded_probabilities, annot=True,  linewidths=.5,  cmap="YlGnBu" , vmin=0, vmax=1)
    # The +0.5 offsets place the tick labels at the centre of each cell.
    plt.xlabel("Actions")
    plt.xticks(action_tick_positions, labelss)
    plt.ylabel("Letter")
    plt.yticks(letter_tick_positions, emnist_labels_names, rotation='horizontal')
    plt.tight_layout()
#     plt.savefig("fig/{}/emnist_{}-of-{}_prob_mean_heat_map_T_{}.pdf".format(dir_path, ks[i], ns[i], n_itr), dpi=300)