<a href="https://colab.research.google.com/github/markhalka/Paper_Imlimentations/blob/main/CME.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import numpy as np
import torch.nn.functional as F
import torch
import multiprocessing

"""
A simple cross-entropy method for optimizing a reinforcement learning task
"""

# Cap on the number of environment steps taken in a single episode (see test()).
MAX_STEPS = 100
# Number of episodes averaged when scoring one candidate (see evaluate_agent()).
N_TEST = 2

# Shared environment instance used by every episode rollout in this file.
env = gym.make("FrozenLake-v0", is_slippery=False)
# Sizes of the discrete observation and action spaces reported by the env.
nS = env.observation_space.n
nA = env.action_space.n
# One weight per (state, action) pair, stored flat at index state * nA + action.
weights_dim = nS*nA
# Number of candidate weight vectors in each population.
n_weights = 10

def collect_samples():
    """Draw the initial population of candidate weight vectors.

    Every candidate is a length-``weights_dim`` vector whose entries are
    sampled independently from a normal distribution with mean 0 and
    standard deviation 1.

    Returns:
        A list of ``n_weights`` one-dimensional NumPy arrays.
    """
    loc, scale = 0.0, 1.0
    return [loc + scale * np.random.randn(weights_dim) for _ in range(n_weights)]

def get_best_action(state, weight):
    """Return the index of the highest-weighted action for ``state``.

    Args:
        state: Integer state index.
        weight: Flat weight vector laid out as ``state * nA + action``.

    Returns:
        The action index with the largest weight (first one on ties).
    """
    offset = state * nA
    scores = [weight[offset + a] for a in range(nA)]
    return np.argmax(scores)

def get_action(state, weight):
    """Sample an action for ``state`` from a softmax over its weights.

    Args:
        state: Integer state index.
        weight: Flat weight vector laid out as ``state * nA + action``.

    Returns:
        An action index in ``range(nA)``, drawn with probability
        proportional to ``exp(weight[state * nA + action])``.
    """
    scores = [weight[state * nA + a] for a in range(nA)]
    # float64 keeps the probabilities within np.random.choice's sum-to-1
    # tolerance regardless of the dtype of the incoming weights.
    logits = torch.tensor(scores, dtype=torch.float64)
    # An explicit dim avoids the deprecated implicit-dimension softmax.
    probs = F.softmax(logits, dim=0).numpy()
    return np.random.choice(nA, p=probs)

def test(weight):
    """Run one episode with the stochastic policy and return its total reward.

    The episode starts from ``env.reset()`` and ends when the environment
    reports ``done`` or after ``MAX_STEPS`` steps, whichever comes first.
    The code relies on ``env.reset()`` returning the observation and
    ``env.step()`` returning a 4-tuple.

    Args:
        weight: Flat weight vector passed to ``get_action``.

    Returns:
        The sum of the rewards collected during the episode, as a float.
    """
    state = env.reset()
    done = False
    steps = 0
    total_reward = 0.0
    while not done and steps < MAX_STEPS:
        steps += 1
        action = get_action(state, weight)
        # No reset on termination: the loop exits as soon as done is set and
        # every episode begins with its own env.reset() above.
        state, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward

def evaluate_agent(weight):
    """Score a candidate by its mean episode reward over ``N_TEST`` runs.

    Args:
        weight: Flat weight vector handed to ``test`` for each episode.

    Returns:
        The average of the ``N_TEST`` episode rewards.
    """
    episode_rewards = (test(weight) for _ in range(N_TEST))
    return sum(episode_rewards, 0.0) / N_TEST

def find_elites(rewards, weights_pop, n_elite=4):
    """Select the best-scoring candidates from a population.

    Args:
        rewards: Sequence of scores, one per candidate.
        weights_pop: Sequence of candidates, aligned with ``rewards``.
        n_elite: How many top candidates to keep. Defaults to 4.

    Returns:
        A list of up to ``n_elite`` candidates ordered by ascending reward,
        so the best candidate is last. If fewer candidates exist, all of
        them are returned.

    Raises:
        ValueError: If ``n_elite`` is negative.
    """
    if n_elite < 0:
        raise ValueError("n_elite must be non-negative")
    if n_elite == 0:
        # A slice of [-0:] would select everything instead of nothing.
        return []
    order = np.argsort(np.asarray(rewards))
    return [weights_pop[idx] for idx in order[-n_elite:]]

def fit_distribution(elite_weights):
    """Build the next population from the elite candidates.

    A separate normal distribution is fitted to each weight dimension using
    the elites' mean and standard deviation. The new population consists of
    fresh samples from those distributions followed by the elites themselves,
    which are carried over unchanged in the last rows.

    Args:
        elite_weights: Sequence of elite weight vectors, each of length
            ``weights_dim``.

    Returns:
        A ``(n_weights, weights_dim)`` NumPy array. The first
        ``n_weights - len(elite_weights)`` rows are new samples; the
        remaining rows are the elites in the order given.

    Raises:
        ValueError: If ``elite_weights`` is empty or holds more than
            ``n_weights`` vectors.
    """
    elites = np.asarray(elite_weights, dtype=float)
    n_elite_weights = len(elites)
    if n_elite_weights == 0:
        raise ValueError("fit_distribution needs at least one elite weight vector")
    new_weights = n_weights - n_elite_weights
    if new_weights < 0:
        raise ValueError("more elite weight vectors than the population size")

    mean = elites.mean(axis=0)
    # The constant 0.5 keeps the sampling spread from collapsing to zero
    # when the elites agree on a weight.
    std = elites.std(axis=0) + 0.5

    weights = np.zeros((n_weights, weights_dim))
    weights[:new_weights] = mean + std * np.random.randn(new_weights, weights_dim)
    weights[new_weights:] = elites
    return weights

def pretty_print(weights):
    """Print a weight vector and the greedy action it picks per grid cell.

    The raw weights are printed first, followed by a 4x4 float32 grid in
    which entry ``[row, col]`` is the highest-weighted action for state
    ``row * 4 + col``.

    Args:
        weights: Flat weight vector laid out as ``state * nA + action``,
            covering at least 16 states.
    """
    print(weights)
    greedy = np.ones((4, 4), dtype=np.float32)
    for cell in range(16):
        base = cell * nA
        r, c = divmod(cell, 4)
        greedy[r, c] = np.argmax([weights[base + a] for a in range(nA)])
    print(greedy)


def run(n_iterations=10000, n_workers=10, log_every=100):
    """Optimize the policy weights with the cross-entropy method.

    Each iteration scores the whole population in parallel, keeps the elites
    and refits the sampling distribution to them to produce the next
    population. Progress is printed periodically.

    Args:
        n_iterations: Number of optimization iterations. Defaults to 10000.
        n_workers: Number of worker processes used to score candidates.
            Defaults to 10.
        log_every: Print progress every this many iterations; must be
            positive. Defaults to 100.
    """
    weights = collect_samples()
    with multiprocessing.Pool(n_workers) as pool:
        for i in range(n_iterations):
            rewards = pool.map(evaluate_agent, list(weights))
            if i % log_every == 0:
                print("iter {0}, max_reward: {1}".format(i, np.max(rewards)))
                # Show the policy of the candidate that earned the reported
                # max reward, not an arbitrary member of the population.
                pretty_print(weights[int(np.argmax(rewards))])

            elites = find_elites(rewards, weights)
            weights = fit_distribution(elites)

# Only start training when executed as the main program. Worker processes
# created with the "spawn" start method re-import this module, and an
# unguarded call would make each of them start its own pool.
if __name__ == "__main__":
    run()