# OpenAI Gym

## Imports

In [1]:
import gym
import numpy as np
from model import DQN
from gym.wrappers import AtariPreprocessing, FrameStack
import matplotlib.pyplot as plt
from tqdm import tqdm

  f"Custom namespace `{spec.namespace}` is being overridden "
  import imp


## Parameters

## Initiate Environment

In [2]:
env = gym.make("MsPacmanNoFrameskip-v4", full_action_space=False)
env = AtariPreprocessing(env, noop_max=30)
env = FrameStack(env, num_stack=4)
env.reset()

<gym.wrappers.frame_stack.LazyFrames at 0x2072ae10778>

## Environment Details

In [3]:
n_actions = env.action_space.n
actions_meanings = env.env.get_action_meanings()
state_dim = env.observation_space.shape
print(f"Number of actions: {n_actions}")
print(f"Action meanings: {actions_meanings}")
print(f"State dimensions: {state_dim}")

Number of actions: 9
Action meanings: ['NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'UPRIGHT', 'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT']
State dimensions: (4, 84, 84)


## DQN Model Architecture

In [4]:
model = DQN()
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 20, 20, 32)        8224      
                                                                 
 batch_normalization (BatchN  (None, 20, 20, 32)       128       
 ormalization)                                                   
                                                                 
 activation (Activation)     (None, 20, 20, 32)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 9, 9, 64)          32832     
                                                                 
 batch_normalization_1 (Batc  (None, 9, 9, 64)         256       
 hNormalization)                                                 
                                                                 
 activation_1 (Activation)   (None, 9, 9, 64)          0

## Play Breakout

In [5]:
def plot_frames(frames):
    fig, axs = plt.subplots(1, frames.shape[2])
    for i, ax in enumerate(axs.flat):
        ax.imshow(frames[:,:,i], cmap="gray")
        ax.axis("off")
        ax.set_title(f"frame {i+1}")
    plt.savefig("processed_input.png")

In [6]:
def get_frames(observation):
    observation = observation.__array__().transpose(1,2,0)
    observation = np.expand_dims(observation, axis=0)
    
    return observation

In [7]:
def episode(model, max_step=1000):
    env = gym.make("MsPacmanNoFrameskip-v4", full_action_space=False, difficulty=0)#, render_mode='human')
    env = AtariPreprocessing(env, noop_max=30)
    env = FrameStack(env, num_stack=4)
    frames = get_frames(env.reset())

    episode_reward = 0
    step = 0

    while step < max_step:
        # env.render(mode="rgb_array")
        step += 1

        action = np.argmax(model(frames).numpy())
        # print(model(frames).numpy())
        # print(action)
        frames, reward, done, info = env.step(action)
        frames = get_frames(frames)

        episode_reward += reward
        if done: 
            frames = get_frames(env.reset())

    return episode_reward

    

In [8]:
def get_weights(parents):
    W = [np.log(parents - 0.5) - np.log(i) for i in range(1, parents+1)]
    W /= np.sum(W)
    
    return W

In [9]:
def get_start_parameters(model):
    parameters = model.trainable_weights
    parameters = np.concatenate(parameters, axis=None)
    start_weights = np.random.normal(0, 0.05, parameters.shape)

    return start_weights


In [10]:
def get_model_weights(theta, mut_stepsize, e):
    model = DQN(n_actions=n_actions)
    parameters = model.trainable_weights
    start_idx = 0
    w = theta + mut_stepsize * e

    for p in parameters:
        n = len(p.numpy().flatten()) if len(p.shape) > 1 else len(p.numpy())
        p.assign(w[start_idx:(start_idx+n)].reshape(p.shape))
        start_idx += n
    
    return model

In [11]:
def CES(model, mut_stepsize, parents, n_offspring, iterations):
    theta = get_start_parameters(model)
    W = get_weights(parents)
    best_r = np.zeros((iterations))
    print(theta[:10])
    
    for t in range(iterations):
        print('Iteration: ',t+1)
        e = np.zeros((n_offspring, theta.shape[0]))
        r = np.zeros((n_offspring))

        for i in tqdm(range(n_offspring)):
            e[i] = np.random.normal(0, 1, size=theta.shape)
            new_model = get_model_weights(theta, mut_stepsize, e[i])
            r[i] = episode(new_model)

        best_rs = r.argsort()
        best_r[t] = np.max(r)
        print(f"best reward: {best_r[t]}")
        best_es = e[best_rs][:parents]
        
        theta += mut_stepsize * np.sum([W[i] * best_es[i] for i in range(len(W))], axis=0)
        print(theta[:10])

    return theta, best_r

In [12]:
model = DQN(n_actions=n_actions)
theta, rewards = CES(model, 0.1, 5, 50, 10)

[-0.05205936  0.04070268 -0.0220608   0.08905913  0.04503125  0.07260447
 -0.04710801 -0.07290872 -0.02711515  0.03268756]
Iteration:  1


100%|██████████| 50/50 [26:06<00:00, 31.33s/it]


best reward: 990.0
[-0.02566253  0.1137029   0.10761209  0.1574765   0.00022604  0.03814883
 -0.06759338 -0.01607048 -0.04650887  0.03660903]
Iteration:  2


100%|██████████| 50/50 [15:48<00:00, 18.98s/it]


best reward: 2100.0
[-0.06526319  0.01778081 -0.06980343  0.11568959 -0.00729357  0.01925948
 -0.20770697 -0.08873295  0.01296887 -0.03633232]
Iteration:  3


100%|██████████| 50/50 [06:16<00:00,  7.53s/it]


best reward: 810.0
[-0.07095615  0.07632264 -0.03121932  0.08742649 -0.124931   -0.06529702
 -0.26579492 -0.25498386  0.11274938 -0.08395683]
Iteration:  4


100%|██████████| 50/50 [06:02<00:00,  7.26s/it]


best reward: 910.0
[-0.0026821   0.08951439 -0.07962415  0.16489577 -0.26696471 -0.02695128
 -0.28409693 -0.22147186  0.17435753 -0.15848622]
Iteration:  5


100%|██████████| 50/50 [06:11<00:00,  7.43s/it]


best reward: 900.0
[ 0.07862381  0.12079719 -0.02554231  0.05975577 -0.19666054 -0.11967771
 -0.27316958 -0.30201955  0.19578908 -0.0803045 ]
Iteration:  6


100%|██████████| 50/50 [06:30<00:00,  7.81s/it]


best reward: 720.0
[ 0.07089142  0.04107431 -0.02489426  0.06171787 -0.20741754 -0.12004632
 -0.22222532 -0.29690343  0.12829221 -0.10407775]
Iteration:  7


100%|██████████| 50/50 [06:20<00:00,  7.61s/it]


best reward: 810.0
[ 0.15769772  0.07741152  0.00151499  0.13965578 -0.40896551 -0.10349411
 -0.2911036  -0.35633912  0.20050068 -0.03206126]
Iteration:  8


100%|██████████| 50/50 [06:18<00:00,  7.57s/it]


best reward: 880.0
[ 0.13583175 -0.03198896 -0.12007729  0.11511771 -0.47215069 -0.21351076
 -0.28118361 -0.31055183  0.16207608  0.06658774]
Iteration:  9


100%|██████████| 50/50 [06:19<00:00,  7.59s/it]


best reward: 1160.0
[ 0.05028823  0.00685562  0.09845939  0.08804207 -0.55685729 -0.25314192
 -0.29756117 -0.24171206  0.18645404  0.03453927]
Iteration:  10


100%|██████████| 50/50 [06:16<00:00,  7.53s/it]


best reward: 460.0
[ 0.06501877  0.00083214  0.07168865  0.12378689 -0.58913027 -0.18212103
 -0.31133053 -0.26802314  0.21158104  0.09380176]


In [13]:
print(rewards)

[ 990. 2100.  810.  910.  900.  720.  810.  880. 1160.  460.]


In [14]:
model = get_model_weights(theta, 0, theta)
episode(model)

120.0

In [15]:
env.unwrapped.ale.getAvailableDifficulties()

[0]