# Training an Agent to play Super Mario
In this exercise you are going to train a Q-Learning agent on the ```gym-marioai``` domain.  
gym_marioai provides a python interface to interact with the MarioAI engine in a comfortable way. The engine itself is implemented in java, and the ```.jar``` of the engine needs to be started separately.  

### Installation
Requirements: Java 8 runtime environment, python 3.?  
You will be provided with both the .jar and the gym-marioai python package.

In [None]:
# install the gym-environment
# navigate to the source folder, then run:
# pip install ./path/to/gym-marioai

In [36]:
!python --version

Python 3.9.1


In [3]:
import gym
import gym_marioai
import numpy as np
from random import random

### Running the MarioAI server:
navigate to the folder containing ```marioai-server.jar```, then run the following:  
```java -jar ./marioai-server.jar```

### python-client demo setup:
make sure the demo is running...

In [4]:
# initialize the environment
env = gym.make('Marioai-v0', render=True, level_path=gym_marioai.levels.cliff_level)

# run random episodes
for episode in range(2):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        total_reward += reward

    print(f'finished episode {episode}, total_reward: {total_reward}')

print('finished demo')
env.teardown()

KeyboardInterrupt: 

## Representation of the Q-table
You will experience some of the shortcomings of tabular reinforcement learning methods. With marioai, the observation space will be very large, resulting in
- longer training duration (no interpolation of the policy between similar observations, each state needs to be explored separately)
- large amount of memory required to store the Q-table, if implemented naively

However, we can assume that only a subset of the observation space will be visited.  

Task: Implement a representation of the Q-table that stores observations 'on-demand'.  
Optional: Think of a way to store and reuse the trained model.


In [4]:
class QTable:
    """
    data structure to store the Q function for hashable state representations
    """
    def __init__(self, n_actions, initial_capacity=100):
        self.capacity = initial_capacity
        self.num_states = 0
        self.state_index_map = {}
        self.table = np.zeros([initial_capacity, n_actions])

    def __getitem__(self, state):
        """ access state directly using [] notation """
        if state not in self.state_index_map:
            self._init_state(state)
        return self.table[self.state_index_map[state]]

    def _init_state(self, state):
        if self.num_states == self.capacity:
            self.table = np.concatenate((self.table, np.zeros_like(self.table)))
            self.capacity *= 2

        self.state_index_map[state] = self.num_states
        self.num_states += 1

## Training a Q-learner

In [5]:
#####################################
#   Training Parameters
#####################################
n_episodes = 15000
alpha = 0.1
gamma = 0.99
lmbda = 0.75
epsilon_start = 0.5
epsilon_end = 0.001
epsilon_decay_length = n_episodes * 0.8
epsilon_slope = (epsilon_end - epsilon_start) / epsilon_decay_length

#####################################
#   Environment/Reward Settings
#####################################
trace = 2
rf_width = 20
rf_height = 10
prog = 1
timestep = -1
cliff = 1000
win = -20
dead = -10
path = gym_marioai.levels.one_cliff_level

In [8]:
SAVE_FREQ = 100

def eps_greedy(state, env, Q, epsilon):
    if np.random.rand() < epsilon:
        return env.action_space.sample()
    else:
        return int(np.argmax(Q[state]))

    
def train():
    """
    Q Learning with epsilon decay and (replacing) eligibility traces
    """
    #log_path = f'{level}_{rf_width}x{rf_height}_trace{trace}_prog{prog}_cliff{cliff}_win{win}_dead{dead}-0'
    #logger = Logger(log_path)
    # collect some training statistics
    all_rewards = np.zeros([SAVE_FREQ])
    all_wins = np.zeros([SAVE_FREQ])
    all_steps = np.zeros([SAVE_FREQ])
    all_gap_jumps = np.zeros([SAVE_FREQ])

    ###################################
    #       environment setup
    ###################################
    reward_settings = gym_marioai.RewardSettings(progress=prog, timestep=timestep, cliff=cliff, win=win, dead=dead)
    env = gym.make('Marioai-v0', render=False,
                   level_path=path,
                   reward_settings=reward_settings,
                   compact_observation=True,
                   trace_length=trace,
                   rf_width=rf_width, rf_height=rf_height)

    ####################################
    #       Q-learner setup
    #####################################
    Q = QTable(env.n_actions, 128)
    etrace = {}

    ####################################
    #      Training Loop
    ####################################
    for e in range(n_episodes+1):
        done = False
        info = {}
        total_reward = 0
        steps = 0

        # exponential decay
        #epsilon = (epsilon_end / epsilon_start) ** (e / n_episodes) * epsilon_start
        epsilon = max(epsilon_start + e * epsilon_slope, epsilon_end)

        state = env.reset()
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q[state]))

        while not done:
            next_state, reward, done, info = env.step(action)
            total_reward += reward

            # choose a' from a Policy derived from Q
            best_next_action = int(np.argmax(Q[next_state]))  # greedy
            if np.random.rand() < epsilon:
                next_action = env.action_space.sample()
            else:
                next_action = int(np.argmax(Q[state]))
            
            # calculate the TD error
            td_error = reward + gamma * Q[next_state][best_next_action] - Q[state][action]

            # reset eligibility trace for (s,a) using replacing strategy
            etrace[(state, action)] = 1

            # perform Q update
            if best_next_action == next_action:
                for (s, a), eligibility in etrace.items():
                    Q[s][a] += alpha * eligibility * td_error
                    etrace[(s, a)] *= gamma * lmbda
            else:
                for (s, a), eligibility in etrace.items():
                    Q[s][a] += alpha * eligibility * td_error
                etrace = {}

            steps += 1
            state, action = next_state, next_action

        all_rewards[e % SAVE_FREQ] = total_reward
        all_wins[e % SAVE_FREQ] = 1 if info['win'] else 0
        all_steps[e % SAVE_FREQ] = info['steps']
        all_gap_jumps[e % SAVE_FREQ] = info['cliff_jumps']

        if e % SAVE_FREQ == 0 and e > 0:
            print(f'finished #{e}. eps: {epsilon:.3f} avg_R: {all_rewards.mean():>4.2f} '
                  f'avg_steps: {all_steps.mean():>4.2f} '
                  f'win_rate: {all_wins.mean():3.2f} gap_jumps: {all_gap_jumps.mean():.1f} '
                  f'states: {Q.num_states}')
    print('training finished.')
    env.teardown()
    return Q


In [9]:
Q = train()


finished #100. eps: 0.496 avg_R: -268.02 avg_steps: 296.82 win_rate: 0.01 gap_jumps: 0.0 states: 791
finished #200. eps: 0.492 avg_R: -197.65 avg_steps: 246.20 win_rate: 0.03 gap_jumps: 0.1 states: 926
finished #300. eps: 0.488 avg_R: -160.23 avg_steps: 208.84 win_rate: 0.03 gap_jumps: 0.1 states: 979
finished #400. eps: 0.483 avg_R: -71.88 avg_steps: 160.12 win_rate: 0.07 gap_jumps: 0.1 states: 1035
finished #500. eps: 0.479 avg_R: -150.44 avg_steps: 208.90 win_rate: 0.05 gap_jumps: 0.1 states: 1064
finished #600. eps: 0.475 avg_R: -56.20 avg_steps: 233.75 win_rate: 0.13 gap_jumps: 0.2 states: 1086
finished #700. eps: 0.471 avg_R: -13.96 avg_steps: 182.11 win_rate: 0.08 gap_jumps: 0.2 states: 1104
finished #800. eps: 0.467 avg_R: -21.78 avg_steps: 180.02 win_rate: 0.11 gap_jumps: 0.2 states: 1124


KeyboardInterrupt: 

In [10]:
Q.table[:10]

array([[ 273.11867125,  198.69165036,  257.80384889,  235.5508301 ,
         994.88994152,  184.15786209,  119.20277414,  189.28542897,
         164.92359604],
       [ 821.57713453,  958.10053996,  944.98641802,  767.18062244,
         941.96757806,  960.25557447,  945.05896179,  970.95820646,
         946.27578999],
       [ 541.46669231,  557.83105983,  945.83589326,  962.07512853,
         887.95144566,  602.16231956,  966.0081283 ,  990.89349073,
         916.69277353],
       [ 291.89122076,  353.19764055,  379.72456276,  949.47176187,
         289.47465368,  598.76241304,  493.98726499,  339.41658906,
         299.60595278],
       [ 256.86752917,  326.5520689 ,  314.67358927,  989.17271385,
         208.80959422,  339.16957282,  259.85740258,  295.24205572,
         333.56878072],
       [ 223.37780599,  278.20134585,  176.53362895, 1002.55647602,
         134.45094642,  204.54100502,  244.95155386,  204.71361493,
         233.31677016],
       [ 256.9199174 ,  251.67220108,  1

# Replay

In [12]:
reward_settings = gym_marioai.RewardSettings(progress=prog, timestep=timestep,
                                             cliff=cliff, win=win, dead=dead)
env = gym.make('Marioai-v0', render=True,
               level_path=path,
               reward_settings=reward_settings,
               compact_observation=True,
               trace_length=trace,
               rf_width=rf_width, rf_height=rf_height)

while True:
    done = False
    info = {}
    total_reward = 0
    steps = 0
    state = env.reset()

    while not done:
        action = int(np.argmax(Q[state]))  # greedy
        state, reward, done, info = env.step(action)
        total_reward += reward
        steps += 1

    print(f'finished episode. reward: {total_reward:4.2f}\t steps: {steps:4.2f}\t'
          f'win: {info["win"]}\t gap jumps: {info["cliff_jumps"]}')

ConnectionResetError: [Errno 104] Connection reset by peer

## Plotting the training results

In [None]:
# TODO