# Training Reinforcement Learning Agent
This notebook contains all the necessary code to train different policies and compare them to analyze the performance of the agent.

## Imports and global settings
The following section imports all necessary modules to train the agent and sets global settings.

In [1]:
from monte_carlo_eval import eval
import pandas as pd

Afterwards we remove and add a directory which is used to save the metrics for a trainings-run.

In [2]:
!rm -r ./performance-logs
!mkdir ./performance-logs

The following lines of code set the global parameters for the Monte Carlo simulation. The parameters are:

In [3]:
num_episodes = 10_000
checkpoint = 200

Lastly, the following method performs the evaluation and saves them into dataframes for further analysis.

In [4]:
def addMetrics(map, epsilon, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness):
    # Evaluate the performance of the agent
    total_time, epoch_times, overall_effectiveness, checkpoint_effectiveness = eval(map, epsilon, num_episodes, checkpoint)

    # Add the performance metrics to the dataframes
    df_total_times = pd.concat([pd.DataFrame([[map, epsilon, total_time]], columns=df_total_times.columns), df_total_times], ignore_index=True)
    df_overall_performance = pd.concat([pd.DataFrame([[map, epsilon, overall_effectiveness]], columns=df_overall_performance.columns), df_overall_performance], ignore_index=True)
    for i, epoch_time in enumerate(epoch_times):
        df_epoch_times = pd.concat([pd.DataFrame([[map, epsilon, i+1, epoch_time]], columns=df_epoch_times.columns), df_epoch_times], ignore_index=True)

    for i, epoch_time in enumerate(checkpoint_effectiveness):
        df_checkpoint_effectiveness = pd.concat([pd.DataFrame([[map, epsilon, (i+1)*checkpoint, epoch_time]], columns=df_checkpoint_effectiveness.columns), df_checkpoint_effectiveness], ignore_index=True)
    
    return df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness 

## Map 1
This section performs the evaluation of the reinforcement agent for `maps/map1.txt` for epsilon `0.9`, `0.7` and `0.5` respectively.

In [5]:
df_total_times = pd.DataFrame(columns = ['map', 'epsilon', 'total_time'])
df_epoch_times = pd.DataFrame(columns = ['map', 'epsilon', 'episode', 'epoch_time'])
df_overall_performance = pd.DataFrame(columns = ['map', 'epsilon', 'overall_performance'])
df_checkpoint_effectiveness = pd.DataFrame(columns = ['map', 'epsilon', 'episode', 'checkpoint_effectiveness'])

In [6]:
map_name = "map1"

### Epsilon 0.9

In [7]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.9, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

100%|██████████| 10000/10000 [42:14<00:00,  3.95it/s] 
  df_total_times = pd.concat([pd.DataFrame([[map, epsilon, total_time]], columns=df_total_times.columns), df_total_times], ignore_index=True)
  df_overall_performance = pd.concat([pd.DataFrame([[map, epsilon, overall_effectiveness]], columns=df_overall_performance.columns), df_overall_performance], ignore_index=True)
  df_epoch_times = pd.concat([pd.DataFrame([[map, epsilon, i+1, epoch_time]], columns=df_epoch_times.columns), df_epoch_times], ignore_index=True)
  df_checkpoint_effectiveness = pd.concat([pd.DataFrame([[map, epsilon, (i+1)*checkpoint, epoch_time]], columns=df_checkpoint_effectiveness.columns), df_checkpoint_effectiveness], ignore_index=True)


### Epsilon 0.7

In [8]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.7, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

100%|██████████| 10000/10000 [03:40<00:00, 45.37it/s]


### Epsilon 0.5

In [9]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.5, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

100%|██████████| 10000/10000 [01:15<00:00, 131.97it/s]


### Caching of results

In [10]:
df_total_times.to_csv(f'./performance-logs/df_total_times_{map_name}.csv', index=False)
df_epoch_times.to_csv(f'./performance-logs/df_epoch_times_{map_name}.csv', index=False)
df_overall_performance.to_csv(f'./performance-logs/df_overall_performance_{map_name}.csv', index=False)
df_checkpoint_effectiveness.to_csv(f'./performance-logs/df_checkpoint_effectiveness_{map_name}.csv', index=False)

## Map 2
This section performs the evaluation of the reinforcement agent for `maps/map2.txt` for epsilon `0.9`, `0.7` and `0.5` respectively.

In [11]:
df_total_times = pd.DataFrame(columns = ['map', 'epsilon', 'total_time'])
df_epoch_times = pd.DataFrame(columns = ['map', 'epsilon', 'episode', 'epoch_time'])
df_overall_performance = pd.DataFrame(columns = ['map', 'epsilon', 'overall_performance'])
df_checkpoint_effectiveness = pd.DataFrame(columns = ['map', 'epsilon', 'episode', 'checkpoint_effectiveness'])

In [12]:
map_name = "map2"

### Epsilon 0.9

In [13]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.9, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

100%|██████████| 10000/10000 [1:31:45<00:00,  1.82it/s] 
  df_total_times = pd.concat([pd.DataFrame([[map, epsilon, total_time]], columns=df_total_times.columns), df_total_times], ignore_index=True)
  df_overall_performance = pd.concat([pd.DataFrame([[map, epsilon, overall_effectiveness]], columns=df_overall_performance.columns), df_overall_performance], ignore_index=True)
  df_epoch_times = pd.concat([pd.DataFrame([[map, epsilon, i+1, epoch_time]], columns=df_epoch_times.columns), df_epoch_times], ignore_index=True)
  df_checkpoint_effectiveness = pd.concat([pd.DataFrame([[map, epsilon, (i+1)*checkpoint, epoch_time]], columns=df_checkpoint_effectiveness.columns), df_checkpoint_effectiveness], ignore_index=True)


### Epsilon 0.7

In [14]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.7, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

100%|██████████| 10000/10000 [04:24<00:00, 37.79it/s]


### Epsilon 0.5

In [15]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.5, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

100%|██████████| 10000/10000 [02:13<00:00, 74.89it/s]


### Caching of results

In [16]:
df_total_times.to_csv(f'./performance-logs/df_total_times_{map_name}.csv', index=False)
df_epoch_times.to_csv(f'./performance-logs/df_epoch_times_{map_name}.csv', index=False)
df_overall_performance.to_csv(f'./performance-logs/df_overall_performance_{map_name}.csv', index=False)
df_checkpoint_effectiveness.to_csv(f'./performance-logs/df_checkpoint_effectiveness_{map_name}.csv', index=False)

## Map 3
This section performs the evaluation of the reinforcement agent for `maps/map3.txt` for epsilon `0.9`, `0.7` and `0.5` respectively.

In [17]:
df_total_times = pd.DataFrame(columns = ['map', 'epsilon', 'total_time'])
df_epoch_times = pd.DataFrame(columns = ['map', 'epsilon', 'episode', 'epoch_time'])
df_overall_performance = pd.DataFrame(columns = ['map', 'epsilon', 'overall_performance'])
df_checkpoint_effectiveness = pd.DataFrame(columns = ['map', 'epsilon', 'episode', 'checkpoint_effectiveness'])

In [18]:
map_name = "map3"

### Epsilon 0.9

In [19]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.9, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [37:10<00:00,  4.48it/s] 
  df_total_times = pd.concat([pd.DataFrame([[map, epsilon, total_time]], columns=df_total_times.columns), df_total_times], ignore_index=True)
  df_overall_performance = pd.concat([pd.DataFrame([[map, epsilon, overall_effectiveness]], columns=df_overall_performance.columns), df_overall_performance], ignore_index=True)
  df_epoch_times = pd.concat([pd.DataFrame([[map, epsilon, i+1, epoch_time]], columns=df_epoch_times.columns), df_epoch_times], ignore_index=True)
  df_checkpoint_effectiveness = pd.concat([pd.DataFrame([[map, epsilon, (i+1)*checkpoint, epoch_time]], columns=df_checkpoint_effectiveness.columns), df_checkpoint_effectiveness], ignore_index=True)


### Epsilon 0.7

In [20]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.7, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

100%|██████████| 10000/10000 [02:50<00:00, 58.48it/s]


### Epsilon 0.5

In [21]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.5, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

100%|██████████| 10000/10000 [00:37<00:00, 264.75it/s]


### Caching of results

In [22]:
df_total_times.to_csv(f'./performance-logs/df_total_times_{map_name}.csv', index=False)
df_epoch_times.to_csv(f'./performance-logs/df_epoch_times_{map_name}.csv', index=False)
df_overall_performance.to_csv(f'./performance-logs/df_overall_performance_{map_name}.csv', index=False)
df_checkpoint_effectiveness.to_csv(f'./performance-logs/df_checkpoint_effectiveness_{map_name}.csv', index=False)

## Map 3 - Reduced
This section performs the evaluation of the reinforcement agent for `maps/map3_red.txt` for epsilon `0.9`, `0.7` and `0.5` respectively.

In [23]:
df_total_times = pd.DataFrame(columns = ['map', 'epsilon', 'total_time'])
df_epoch_times = pd.DataFrame(columns = ['map', 'epsilon', 'episode', 'epoch_time'])
df_overall_performance = pd.DataFrame(columns = ['map', 'epsilon', 'overall_performance'])
df_checkpoint_effectiveness = pd.DataFrame(columns = ['map', 'epsilon', 'episode', 'checkpoint_effectiveness'])

In [24]:
map_name = "map3_red"

### Epsilon 0.9

In [25]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.9, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

100%|██████████| 10000/10000 [11:55<00:00, 13.98it/s]
  df_total_times = pd.concat([pd.DataFrame([[map, epsilon, total_time]], columns=df_total_times.columns), df_total_times], ignore_index=True)
  df_overall_performance = pd.concat([pd.DataFrame([[map, epsilon, overall_effectiveness]], columns=df_overall_performance.columns), df_overall_performance], ignore_index=True)
  df_epoch_times = pd.concat([pd.DataFrame([[map, epsilon, i+1, epoch_time]], columns=df_epoch_times.columns), df_epoch_times], ignore_index=True)
  df_checkpoint_effectiveness = pd.concat([pd.DataFrame([[map, epsilon, (i+1)*checkpoint, epoch_time]], columns=df_checkpoint_effectiveness.columns), df_checkpoint_effectiveness], ignore_index=True)


### Epsilon 0.7

In [26]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.7, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

100%|██████████| 10000/10000 [01:08<00:00, 145.89it/s]


### Epsilon 0.5

In [27]:
df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness = addMetrics(map_name, 0.5, checkpoint, df_total_times, df_epoch_times, df_overall_performance, df_checkpoint_effectiveness)

100%|██████████| 10000/10000 [00:21<00:00, 456.05it/s]


### Caching of results

In [28]:
df_total_times.to_csv(f'./performance-logs/df_total_times_{map_name}.csv', index=False)
df_epoch_times.to_csv(f'./performance-logs/df_epoch_times_{map_name}.csv', index=False)
df_overall_performance.to_csv(f'./performance-logs/df_overall_performance_{map_name}.csv', index=False)
df_checkpoint_effectiveness.to_csv(f'./performance-logs/df_checkpoint_effectiveness_{map_name}.csv', index=False)

## Performance evaluation
This section plots and compares all the different performance metrics to get a better understanding of our implementation.

### Data Preparation
This section combines all the previously generated dataframes to be able to work with all collected datapoints.

In [29]:
# work in progress