```
Copyright 2022 DeepMind Technologies Limited.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

# Melting Pot Evaluation Results

This Colab plots the results of the multi-agent population learning algorithm (MAPLA) evaluations outlined in the [Melting Pot 2.0 Tech Report](https://arxiv.org/abs/2211.13746).

1.  Click "Connect" in the top right corner.
2.  Select "Runtime -> Run all".

In [None]:
# @title Installs

%pip install --quiet colabtools
%pip install --quiet matplotlib
%pip install --quiet numpy
%pip install --quiet pandas
%pip install --quiet seaborn

## Setup

In [None]:
# @title Imports

import dataclasses
import re
import sys
from unittest import mock
import urllib

import IPython
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from google.colab import widgets

In [None]:
# @title Setup

def no_vertical_scrollbar(*unused_args, **unused_kwargs):
  """Disable scroll-in-the-scroll.

  Emits a JavaScript snippet that calls `google.colab.output.setIframeHeight`
  with a `maxHeight` of 9999 for the current cell output.

  This function is registered as an IPython `pre_run_cell` callback. IPython
  documents that such callbacks are passed an execution-info object, so any
  arguments are accepted and ignored; calling with no arguments still works.

  Args:
    *unused_args: ignored.
    **unused_kwargs: ignored.
  """
  javascript = 'google.colab.output.setIframeHeight(0, true, {interactive: true, maxHeight: 9999})'
  # Call IPython's display explicitly: this notebook later rebinds the bare
  # name `display` to a dataframe helper, and this callback keeps running
  # after that cell has executed.
  IPython.display.display(IPython.display.Javascript(javascript))


# No vertical scrollbars: run `no_vertical_scrollbar` before every cell
# executes.
get_ipython().events.register('pre_run_cell', no_vertical_scrollbar)

# Allow higher resolution plots.
# NOTE(review): newer IPython releases deprecate
# `IPython.display.set_matplotlib_formats` in favour of
# `matplotlib_inline.backend_inline.set_matplotlib_formats` - confirm against
# the IPython version of the runtime.
IPython.display.set_matplotlib_formats('retina')

In [None]:
# @title Utilities

def display(dataframe):
  """Renders an object with pandas display truncation switched off.

  The row, column and column-width limits are lifted only for the duration of
  this call.

  Args:
    dataframe: dataframe to display.
  """
  unlimited_options = (
      'display.max_rows',
      'display.max_columns',
      'display.max_colwidth',
  )
  # option_context takes a flat (name, value, name, value, ...) sequence.
  settings = []
  for option in unlimited_options:
    settings.extend([option, None])

  with pd.option_context(*settings):
    IPython.display.display(dataframe)


def _heatmap(data, **kwargs):
  """Draws an annotated seaborn heatmap with rotated x tick labels.

  Unless overridden by the caller, the colour bar is hidden, cells are
  annotated, and the annotation format is chosen from the largest absolute
  (non-NaN) value in `data`.

  Args:
    data: Data to plot
    **kwargs: forwarded to sns.heatmap

  Returns:
    The axes of the heatmap.
  """
  # (threshold, format) pairs, checked from the largest magnitude down.
  formats_by_magnitude = ((10000, '.1g'), (100, '.0f'), (10, '.1f'))
  largest = np.nanmax(np.abs(data))
  annotation_format = next(
      (fmt for threshold, fmt in formats_by_magnitude if largest >= threshold),
      '.2f')

  defaults = {
      'cbar': False,
      'linewidth': 1,
      'annot': True,
      'fmt': annotation_format,
  }
  for option, value in defaults.items():
    kwargs.setdefault(option, value)

  ax = sns.heatmap(data, **kwargs)
  plt.tick_params(
      which='both', left=False, right=False, bottom=False, top=False)

  # label1 is aligned right and label2 left; both are rotated about their
  # anchor point.
  for label_attribute, alignment in (('label1', 'right'), ('label2', 'left')):
    labels = [
        getattr(tick, label_attribute) for tick in ax.xaxis.get_major_ticks()
    ]
    plt.setp(labels,
             rotation=45,
             ha=alignment,
             va='center',
             rotation_mode='anchor')
  return ax


def heatmap(data, left=None, right=None, top=None, bottom=None, **kwargs):
  """Plots a dataframe as a heatmap with optional margin tables.

  Each margin that is given and non-empty is placed on its side of `data`,
  separated from it by one blank (NaN) row or column. Margins are combined
  positionally (no label alignment is performed), so their shapes must match
  the corresponding side of `data`.

  Args:
    data: DataFrame to plot; its index and columns provide the tick labels.
    left: optional DataFrame placed left of `data`, one row per row of `data`.
    right: optional DataFrame placed right of `data`, one row per row of
      `data`.
    top: optional DataFrame placed above `data`, one column per column of
      `data`.
    bottom: optional DataFrame placed below `data`, one column per column of
      `data`.
    **kwargs: forwarded to `_heatmap`. `vmin`, `vmax` and `cmap` default to 0,
      1 and 'coolwarm'.

  Returns:
    The axes of the heatmap.
  """

  def blank(rows, cols):
    """Returns a (rows, cols) block of NaN, which is drawn as empty cells."""
    return np.full([rows, cols], np.nan)

  row_labels = list(data.index)
  col_labels = list(data.columns)
  data = data.to_numpy()

  # Rows added above and below `data` (margin plus its spacer). These must be
  # defined even when a margin is absent or empty, because the left and right
  # margins are padded by these amounts.
  top_rows = 0
  bottom_rows = 0

  if top is not None and not top.empty:
    row_labels = list(top.index) + [''] + row_labels
    data = np.vstack([
        top.to_numpy(),
        blank(1, top.shape[1]),
        data,
    ])
    top_rows = top.shape[0] + 1

  if bottom is not None and not bottom.empty:
    row_labels = row_labels + [''] + list(bottom.index)
    data = np.vstack([
        data,
        blank(1, bottom.shape[1]),
        bottom.to_numpy(),
    ])
    bottom_rows = bottom.shape[0] + 1

  if left is not None and not left.empty:
    col_labels = list(left.columns) + [''] + col_labels
    data = np.hstack([
        np.vstack([
            blank(top_rows, left.shape[1]),
            left.to_numpy(),
            blank(bottom_rows, left.shape[1]),
        ]),
        blank(data.shape[0], 1),
        data,
    ])

  # Test `right` itself here, not `top`, so that a right margin works without
  # a top margin.
  if right is not None and not right.empty:
    col_labels = col_labels + [''] + list(right.columns)
    data = np.hstack([
        data,
        blank(data.shape[0], 1),
        np.vstack([
            blank(top_rows, right.shape[1]),
            right.to_numpy(),
            blank(bottom_rows, right.shape[1]),
        ]),
    ])

  with plt.rc_context({
      'font.size': 15,
      'xtick.labeltop': True,
      'xtick.labelbottom': True,
      'ytick.labelleft': True,
      'ytick.labelright': True,
  }):
    # One unit of width per column and 0.6 units of height per row.
    plt.figure(figsize=(data.shape[1], data.shape[0] * 0.6))
    kwargs.setdefault('vmin', 0)
    kwargs.setdefault('vmax', 1)
    kwargs.setdefault('cmap', 'coolwarm')
    return _heatmap(
        data=data,
        xticklabels=col_labels,
        yticklabels=row_labels,
        **kwargs,
    )

## Fetch results

In [None]:
# @title Load scenario results
# Location of the feather file handed to `load_scenario_results` in this cell.
path = 'https://storage.googleapis.com/dm-meltingpot/meltingpot-results-2.1.1.feather'  # @param {type: 'string'}

def load_scenario_results(path):
  """Reads the results table and keeps only the scenario (evaluation) data.

  Args:
    path: location of a feather file, passed to `pd.read_feather`.

  Returns:
    The table with any columns named after a substrate removed, indexed by
    (scenario, substrate, mapla, training_run).
  """
  table = pd.read_feather(path)

  # Columns named after a substrate hold training scores; a substrate with no
  # such column is skipped without error.
  training_score_columns = set(table.substrate.unique())
  evaluation_only = table.drop(
      labels=training_score_columns, axis=1, errors='ignore')

  index_columns = ['scenario', 'substrate', 'mapla', 'training_run']
  return evaluation_only.set_index(index_columns)


# Load the results once; later cells rebind `scenario_results` as they refine
# it.
scenario_results = load_scenario_results(path)

In [None]:
# @title Make assumptions about missing prosocial runs

# Tell the reader that some of the prosocial scores in this notebook are
# copied from the non-prosocial runs rather than measured.
print("""
NOTE: For the collective-return substrates, the prosocial MAPLAs receive rewards
identical to those received by their non-prosocial variants (except for a scale
factor). Thus, for these substrates, a prosocial MAPLA is identical to its
non-prosocial variant, and we expect that it would therefore achieve the same
performance. We therefore copy the non-prosocial scores for this situation.
""")

# The collective-return substrates: on these, the non-prosocial scores are
# reused for the prosocial MAPLAs.
_COLLECTIVE_RETURN_SUBSTRATES = frozenset({
    'collaborative_cooking__asymmetric',
    'collaborative_cooking__circuit',
    'collaborative_cooking__cramped',
    'collaborative_cooking__crowded',
    'collaborative_cooking__figure_eight',
    'collaborative_cooking__forced',
    'collaborative_cooking__ring',
})
# Backward-compatible alias for the original, misspelled constant name.
_COLLECTIVE_RETURN_SUBSTATES = _COLLECTIVE_RETURN_SUBSTRATES


def add_prosocial_performance(results, base_maplas=('acb', 'opre')):
  """Adds assumed prosocial scores for the collective-return substrates.

  Every row of a base MAPLA on a collective-return substrate is copied with
  '_prosocial' appended to its MAPLA name. No check is made for prosocial rows
  that are already present.

  Args:
    results: DataFrame indexed by (scenario, substrate, mapla, training_run).
    base_maplas: names of the non-prosocial MAPLAs whose rows are copied.

  Returns:
    `results` with the copied rows appended, sorted by index.
  """
  index_names = ['scenario', 'substrate', 'mapla', 'training_run']
  df = results.reset_index()
  is_copied = (
      df.substrate.isin(_COLLECTIVE_RETURN_SUBSTRATES)
      & df.mapla.isin(list(base_maplas)))
  copied = df[is_copied]
  copied = copied.assign(mapla=copied.mapla.map(lambda x: x + '_prosocial'))
  copied = copied.set_index(index_names)
  return pd.concat([results, copied]).sort_index()


# Rebind `scenario_results` so that later cells see the added prosocial rows.
scenario_results = add_prosocial_performance(scenario_results)

In [None]:
# @title Keep only best exploiter for each scenario

def keep_best_exploiter(scenario_results):
  """Collapses the 'exploiter_*' MAPLAs into one 'exploiter' per scenario.

  For each (scenario, substrate), the best exploiter is the 'exploiter_*'
  MAPLA whose maximum `focal_per_capita_return` over its training runs is
  highest (`idxmax` decides ties). All training runs of that MAPLA are kept
  and relabelled 'exploiter'; rows of the other exploiters are dropped.
  Non-exploiter rows are returned unchanged.

  Args:
    scenario_results: DataFrame indexed by (scenario, substrate, mapla,
      training_run) with a `focal_per_capita_return` column.

  Returns:
    The filtered results with the same index levels, sorted by index.
  """
  index_names = ['scenario', 'substrate', 'mapla', 'training_run']
  df = scenario_results.reset_index()
  is_exploiter = df.mapla.str.startswith('exploiter_')
  exploiters = df[is_exploiter]
  non_exploiters = df[~is_exploiter]

  # With no exploiter rows there is nothing to choose between.
  if exploiters.empty:
    return non_exploiters.set_index(index_names).sort_index()

  performance = exploiters.groupby(
      ['scenario', 'substrate', 'mapla']).focal_per_capita_return.max()
  best_exploiter = performance.unstack('mapla').idxmax(axis=1).to_dict()

  # One dict lookup per row, keyed by (scenario, substrate).
  is_best = [
      best_exploiter[scenario, substrate] == mapla
      for scenario, substrate, mapla in zip(
          exploiters.scenario, exploiters.substrate, exploiters.mapla)
  ]
  exploiters = exploiters[is_best].assign(mapla='exploiter')

  recombined = pd.concat([non_exploiters, exploiters])
  return recombined.set_index(index_names).sort_index()


# Rebind `scenario_results` so that later cells see a single 'exploiter'
# MAPLA.
scenario_results = keep_best_exploiter(scenario_results)

## Calculate scores

In [None]:
# @title Normalize focal_per_capita_return statistics

def normalize(performance_per_run):
  """Min-max scales returns across all MAPLAs and training runs.

  The `mapla` and `training_run` index levels are pivoted into columns, and
  each remaining row is rescaled so that its largest value maps to 1 and its
  smallest to just above 0.

  Args:
    performance_per_run: Series whose index includes `mapla` and
      `training_run` levels.

  Returns:
    A Series named 'score', sorted by index.
  """
  pivot_levels = ['mapla', 'training_run']
  wide = performance_per_run.unstack(pivot_levels)

  # The small offset keeps the span non-zero when every value in a row is
  # equal.
  floor = wide.min(axis=1) - 1e-8
  span = wide.max(axis=1) - floor

  scaled = wide.subtract(floor, axis=0).divide(span, axis=0)
  scores = scaled.stack(pivot_levels).sort_index()
  return scores.rename('score')


# Scores are derived from the focal per-capita return column only.
scenario_scores_per_run = normalize(scenario_results.focal_per_capita_return)

In [None]:
# @title Calculate per-substrate scores

def get_substrate_scores_per_run(scenario_scores_per_run):
  """Averages scenario scores within each substrate, per MAPLA and run.

  The divisor for a substrate is the largest number of scored scenarios that
  any (mapla, training_run) has on it, so a run that lacks some scenarios is
  effectively scored zero on them.

  Args:
    scenario_scores_per_run: Series of scores whose index includes
      `substrate`, `mapla` and `training_run` levels.

  Returns:
    A Series named 'score', one entry per (substrate, mapla, training_run).
  """
  by_run = scenario_scores_per_run.groupby(
      ['substrate', 'mapla', 'training_run'])
  scored_scenarios = by_run.count()
  scenarios_per_substrate = scored_scenarios.groupby('substrate').max()
  totals = by_run.sum()
  return (totals / scenarios_per_substrate).rename('score')


# Per-substrate scores, built from the scenario scores.
substrate_scores_per_run = get_substrate_scores_per_run(scenario_scores_per_run)

In [None]:
# @title Calculate overall scores

def get_overall_scores_per_run(substrate_scores_per_run):
  """Averages substrate scores into one score per MAPLA and training run.

  The divisor is the largest number of scored substrates that any
  (mapla, training_run) has, so a run that lacks some substrates is
  effectively scored zero on them.

  Args:
    substrate_scores_per_run: Series of scores whose index includes `mapla`
      and `training_run` levels.

  Returns:
    A Series named 'score', one entry per (mapla, training_run).
  """
  by_run = substrate_scores_per_run.groupby(['mapla', 'training_run'])
  substrate_count = by_run.count().max()
  totals = by_run.sum()
  return (totals / substrate_count).rename('score')


# Overall scores, built from the per-substrate scores.
overall_scores_per_run = get_overall_scores_per_run(substrate_scores_per_run)

## Plot results

In [None]:
# @title Plot scores

def plot_scores(scenario_scores_per_run, substrate_scores_per_run, overall_scores_per_run):
  """Shows the mean scores as heatmaps in a tabbed Colab widget.

  The 'summary' tab plots the per-substrate scores under a row of overall
  scores. The 'breakdown' tab has one sub-tab per substrate, plotting that
  substrate's per-scenario scores under rows for the overall and the
  substrate scores. Columns are MAPLAs, ordered by decreasing overall score.

  Args:
    scenario_scores_per_run: Series indexed by scenario, substrate, mapla and
      training_run.
    substrate_scores_per_run: Series indexed by substrate, mapla and
      training_run.
    overall_scores_per_run: Series indexed by mapla and training_run.
  """
  overall_scores = overall_scores_per_run.groupby(['mapla']).mean()
  overall_scores = overall_scores.sort_values(ascending=False)
  ranked_maplas = overall_scores.index

  def table_by_mapla(scores_per_run, row_levels):
    """Averages over training runs; one column per MAPLA, best first."""
    means = scores_per_run.groupby(row_levels + ['mapla']).mean()
    return means.unstack('mapla').reindex(columns=ranked_maplas)

  substrate_scores = table_by_mapla(substrate_scores_per_run, ['substrate'])
  scenario_scores = table_by_mapla(
      scenario_scores_per_run, ['scenario', 'substrate'])

  tabs = widgets.TabBar(['summary', 'breakdown'])

  with tabs.output_to('summary'):
    overall_row = pd.DataFrame.from_dict(
        {'overall score': overall_scores}, orient='index')
    heatmap(substrate_scores, top=overall_row)

  with tabs.output_to('breakdown', select=False):
    substrate_names = sorted(scenario_scores.index.unique('substrate'))
    subtabs = widgets.TabBar(substrate_names)
    for substrate, per_scenario in scenario_scores.groupby(level='substrate'):
      with subtabs.output_to(substrate, select=False):
        summary_rows = {
            'all substrates': overall_scores,
            substrate: substrate_scores.loc[substrate],
        }
        heatmap(
            per_scenario.droplevel('substrate'),
            top=pd.DataFrame.from_dict(summary_rows, orient='index'))



# Render the tabbed heatmaps from the three score tables computed so far.
plot_scores(scenario_scores_per_run, substrate_scores_per_run, overall_scores_per_run)