## Authentication
Import the necessary libraries and authenticate the user to access the Google Sheet:

In [None]:
# Authenticate the current Colab user; this import only resolves inside Google Colab.
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
# default() returns a pair; only the first element (the credentials) is used here.
creds, _ = default()

# Module-level gspread client; sheet2df() below reads it as the global `gc`.
gc = gspread.authorize(creds)

## Code availability analysis

This section is placed BEFORE the original analysis so that the original analysis may overwrite its code and variables, but not the other way around.

### Loading repository information from GitHub  
**!! Credentials Required !!**

In [None]:
# WORKS ONLY IN Google Colab
import os

# Sheet names to analyze
INLG_2023_PAPERS = "INLG 2023 Papers"
GENERATION = "Generation"
CODE_SHEETS = [INLG_2023_PAPERS, GENERATION]
# CODE_SHEETS = ["INLG 2023 Papers"]

# List the column names as constants to prevent typos
TITLE = "Title"
ACL_ID = "ACL ID"
PAPER_TYPE = "Paper Type"
PAPER_LINK = "Paper Link"
CHECKED_BY = "Checked By"
COMPLETE = "Complete"
LINK_TO_CODE = "Link to Code"  #github, some other url, "None first view"
NONE_FIRST_VIEW = "None first view"  # value of LINK_TO_CODE if no code was found
PROMISED_DELIVERED = "Promised Delivered"
INSTALLATION = "Installation"
EXPERIMENTS_COVERED = "Experiments Covered"
SCRIPTS_DOCUMENTATION = "Scripts Documentation"
BEST_PRACTICES = "Best Practices"
ACADEMIA_INDUSTRY = "Academia/Industry"
STARS = "stars"
CITATIONS = "Citations"
# Original mixed-case spelling, kept so existing references keep working.
Citations = CITATIONS

# GitHub credentials used for the GitHub API requests.
# Prefer exporting GITHUB_USERNAME / GITHUB_TOKEN in the environment so that no
# secret ends up saved inside the notebook; both fall back to an empty string.
gh_username = os.environ.get("GITHUB_USERNAME", "")
secret_token = os.environ.get("GITHUB_TOKEN", "")

import pandas as pd
import matplotlib.pyplot as plt

def sheet2df(sheet, spreadsheet_url='https://docs.google.com/spreadsheets/d/10Yvxn7sb78cZmpmM0RYoAuVHoaP_KiDc970vg9pSa2c/'):
  """Load one worksheet of a Google spreadsheet into a DataFrame.

  Uses the module-level gspread client `gc`.

  Args:
    sheet: Name of the worksheet (tab) to read.
    spreadsheet_url: URL of the spreadsheet; defaults to the project spreadsheet.

  Returns:
    A DataFrame whose column names are the first row of the worksheet and whose
    records are the remaining rows. A worksheet with no rows at all yields an
    empty DataFrame.
  """
  worksheet = gc.open_by_url(spreadsheet_url).worksheet(sheet)

  # get_all_values gives a list of rows.
  rows = worksheet.get_all_values()
  if not rows:
    # Without a header row there is nothing to name the columns after.
    return pd.DataFrame()

  # First row is the header, everything after it is data.
  return pd.DataFrame.from_records(rows[1:], columns=rows[0])

# Load every code-availability sheet into its own DataFrame, keyed by sheet name.
# When the gspread client `gc` does not exist (authentication cell not run, e.g.
# outside Google Colab) sheet2df raises NameError; fall back to None so that the
# `codedfs is not None` check further down can take its CSV-loading branch.
try:
    codedfs = {sheet: sheet2df(sheet) for sheet in CODE_SHEETS}
except NameError as missing_name:
    print(f"Google Sheets access unavailable ({missing_name}); expecting local CSV exports instead.")
    codedfs = None

# RUN THE CELL ABOVE AND save the files or load them if you are on google colab or local
# from google.colab import files
import requests
from urllib.parse import urlparse


def get_github_stars(url):
    """Return the stargazer count of the GitHub repository that `url` points to.

    The owner and repository name are taken from the first two path segments of
    the URL and looked up through the GitHub REST API.

    Args:
        url: Repository URL, e.g. "https://github.com/<owner>/<repo>".

    Returns:
        The value of "stargazers_count" in the API response.

    Raises:
        Exception: Whatever went wrong (malformed URL, network failure, missing
            field in the response) is re-raised after diagnostics are printed.
    """
    # Pre-bind everything the error handler prints, so that a failure early in
    # the try block is not masked by an unbound-name error inside the handler.
    owner = repo = response = None
    try:
        # Parse the URL to get the path
        path = urlparse(url).path
        # Split the path to get the repository owner and name
        owner, repo = path.strip("/").split("/")[0:2]
        # Only send basic auth when credentials were actually configured;
        # otherwise make an unauthenticated request.
        credentials = (gh_username, secret_token) if gh_username and secret_token else None
        # Make a GET request to the GitHub API (bounded so a stalled connection cannot hang the cell)
        response = requests.get(f"https://api.github.com/repos/{owner}/{repo}", auth=credentials, timeout=30)
        # Extract the number of stars from the response
        stars = response.json()["stargazers_count"]
    except Exception as e:
        # Print the raw body instead of re-parsing JSON, which could raise again.
        body = getattr(response, "text", None)
        print(f"{url=} {owner=} {repo=}\nresponse body={body!r}\n{e}", flush=True)
        raise
    return stars

if codedfs is None:
    # No sheets were loaded from Google Sheets: read the previously exported CSV files instead.
    codedfs = {}
    for sheet in CODE_SHEETS:
        csv_name = f"{sheet}.csv"
        codedfs[sheet] = pd.read_csv(csv_name)
else:
    # Sheets are available, which is taken as the sign that we run in Google Colab.
    from google.colab import files

    for sheet, df in codedfs.items():
        df[PROMISED_DELIVERED] = df[PROMISED_DELIVERED].astype(str)
        # Query GitHub only for rows that link to github and are not marked with '404'.
        not_marked_404 = ~df[PROMISED_DELIVERED].str.contains('404')
        links_to_github = df[LINK_TO_CODE].str.contains('github')
        df['stars'] = df.loc[not_marked_404 & links_to_github, LINK_TO_CODE].apply(get_github_stars)

        ##### HERE YOU CAN SAVE THE DATA AND LOAD IT AGAIN
        # codedfs[sheet].to_csv(f"{sheet}.csv")
        # files.download(f"{sheet}.csv")

# codedfs[INLG_2023_PAPERS].head()

### Analysis of stars

In [None]:
# Star statistics and histogram for INLG, split by paper type
inlgcdf = codedfs[INLG_2023_PAPERS]
aclcdf = codedfs[GENERATION]

from numpy import log10


def logone(x):
    """Map a star count to log10(x + 1), so that zero stars maps to zero."""
    return log10(x+1)


def inv_logone(x):
    """Inverse of logone: turn a log-axis position back into a star count."""
    return 10**x - 1


inlg_long_stars = inlgcdf.loc[inlgcdf["Paper Type"] == "Long", "stars"]
inlg_short_stars = inlgcdf.loc[inlgcdf["Paper Type"] == "Short", "stars"]

# Mean then median, long papers first, short papers second
for star_counts in (inlg_long_stars, inlg_short_stars):
    print(star_counts.mean())
    print(star_counts.median())

plt.hist(inlg_long_stars.apply(logone), bins=10, alpha=0.5, label='INLG Long Papers', color="#b8cdab")
plt.hist(inlg_short_stars.apply(logone), bins=10, alpha=0.5, label='INLG Short Papers', color="#e5c185")
# Relabel the log-scaled ticks with the star counts they stand for
locs, _ = plt.xticks()
plt.xticks(locs, [str(int(inv_logone(tick))) for tick in locs])
plt.xlabel("GitHub Stars [log scale]")
plt.ylabel("# GitHub Repos in Bin")
plt.legend(loc='upper right')

In [None]:
# Star statistics and histogram for ACL, split by paper type
acl_long_stars = aclcdf.loc[aclcdf["Paper Type"] == "Long", "stars"]
acl_short_stars = aclcdf.loc[aclcdf["Paper Type"] == "Short", "stars"]

# Mean then median, long papers first, short papers second
for star_counts in (acl_long_stars, acl_short_stars):
    print(star_counts.mean())
    print(star_counts.median())

plt.hist(acl_long_stars.apply(logone), bins=10, alpha=0.5, label='ACL Long Papers', color="#b8cdab")
plt.hist(acl_short_stars.apply(logone), bins=10, alpha=0.5, label='ACL Short Papers', color="#e5c185")
# Relabel the log-scaled ticks with the star counts they stand for
locs, _ = plt.xticks()
plt.xticks(locs, [str(int(inv_logone(tick))) for tick in locs])
plt.xlabel("GitHub Stars [log scale]")
plt.ylabel("# GitHub Repos in Bin")
plt.legend(loc='upper right', title="GitHub Stars")

In [None]:
# Compare the overall star distributions of the two sheets
acl_stars = aclcdf["stars"]
inlg_stars = inlgcdf["stars"]

acl_stars_mean = acl_stars.mean()
inlg_stars_mean = inlg_stars.mean()
print(f"mean acl {acl_stars_mean:.2f} vs ingl {inlg_stars_mean:.2f}")
acl_stars_q50 = acl_stars.median()
inlg_stars_q50 = inlg_stars.median()
print(f"q50 acl {acl_stars_q50:.2f} vs ingl {inlg_stars_q50:.2f}")


acl_stars.apply(logone).hist(bins=10, alpha=0.8, label="ACL", color="#b8cdab")
inlg_stars.apply(logone).hist(bins=10, alpha=0.8, label="INLG", color="#e5c185")
# Relabel the log-scaled ticks with the star counts they stand for
locs, _ = plt.xticks()
plt.xticks(locs, [str(int(inv_logone(tick))) for tick in locs])
plt.xlabel("GitHub Stars [log scale]")
plt.ylabel("# GitHub Repos in Bin")
plt.legend(loc='upper right', title="GitHub Stars for papers")
print("fig:stars_acl_inlg")

In [None]:
sheet = GENERATION
aidf = codedfs[sheet]

# Row selections shared by the statistics and the three histograms below
not_filled = aidf[PROMISED_DELIVERED].str.startswith("NOT FILLED")
delivered_stars = aidf.loc[(aidf[LINK_TO_CODE] != NONE_FIRST_VIEW) & (~not_filled), "stars"]
missing_academia_stars = aidf.loc[(aidf[ACADEMIA_INDUSTRY] == "Academia") & not_filled, "stars"]
missing_industry_stars = aidf.loc[(aidf[ACADEMIA_INDUSTRY] == "Industry") & not_filled, "stars"]

# NOTE(review): the variable names say "industry" but the rows selected are "Academia" - confirm which is intended.
acl_mispromised_industry_mean = missing_academia_stars.mean()
acl_mispromised_industry_q50 = missing_academia_stars.median()

print(f"{acl_mispromised_industry_mean:.2f} {acl_mispromised_industry_q50:.2f}")

delivered_stars.apply(logone).hist(bins=10, alpha=0.5, label="Delivered", color="#779af5")
missing_academia_stars.apply(logone).hist(bins=10, alpha=0.8, label="Missing in Academia", color="#b8cdab")
missing_industry_stars.apply(logone).hist(bins=10, alpha=0.8, label="Missing in Industry", color='#e5c185')
legend_title = "ACL GitHub Stars" if sheet == GENERATION else "INLG GitHub Stars"
plt.legend(loc='upper right', title=legend_title)
# Relabel the log-scaled ticks with the star counts they stand for
locs, _ = plt.xticks()
plt.xticks(locs, [str(int(inv_logone(tick))) for tick in locs])
plt.xlabel("GitHub Stars [log scale]")
plt.ylabel("# GitHub Repos in Bin")

print("fig:stars_mispromised")

In [None]:
sheet = INLG_2023_PAPERS
aidf = codedfs[sheet]

# Row selections for the three histograms below
not_filled = aidf[PROMISED_DELIVERED].str.startswith("NOT FILLED")
delivered_stars = aidf.loc[(aidf[LINK_TO_CODE] != NONE_FIRST_VIEW) & (~not_filled), "stars"]
missing_academia_stars = aidf.loc[(aidf[ACADEMIA_INDUSTRY] == "Academia") & not_filled, "stars"]
missing_industry_stars = aidf.loc[(aidf[ACADEMIA_INDUSTRY] == "Industry") & not_filled, "stars"]

delivered_stars.apply(logone).hist(bins=10, alpha=0.5, label="Delivered", color="#779af5")
missing_academia_stars.apply(logone).hist(bins=10, alpha=0.8, label="Missing in Academia", color="#b8cdab")
missing_industry_stars.apply(logone).hist(bins=10, alpha=0.8, label="Missing in Industry", color='#e5c185')
legend_title = "ACL GitHub Stars" if sheet == GENERATION else "INLG GitHub Stars"
plt.legend(loc='upper right', title=legend_title)
# Relabel the log-scaled ticks with the star counts they stand for
locs, _ = plt.xticks()
plt.xticks(locs, [str(int(inv_logone(tick))) for tick in locs])
plt.xlabel("GitHub Stars [log scale]")
plt.ylabel("# GitHub Repos in Bin")
print("fig:stars_mispromised")

In [None]:
sheet = INLG_2023_PAPERS
aidf = codedfs[sheet]

# Rows that link to code and whose "Promised Delivered" entry does not start with "NOT FILLED"
has_code = (aidf[LINK_TO_CODE] != NONE_FIRST_VIEW) & (~aidf[PROMISED_DELIVERED].str.startswith("NOT FILLED"))
none_instr_stars = aidf.loc[(aidf[INSTALLATION] == "None") & has_code, "stars"]
basic_instr_stars = aidf.loc[(aidf[INSTALLATION] == "Basic") & has_code, "stars"]
detailed_instr_stars = aidf.loc[(aidf[INSTALLATION] == "Detailed") & has_code, "stars"]

none_instr_stars.apply(logone).hist(bins=10, alpha=0.2, label="None Instruction", color="grey")
basic_instr_stars.apply(logone).hist(bins=10, alpha=0.2, label="Basic Instruction", color="red")
detailed_instr_stars.apply(logone).hist(bins=10, alpha=0.2, label="Detailed Instruction", color="green")
legend_title = "ACL GitHub Stars" if sheet == GENERATION else "INLG GitHub Stars"
plt.legend(loc='upper right', title=legend_title)
# Relabel the log-scaled ticks with the star counts they stand for
locs, _ = plt.xticks()
plt.xticks(locs, [str(int(inv_logone(tick))) for tick in locs])
plt.xlabel("GitHub Stars [log scale]")
plt.ylabel("# GitHub Repos in Bin")

### Analysis of code availability and quality

In [None]:
# Count, per conference, the papers whose "Promised Delivered" entry starts with
# "NOT FILLED", broken down by institution type, and draw them as stacked bars.
mapping = {"ACL": "Generation", "INLG":  "INLG 2023 Papers"}

inlg = codedfs[mapping["INLG"]]
inlg_not_filled = inlg[PROMISED_DELIVERED].str.startswith("NOT FILLED")
inlg_missing = inlg.loc[inlg_not_filled, ACADEMIA_INDUSTRY].value_counts()

acl = codedfs[mapping["ACL"]]
acl_not_filled = acl[PROMISED_DELIVERED].str.startswith("NOT FILLED")
acl_missing = acl.loc[acl_not_filled, ACADEMIA_INDUSTRY].value_counts()

# One row per conference, one column per institution type
comparison = pd.DataFrame({'INLG 2023': inlg_missing, 'ACL 2023': acl_missing}).fillna(0).T
ax = comparison.plot(
    kind='bar',
    stacked=True,
    ylim=(0,16),
    color=["#e5c185", "#b8cdab", "#779af5"],
    figsize=(5,5),
)

labels = []

plt.ylabel('# of Papers')
plt.legend(title='Institution Type', loc='upper left', ncols=1)
plt.xticks(rotation='horizontal')
plt.tight_layout()

totals = comparison.sum(axis=1)

# Write each segment's share of its conference total into the middle of the segment
for patch in ax.patches:
    width = patch.get_width()
    height = patch.get_height()
    x, y = patch.get_xy()

    # The bar's horizontal centre selects the tick label, i.e. the conference
    conference = ax.get_xticklabels()[int(x + width / 2)].get_text()

    total = totals[conference]
    if total > 0:
        percentage = (height / total) * 100
    else:
        percentage = 0

    # Empty segments get no label
    if not height > 0:
        continue
    ax.text(x + width / 2, y + height / 2, f'{percentage:.1f}%', ha='center', va='center')


plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
# Accumulator of preprocessed counts that the later plotting cells read.
# Each entry is a tuple: (column investigated, conference name, label, label count).
counts = []

In [None]:
label = "Open Source?"

def missing(comment):
    """Classify a "Promised Delivered" entry.

    Returns "Missing" when the entry starts with 'NOT FILLED', otherwise "Delivered".
    """
    if comment.startswith('NOT FILLED'):
        return "Missing"
    return "Delivered"

# Record, per conference, how many papers fall into "Missing" vs "Delivered"
for sheet, df in codedfs.items():
    df[PROMISED_DELIVERED] = df[PROMISED_DELIVERED].fillna('None')
    df['Category'] = df[PROMISED_DELIVERED].apply(missing)
    missing_series = df['Category'].value_counts()

    dset = "ACL" if sheet == GENERATION else "INLG"
    counts.extend([(label, dset, category, n_papers) for category, n_papers in missing_series.items()])


def classify_code(link):
    """Bucket a "Link to Code" value into "GitHub", "None" or "Other".

    Any link containing "github" is "GitHub"; the NONE_FIRST_VIEW marker is
    "None"; everything else is "Other".
    """
    if "github" in link:
        return "GitHub"
    return "None" if link == NONE_FIRST_VIEW else "Other"

# Add a "Not Published" entry per conference for papers without any code link
for sheet, df in codedfs.items():
    code_series = df[LINK_TO_CODE].apply(classify_code).value_counts()

    dset = "ACL" if sheet == GENERATION else "INLG"
    counts.extend([
        (label, dset, "Not Published", n_papers)
        for kind, n_papers in code_series.items()
        if kind == "None"
    ])

In [None]:
label = "Installation Instructions"
for sheet, df in codedfs.items():
    # Missing cells count as 'None'
    df['Installation'] = df['Installation'].fillna('None')
    print(f"Unique values for 'Installation' in {sheet}: {df['Installation'].unique()}")
    installation_series = df['Installation'].value_counts()

    # Keep the raw counts for the comparison bar charts drawn later
    dset = "ACL" if sheet == GENERATION else "INLG"
    counts.extend([(label, dset, level, n_papers) for level, n_papers in installation_series.items()])

    installation_series.plot(kind='pie', autopct='%1.1f%%')
    plt.title(f"'Installation' distribution in {sheet}")
    plt.show()

    print("fig:install_instr")

In [None]:
label = "Found Experiments"
for sheet, df in codedfs.items():
    # Missing cells count as 'None'
    df['Experiments Covered'] = df['Experiments Covered'].fillna('None')
    print(f"Unique values for 'Experiments Covered' in {sheet}: {df['Experiments Covered'].unique()}")
    experiments_series = df['Experiments Covered'].value_counts()
    experiments_series.plot(kind='pie', autopct='%1.1f%%')

    # Keep the raw counts for the comparison bar charts drawn later
    dset = "ACL" if sheet == GENERATION else "INLG"
    counts.extend([(label, dset, level, n_papers) for level, n_papers in experiments_series.items()])

    plt.title(f"'Experiments Covered' distribution in {sheet}")
    plt.show()

    print("fig_experiments")

In [None]:
label = "Documentation of Code"

for sheet, df in codedfs.items():
    # Missing cells count as 'None'
    df['Scripts Documentation'] = df['Scripts Documentation'].fillna('None')
    print(f"Unique values for 'Scripts Documentation' in {sheet}: {df['Scripts Documentation'].unique()}")
    # Sorted by category name so both conferences list the categories in the same order
    scripts_doc_series = df['Scripts Documentation'].value_counts().sort_index()
    scripts_doc_series.plot(kind='pie', autopct='%1.1f%%')

    # Keep the raw counts for the comparison bar charts drawn later
    dset = "ACL" if sheet == GENERATION else "INLG"
    counts.extend([(label, dset, level, n_papers) for level, n_papers in scripts_doc_series.items()])
    plt.title(f"'Scripts Documentation' distribution in {sheet}")
    plt.show()
    print("fig:documentation")

In [None]:
# Stacked bar chart: was the promised code delivered, per conference.

# Conference names in first-seen order. `counts` holds one tuple per
# (column, conference, label), so the raw list of names is full of duplicates.
confs = list(dict.fromkeys(t[1] for t in counts))

label = 'Open Source?'
d = {}
for conf in confs:
  # {category: count} for this conference and this label
  d[f"{conf} 2023"] = dict(t[2:4] for t in counts if t[1] == conf and t[0] == label)
# One row per conference, one column per category; absent categories count as 0
comparison = pd.DataFrame(d).fillna(0).T

ax = comparison.plot(kind='bar', stacked=True, color=["#e5c185", "#b8cdab", "#779af5"], figsize=(5,5))

# Category names in the order the stacked segments were actually drawn
labels = list(comparison.columns)

plt.ylabel('# of Papers')

# Shrink current axis's height by 10% on the bottom
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.1,
              box.width, box.height * 0.9])

# A single legend call: the bars are already labelled with the column names,
# so the legend entries follow the plotted order.
plt.legend(title='Was code delivered?', loc='upper left', ncols=1)
plt.xticks(rotation='horizontal')
plt.tight_layout()

totals = comparison.sum(axis=1)

# Annotate the bars with the percentage values
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()

    # The bar centre sits on the integer position of its conference row;
    # round() instead of int() so float noise cannot select the neighbour.
    conference = comparison.index[round(x + width / 2)]

    # Calculate the percentage relative to the total for that conference
    total = totals[conference]
    percentage = (height / total) * 100 if total > 0 else 0

    # Display the percentage
    if height > 0:
        ax.text(x + width / 2, y + - 0.1 + height / 2, f'{percentage:.1f}%', ha='center', va='center', fontsize=10)


plt.show()
print("fig_nocode_deliver_promised")

In [None]:
# Stacked bar chart: documentation status of the code, per conference.

# Conference names in first-seen order. `counts` holds one tuple per
# (column, conference, label), so the raw list of names is full of duplicates.
confs = list(dict.fromkeys(t[1] for t in counts))
label = 'Documentation of Code'
d = {}

for conf in confs:
  # {category: count} for this conference and this label
  d[f"{conf} 2023"] = dict(t[2:4] for t in counts if t[1] == conf and t[0] == label)
# One row per conference, one column per category; absent categories count as 0
comparison = pd.DataFrame(d).fillna(0).T


ax = comparison.plot(kind='bar', stacked=True, color=["#e5c185", "#b8cdab", "#779af5"], figsize=(5,5))

# Category names in the order the stacked segments were actually drawn
labels = list(comparison.columns)

plt.ylabel('# of Papers')

# Shrink current axis's height by 10% on the bottom
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.1,
              box.width, box.height * 0.9])

# No explicit `labels=`: the bars are already labelled with the column names.
# Overriding them with the first conference's keys mislabels the legend as soon
# as that conference lacks a category or lists them in a different order.
plt.legend(title="Documentation Status", loc='upper left', ncols=1)
plt.xticks(rotation='horizontal')
plt.tight_layout()

totals = comparison.sum(axis=1)

# Annotate the bars with the percentage values
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()

    # The bar centre sits on the integer position of its conference row;
    # round() instead of int() so float noise cannot select the neighbour.
    conference = comparison.index[round(x + width / 2)]

    # Calculate the percentage relative to the total for that conference
    total = totals[conference]
    percentage = (height / total) * 100 if total > 0 else 0

    # Display the percentage
    if height > 0:
        ax.text(x + width / 2, y + - 0.1 + height / 2, f'{percentage:.1f}%', ha='center', va='center', fontsize=10)


plt.show()
print("fig_documentation")

In [None]:
# Stacked bar chart: quality of the installation instructions, per conference.

# Conference names in first-seen order. `counts` holds one tuple per
# (column, conference, label), so the raw list of names is full of duplicates.
confs = list(dict.fromkeys(t[1] for t in counts))
label = 'Installation Instructions'
d = {}

for conf in confs:
  # {category: count} for this conference and this label
  d[f"{conf} 2023"] = dict(t[2:4] for t in counts if t[1] == conf and t[0] == label)
# One row per conference, one column per category; absent categories count as 0
comparison = pd.DataFrame(d).fillna(0).T

ax = comparison.plot(kind='bar', stacked=True, color=["#e5c185", "#b8cdab", "#779af5"], figsize=(5,5))

# Category names in the order the stacked segments were actually drawn
labels = list(comparison.columns)

plt.ylabel('# of Papers')

# Shrink current axis's height by 10% on the bottom
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.1,
              box.width, box.height * 0.9])

# No explicit `labels=`: the bars are already labelled with the column names.
# Overriding them with the first conference's keys mislabels the legend as soon
# as that conference lacks a category or lists them in a different order.
plt.legend(title="Installation Instructions", loc='upper left', ncols=1)
plt.xticks(rotation='horizontal')
plt.tight_layout()

totals = comparison.sum(axis=1)

# Annotate the bars with the percentage values
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()

    # The bar centre sits on the integer position of its conference row;
    # round() instead of int() so float noise cannot select the neighbour.
    conference = comparison.index[round(x + width / 2)]

    # Calculate the percentage relative to the total for that conference
    total = totals[conference]
    percentage = (height / total) * 100 if total > 0 else 0

    # Display the percentage
    if height > 0:
        ax.text(x + width / 2, y + - 0.1 + height / 2, f'{percentage:.1f}%', ha='center', va='center', fontsize=10)


plt.show()
print("fig_install_instr")

In [None]:
# Stacked bar chart: how much of the experiments the released code covers, per conference.

# Conference names in first-seen order. `counts` holds one tuple per
# (column, conference, label), so the raw list of names is full of duplicates.
confs = list(dict.fromkeys(t[1] for t in counts))
label = 'Found Experiments'
d = {}

for conf in confs:
  # {category: count} for this conference and this label
  d[f"{conf} 2023"] = dict(t[2:4] for t in counts if t[1] == conf and t[0] == label)
# One row per conference, one column per category; absent categories count as 0
comparison = pd.DataFrame(d).fillna(0).T


#comparison_normalized = comparison.div(comparison.sum(axis=1), axis=0) * 100
ax = comparison.plot(kind='bar', stacked=True, color=["#e5c185", "#b8cdab", "#779af5"], figsize=(5,5))

# Category names in the order the stacked segments were actually drawn
labels = list(comparison.columns)

plt.ylabel('# of Papers')

# Shrink current axis's height by 10% on the bottom
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.1,
              box.width, box.height * 0.9])

# No explicit `labels=`: the bars are already labelled with the column names.
# Overriding them with the first conference's keys mislabels the legend as soon
# as that conference lacks a category or lists them in a different order.
plt.legend(title="Code for Experiments", loc='upper left', ncols=1)
plt.xticks(rotation='horizontal')
plt.tight_layout()

totals = comparison.sum(axis=1)

# Annotate the bars with the percentage values
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()

    # The bar centre sits on the integer position of its conference row;
    # round() instead of int() so float noise cannot select the neighbour.
    conference = comparison.index[round(x + width / 2)]

    # Calculate the percentage relative to the total for that conference
    total = totals[conference]
    percentage = (height / total) * 100 if total > 0 else 0

    # Display the percentage
    if height > 0:
        ax.text(x + width / 2, y + - 0.1 + height / 2, f'{percentage:.1f}%', ha='center', va='center', fontsize=10)


plt.show()
print("fig_experiments")

## Main analysis

### Initialization

In [None]:
# Sheet names to analyze
SHEETS = ["INLG 2023", "ACL 2023"]
# Worksheet read by add_properties() to attach per-metric properties
PROPERTIES = "Metric properties"

# List the column names as constants to prevent typos
# NOTE(review): the values presumably have to match the spreadsheet headers
# character for character, including their misspellings - confirm against the sheet.
ID = "ACL Paper ID"
METRIC_NAME = "Metric name"
NEWLY = "Newly introduced?"
APPENDIX = "Appendix"
TASK = "Updated Task"
#TASK_OLD = "Task"
#INK_TO_METRIC = "Link to the Metric Paper"
#PAPER_LINK = "Link to the Paper"
CORRELATED = "Corrleated w/ Human Evaluation?"
ANNOTATOR = "Annotator"
# METRIC_IMPL and IMPL hold the same column name
METRIC_IMPL = "Metric Implementations"
IMPL = "Metric Implementations"
RATIONALE = "Notes: Rational"
COMMENTS = "Comments"
# CONF and SURVEY are columns this notebook adds itself (sheet of origin, "#survey" flag)
CONF = "Conf"
SURVEY = "Survey"
FAMILY = "Metric Family"
DISPLAY = 'Display Name'
NORM = 'Normalized name'
TRAIN = 'Trainable?'
SRC = 'Uses source?'
REF = 'Uses ref?'
SRC_REF = 'Uses source or reference?'
ADHOC = 'Is ad-hoc?'

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.colors as mcolors
import re
# The following snippet checks for the number of annotations per one paper per annotator and reports discrepancies
def print_metric_counts(df):
  """Report papers for which annotators recorded different numbers of metrics.

  For every paper, the number of annotation rows is counted per annotator. A
  paper on which those counts differ is printed as a discrepancy; a summary
  line with the overall agreement rate follows.

  Args:
    df: Annotation DataFrame with the ID and ANNOTATOR columns.
  """
  if len(df) == 0:
    # Nothing to compare; the agreement rate would be a division by zero.
    print('No annotations found, nothing to compare.')
    return

  # One row per (paper, annotator) with the number of metrics that annotator reported
  per_annotator = df.groupby([ID, ANNOTATOR]).size().reset_index(name='count')
  agreed = disagreed = 0
  for paper_id, group in per_annotator.groupby(ID):
    if group['count'].nunique() > 1:
      disagreed += 1
      annotations_info = [
          f"{annotator} reported {count} metrics"
          for annotator, count in zip(group[ANNOTATOR], group['count'])
      ]
      print(f"For paper id {paper_id}, {'; '.join(annotations_info)}")
    else:
      agreed += 1

  total = agreed + disagreed
  if total == 0:
    # E.g. every paper id was missing, so no group was formed.
    print('No annotated papers found, nothing to compare.')
    return
  print(f'With {agreed} agreements and {disagreed} disagreements, annotators agreed in {100 * agreed / float(total)}% cases.')

# Normalize the metric string
def normalize_metric(metric):
  """Bring a reported metric name into its canonical form.

  Spaces, '-', '+' and '@' are removed (parentheses are kept deliberately), the
  name is lowercased, and the "#survey" / "(corpus)" markers are dropped. The
  result is then looked up in `names_to_split` and `metric_mapping`.

  Args:
    metric: Metric name as written by the annotator.

  Returns:
    A list of canonical names when the entry groups several metrics (a
    `names_to_split` hit), otherwise a single canonical name string.
  """
  # Raw strings keep the regex escapes from being read as (invalid) string escapes.
  metric = re.sub(r'[- +@]+', '', metric)
  metric = metric.lower()
  metric = re.sub(r'(#survey|\(corpus\))', '', metric)
  # Grouped entries take precedence over plain renames
  if metric in names_to_split:
    return names_to_split[metric]
  if metric in metric_mapping:
    return metric_mapping[metric]
  return metric

# Entries that bundle several metrics hold a list of names; give each name its own row
def split_grouped_metrics(df):
  """Return `df` with list-valued METRIC_NAME cells expanded into one row per metric."""
  one_metric_per_row = df.explode(column=METRIC_NAME)
  return one_metric_per_row

def assign_family(metric):
  """Return the family a metric belongs to.

  An entry in `metric_families` wins; otherwise any name containing 'human' is
  in the 'Human' family; any other metric forms its own family.
  """
  if metric in metric_families:
    family = metric_families[metric]
  elif 'human' in metric:
    family = 'Human'
  else:
    family = metric
  return family

# Normalize URLs
def normalize_urls(url):
  """Strip one trailing '/' or '.pdf' from a URL so equivalent links compare equal."""
  # Raw string: '\.' is a regex escape, not a valid string escape.
  return re.sub(r'(/|\.pdf)$', '', url)

def normalize_task(taskstring):
  """Turn a free-text task cell into a frozenset of canonical task names.

  The cell is split on ':', ';' and ','; each part is stripped and renamed via
  `task_mapping` when it has an entry there.

  Args:
    taskstring: Raw cell text, None, or an already normalized frozenset.

  Returns:
    frozenset of task names. None yields an empty frozenset; a frozenset is
    returned unchanged.
  """
  if taskstring is None:
    return frozenset()
  if isinstance(taskstring, frozenset):
    return taskstring

  updated = []
  for part in re.split(r'[:;,]', taskstring):
    task = part.strip()
    if task == "":
      continue
    task = task_mapping.get(task, task)
    # A mapping to "" means "discard this entry"; do not keep the empty string as a task.
    if task != "":
      updated.append(task)

  return frozenset(updated)

### Data loading
Open each worksheet, convert it into a DataFrame, and normalize the data.

In [None]:
def sheet2df(sheet, spreadsheet_url='https://docs.google.com/spreadsheets/d/10Yvxn7sb78cZmpmM0RYoAuVHoaP_KiDc970vg9pSa2c/'):
  """Load one worksheet of a Google spreadsheet into a DataFrame.

  Uses the module-level gspread client `gc`.

  Args:
    sheet: Name of the worksheet (tab) to read.
    spreadsheet_url: URL of the spreadsheet; defaults to the project spreadsheet.

  Returns:
    A DataFrame whose column names are the first row of the worksheet and whose
    records are the remaining rows. A worksheet with no rows at all yields an
    empty DataFrame.
  """
  worksheet = gc.open_by_url(spreadsheet_url).worksheet(sheet)

  # get_all_values gives a list of rows.
  rows = worksheet.get_all_values()
  if not rows:
    # Without a header row there is nothing to name the columns after.
    return pd.DataFrame()

  # First row is the header, everything after it is data.
  return pd.DataFrame.from_records(rows[1:], columns=rows[0])

In [None]:
# Loading all the annotation data
dfs = []
for sheet in SHEETS:
  df = sheet2df(sheet)
  # Sometimes there will be blank rows with "Updated Task"
  # .copy() makes the filtered frame independent, so the column assignment
  # below does not write into a slice of the original frame.
  df = df[df[ID] != ""].copy()
  # Remember which sheet (conference) each row came from
  df[CONF] = sheet
  df = df.reset_index(drop=True)
  dfs.append(df)
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Paper URLs are kept on the two paper-list sheets; stack their id/link columns into one lookup table
link_columns = ['ACL ID', 'Paper Link']
papers_list = pd.concat(
    [sheet2df(paper_sheet)[link_columns] for paper_sheet in ('Generation', 'INLG 2023 Papers')],
    ignore_index=True,
)

def id2link(paper_id):
  """Return the 'Paper Link' recorded for `paper_id` in `papers_list`.

  Best effort: when the id is unknown (or the lookup fails for another ordinary
  reason) the id itself is returned, so callers always get something printable.
  """
  try:
    return list(papers_list[papers_list['ACL ID'] == paper_id]['Paper Link'])[0]
  except Exception:
    # Deliberately broad, but unlike a bare `except:` this lets
    # KeyboardInterrupt / SystemExit through.
    return paper_id

### Checksums & normalizations

#### Metric name normalization

In [None]:
# Entries that report several metrics in one cell, mapped to the list of
# individual metric names. Keys are in the form normalize_metric() produces
# before its lookup (lowercased; spaces, '-', '+' and '@' removed).
# NOTE(review): misspelled keys such as "simplicitiy" presumably mirror the
# annotation sheet and must stay as they are to match - confirm.
names_to_split = {
    "accuracy/p/r/f1": ["accuracy", "precision", "recall", "f1"],
    "bleu{1,2}": ["bleu1", "bleu2"],
    "distinctngrams(dist{1,2,3})": ['distinctunigrams', 'distinctbigrams', 'distincttrigrams'],
    "dist{1,2,3}": ['distinctunigrams', 'distinctbigrams', 'distincttrigrams'],
    "repnmetricsforn=2,3,4": ['bigramrepetition', 'trigramrepetition', '4gramrepetition'],
    "rouge{1,2,l}": ["rouge1", "rouge2", "rougel"],
    "rouge{1,2}": ["rouge1", "rouge2"],
    "human(fluency,faithfulness,coverage,repetition)": ["human(fluency)", "human(faithfulness)", "human(coverage)", "human(repetition)"],
    "human(fluency,relatedness,correctness,diversity)": ["human(fluency)", "human(relatedness)", "human(correctness)", "human(diversity)"],
    "human(rationality,fluency)": ["human(rationality)", "human(fluency)"],
    "human(simplicitiy,correctness,fluency)": ["human(simplicity)", "human(correctness)", "human(fluency)"]
}

In [None]:
# Renames applied by normalize_metric() after its own cleanup: each key is a
# cleaned-up spelling found in the annotations, each value the canonical name.
# Keys are lowercased with spaces, '-', '+' and '@' already removed.
metric_mapping = {
    'harmonicmean(hmean)between(1−pbleu)andbleu': 'harmonicmean(pbleubleu)',
    'harmonicmeanof1pbleuandbleu': 'harmonicmean(pbleubleu)',
    'hmeanbetween(1pbleu)andbleu':'harmonicmean(pbleubleu)',
    'harmonicmeanofbleu4andstyleaccuracy': 'harmonicmean(bleu4styleaccuracy)',
    'pairwisebleu': 'pbleu',
    'pbleu(selfbleu)': 'pbleu',
    'em': 'exactmatch',
    'exactmatch(em)': 'exactmatch',
    'inform(rate)': 'inform',
    'success(rate)': 'success',
    'combinescore(informandrate)': 'combinedscore(informandrate)',
    'bleu(4)': 'bleu4',
    'accuracy(?)': 'accuracy',
    'macroaveragedf1score(f1)': 'f1',
    'sensitivity': 'demetrbenchmarksensitivityscores',
    'bleurtbase': 'bleurt',
    'allmpnetbasev2': 'mpnetcosinesimilarity',
    'negmpnet': 'negmpnetcosinesimilarity',
    'distinct1': 'distinctunigrams',
    'distinct2': 'distinctbigrams',
    'distinct4': 'distinct4grams',
    'dist1': 'distinctunigrams',
    'dist2': 'distinctbigrams',
    'dist3': 'distincttrigrams',
    'distinct3': 'distincttrigrams',
    "distn(4?)": 'distinct4grams',
    'bleuscore': 'bleu',
    'corpusbleu': 'bleu',
    # NOTE(review): confirm 'rquge' is a misspelling of ROUGE and not a separate metric.
    'rquge': 'rouge',
    'bertscorefscore': 'bertscoref1',
    'bertscorep': 'bertscoreprecision',
    'bertscorer': 'bertscorerecall',
    'beatf1': 'bertscoref1',
    'bertscorefmeasure': 'bertscoref1',
    'bleurtscore': 'bleurt',
    'human(creativeness)': 'human(creativity)',
    'human(informativity)': 'human(informativeness)',
    'grammar(gram)': 'grammaticality',
    'human(intrestingness)': 'human(interesting)',
    'human(overalquality/preference)': 'human(overall)',
    'humanfluency': 'human(fluency)',
    'humaninformativeness': 'human(informativeness)',
    'lr(lexicalrepetition)': 'lexicalrepetition',
    'mauvescore': 'mauve',
    # NOTE(review): this folds ROUGE-2 into plain 'rouge', while names_to_split keeps 'rouge2' separate - confirm.
    'rouge2(r2)': 'rouge',
    # NOTE(review): distance is mapped onto similarity - confirm the two are meant to be merged.
    'cosinedistance': 'cosinesimilarity',
    'auroc': 'auc',
    'detoxification': 'detoxify',
    'fleschkincaidgradelevelreadability(fkgl)': 'fleschkincaidgradelevel',
    'fkgl': 'fleschkincaidgradelevel',
    'fleschkincaidgradelevel(fkgl)': 'fleschkincaidgradelevel',
    'repeatedtrigrams': 'trigramrepetition'
}

In [None]:
# Renames applied by normalize_task() to each stripped part of a task cell
# (the cell is split on ':', ';' and ','). The lookup is an exact,
# case-sensitive match on the stripped text.
task_mapping = {
    # NOTE(review): the unbalanced ')' presumably comes from a cell that was split at a separator - confirm.
    "natural language entailment)": "natural language inference",
    "data-text generation": "data-to-text generation",
    "data-to-text": "data-to-text generation",
    "dialogue generation": "dialogue turn generation",
    "dialogue response": "dialogue turn generation",
    "dialouge": "dialogue turn generation",
    "open-ended dialogue": "dialogue turn generation",
    "task-oriented dialouge": "dialogue turn generation",
    "paraphrase generation": "paraphrasing / lossless simplification",
    "paraphrasing/lossless simplification": "paraphrasing / lossless simplification",
    "text simplification": "compression / lossy simplification",
    "question-generation": "question generation",
    "quora question pairs": "question answering",
    "and question answering": "question answering",
    "simile generation": "simile generation (text-to-text)",
    "story-generation": "story generation",
    "text summarization": "summarisation (text-to-text)",
    "summarisation": "summarisation (text-to-text)",
    "summarization": "summarisation (text-to-text)",
    "summarization (text-to-text)": "summarisation (text-to-text)",
    # The key contains a literal line break inside the cell text
    "evaluate semantic diversity between two natural language \ngeneration": "evaluate semantic diversity between two natural language generation",
    # The column header itself is mapped to the empty string
    "Updated Task": "",
    "translation": "machine translation",
    "surface realisation (slr to text)": "surface realisation (SLR to text)"
}

In [None]:
# Attach the per-metric properties from the Properties sheet
def add_properties(df):
  """Left-join the "Metric properties" sheet onto `df`.

  The sheet lists, per metric, the ids of the papers using it as one
  whitespace-separated string in 'Paper IDs'. That string is split into one row
  per paper id, and the result is matched to `df` on (metric name, paper id).
  Rows of `df` without a match are kept.
  """
  sheet = sheet2df(PROPERTIES)
  per_paper = sheet.assign(**{ID: sheet['Paper IDs'].map(lambda ids: ids.split())}).explode(ID)
  return pd.merge(df, per_paper, how='left', on=[METRIC_NAME, ID])

In [None]:
# Normalize
# The survey flag must be taken first: normalize_metric() strips the "#survey" marker.
df[SURVEY] = df[METRIC_NAME].str.contains("#survey")
df[METRIC_NAME] = df[METRIC_NAME].apply(normalize_metric)
df[TASK] = df[TASK].apply(normalize_task)
# Grouped entries were normalized to lists of names; give each name its own row.
df = split_grouped_metrics(df)

df = add_properties(df)
# Rows without a family after the join are labelled "Human".
# NOTE(review): this also covers any automatic metric missing from the properties sheet - confirm that is intended.
df[FAMILY] = df[FAMILY].fillna("Human")
df = df.fillna('')

# Leaving the surveys out of the analysis
df = df[df[SURVEY] == False]

#### Basic paper counts

In [None]:
# Fold the small "previous human eval" category into the quantitative-correlation one
n_prev_human_eval = len(df[df[CORRELATED] == 'Correlation with previous human eval'])
print(f"There are {n_prev_human_eval} metrics with a correlation to previous human eval")
df = df.replace("Correlation with previous human eval", "Human evaluation, quantitative correlation")


# Papers whose only row has an empty metric name reported no metrics at all
has_metric = df[METRIC_NAME] != ""
no_metrics = df[~has_metric]
num_no_metrics = no_metrics[ID].nunique(dropna=False)
df_all = df[has_metric]
num_with_all = df_all[ID].nunique(dropna=False)
print(f"There are {num_no_metrics} papers with no metrics, {num_with_all} papers remain for analysis.")


# Human metrics: every metric whose name contains 'human'
n_papers_total = df[ID].nunique(dropna=False)
hum_df = df[df[METRIC_NAME].str.contains('human')]
papers_hum = hum_df[ID].nunique(dropna=False)
num_h = len(hum_df)
dist_h = hum_df[METRIC_NAME].nunique(dropna=False)
print(f"{papers_hum} out of {n_papers_total} papers use human evaluation. In total, there were {num_h} instances of human metrics used, {dist_h} of those are unique.")

# Automatic metrics: everything else among the rows that do name a metric
auto_df = df_all[~df_all[METRIC_NAME].str.contains('human')]
papers_auto = auto_df[ID].nunique(dropna=False)
num_a = len(auto_df)
dist_a = auto_df[METRIC_NAME].nunique(dropna=False)
print(f"{papers_auto} out of {n_papers_total} papers use automatic evaluation. In total, there were {num_a} instances of automatic metrics used, {dist_a} of those are unique.")

num_fam = auto_df[FAMILY].nunique(dropna=False)
print(f"There are {num_fam} metric families.")

In [None]:
# Papers from the paper lists that have no metric row in the annotations
paper_ids_with_metrics = df_all[ID].unique().tolist()
is_annotated = papers_list["ACL ID"].isin(paper_ids_with_metrics)
missing_papers_df = papers_list.loc[~is_annotated]
missing_papers_df.head(10)

In [None]:
# Papers that appear among both the human-metric rows and the automatic-metric rows
hum_paper_ids = hum_df[ID].unique().tolist()
auto_paper_ids = auto_df[ID].unique().tolist()
papers_with_both = set(auto_paper_ids) & set(hum_paper_ids)
print(f"Number of papers that have both automatic and human evaluations: {len(papers_with_both)}")

#### Metric properties sheet creation

In [None]:
# For debugging the metric families: group every metric occurrence under its family
by_family = {}
for record in df[[ID, NEWLY, METRIC_NAME,FAMILY]].to_dict('records'):
  occurrence = {
      METRIC_NAME: record[METRIC_NAME],
      ID: record[ID],
      'URL': id2link(record[ID]),
      NEWLY: record[NEWLY],
  }
  by_family.setdefault(record[FAMILY], []).append(occurrence)

In [None]:
# This is the code that produced the base of the "Metric properties" sheet
import csv

# Row layout: [metric name, family, None, None, None, None, occurrence count, paper ids, paper URLs]
data_rows = []

for key, values in by_family.items():
  # metric name -> index of its row in data_rows (reset for every family)
  metric_names = {}
  # We only care about automatic metrics in this part
  if key == "Human":
    continue
  for value in values:
    # Multiple occurrences of the same metric: just add IDs & URLs to the 1st mention
    if value[METRIC_NAME] in metric_names:
      existing_row = data_rows[metric_names[value[METRIC_NAME]]]
      existing_row[-3] += 1
      existing_row[-2] += ' ' + value[ID]
      existing_row[-1] += ' ' + value['URL']
      continue
    # 1st occurrence: create a new row
    row = [value[METRIC_NAME], key, None, None, None, None, 1, value[ID], value['URL']]
    metric_names[value[METRIC_NAME]] = len(data_rows)  # store ref to row where it's introduced
    data_rows.append(row)

# newline='' is what the csv module expects of files it writes (it emits its own
# line endings); the explicit encoding keeps non-ASCII metric names writable on
# platforms whose default encoding is not UTF-8.
with open('dict.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file, delimiter='\t')
    writer.writerows(data_rows)

#### Analyzing tasks

Compared to the originally defined tasks, we needed to introduce more. This code was used to analyze and unify them.

In [None]:
# Union of all task labels over every distinct task collection in the data
uni = set().union(*df[TASK].unique())
uni

In [None]:
# Task labels of the original annotation scheme. Any label found in the data
# that is not in this set is treated as a newly introduced task below.
defined_tasks = {'data-to-text generation',
'dialogue turn generation',
'content selection/determination',
'content ordering/structuring',
'deep generation (DLR to text)',
'aggregation',
'lexicalisation',
'referring expression generation',
'surface realisation (SLR to text)',
'feature-controlled generation',
'question generation',
'question answering',
'paraphrasing / lossless simplification',
'machine translation',
'summarisation (text-to-text)',
'compression / lossy simplification',
'end-to-end text generation',
'multiple (list all)',
'other (please specify)'}

In [None]:
# Task labels that occur in the data but are not part of the original scheme
other = uni - defined_tasks

In [None]:
# One row per (paper, task): unpack the task collections and keep each pair once
task_df = df.copy()
task_df[TASK] = task_df[TASK].apply(lambda tasks: list(tasks))
task_df = task_df.explode(TASK).drop_duplicates(subset=[ID, TASK])

# Rows whose task is one of the newly introduced labels, and how often each occurs
other_tasks = task_df.loc[task_df[TASK].isin(other)]
other_tasks[TASK].value_counts()

In [None]:
# Inspect the papers annotated with the new "open-ended text generation" task
other_tasks.loc[
    other_tasks[TASK] == "open-ended text generation (LM sampling)",
    ['Annotator_x', ID, TASK, 'Link to the Paper'],
]

In [None]:
# Number of papers per task (task_df holds one row per paper/task pair)
task_df[TASK].value_counts()

### Stats computation

#### Basic count Venn diagrams

In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

fig, axs = plt.subplots(1, 2, figsize=(10, 5), facecolor='none')

# ---------- ACL 2023: papers with human vs. automatic evaluation ----------
acl_hum_ids = set(hum_df.loc[hum_df[CONF] == "ACL 2023", ID].unique())
acl_auto_ids = set(auto_df.loc[auto_df[CONF] == "ACL 2023", ID].unique())
venn_acl = venn2(
    subsets=(acl_hum_ids, acl_auto_ids),
    set_labels=["Human Evaluation", "Automatic Evaluation"],
    set_colors=("#A682FF", "#55C1FF"),
    ax=axs[0],
)

# Dashed black outline; entries of .patches that are falsy (missing regions) are skipped
for circle in filter(None, venn_acl.patches):
    circle.set_edgecolor('black')
    circle.set_linestyle('--')

# Manually placed set labels and title
venn_acl.set_labels[0].set(x=-0.2, y=-0.6)
venn_acl.set_labels[1].set(x=0.2, y=-0.6)
axs[0].set_title("ACL 2023", x=0.5, y=1)

# ---------- INLG 2023: same diagram, different colours ----------
inlg_hum_ids = set(hum_df.loc[hum_df[CONF] == "INLG 2023", ID].unique())
inlg_auto_ids = set(auto_df.loc[auto_df[CONF] == "INLG 2023", ID].unique())
venn_inlg = venn2(
    subsets=(inlg_hum_ids, inlg_auto_ids),
    set_labels=["Human Evaluation", "Automatic Evaluation"],
    set_colors=("#D00000", "#FFBA08"),
    ax=axs[1],
)

for circle in filter(None, venn_inlg.patches):
    circle.set_edgecolor('black')
    circle.set_linestyle('--')
axs[1].grid(True, which='both', linestyle='--', linewidth=0.5)
axs[1].patch.set_alpha(0)

# Manually placed set labels and title (offsets differ slightly from the ACL panel)
venn_inlg.set_labels[0].set(x=-0.2, y=-0.62)
venn_inlg.set_labels[1].set(x=0.16, y=-0.62)
axs[1].set_title("INLG 2023", x=0.5, y=1.04)

# Adjust layout and display the plot
plt.tight_layout()
plt.show()


#### Metric use count

In [None]:
# Horizontal bar chart: number of automatic-metric rows per metric family
auto_df[FAMILY].value_counts().plot.barh(figsize=(8, 10))

#### Metric family per venue (high-level)

In [None]:
F = "MF2"  # column whose values are counted and plotted by the function below


# Showing metric families as a grouped bar chart, only the top metrics (ranked by df1) shown
def plot_bar_chart_overlay_sorted_with_values(df1, df2, ax, title='', color1='blue', color2='red',
                                              label1='INLG 2023', label2='ACL 2023', top_n=10):
    """Draw a grouped bar chart comparing the distribution of column ``F`` in two frames.

    The ``top_n`` most frequent values of ``df1[F]`` get one bar pair each; all
    remaining values are summed into a final 'Other' pair. Bar heights are
    percentages of the respective frame's total count, and each bar is annotated
    with its height.

    Parameters:
    df1, df2 (pd.DataFrame): frames containing column ``F``; ``df1`` defines the ranking.
    ax (matplotlib Axes): axes to draw on.
    title (str): axes title.
    color1, color2: bar colours for ``df1`` and ``df2``.
    label1, label2 (str): legend entries for ``df1`` and ``df2``.
    top_n (int): number of individual values shown before 'Other'.

    Note: calls ``sns.set``, which changes the global plotting style.
    """
    # Use seaborn style
    sns.set(style="whitegrid")

    # Union of the values of column F seen in either frame
    unique_values = np.union1d(df1[F].unique(), df2[F].unique())

    # Occurrences of each value in both frames (0 where a value is missing in one of them)
    counts_df1 = df1[F].value_counts().reindex(unique_values, fill_value=0)
    counts_df2 = df2[F].value_counts().reindex(unique_values, fill_value=0)

    # Rank by the counts in df1 and keep the top_n values
    top_values = counts_df1.sort_values(ascending=False).head(top_n).index

    # Everything outside the top_n is summed into an 'Other' category
    is_top1 = counts_df1.index.isin(top_values)
    is_top2 = counts_df2.index.isin(top_values)
    other_count_df1 = counts_df1[~is_top1].sum()
    other_count_df2 = counts_df2[~is_top2].sum()

    counts_df1_sorted = pd.concat([counts_df1.reindex(top_values), pd.Series({'Other': other_count_df1})])
    counts_df2_sorted = pd.concat([counts_df2.reindex(top_values), pd.Series({'Other': other_count_df2})])

    # Convert counts to percentages of each frame's total
    values_df1 = (counts_df1_sorted / counts_df1.sum() * 100).values
    values_df2 = (counts_df2_sorted / counts_df2.sum() * 100).values
    labels = top_values.append(pd.Index(['Other']))

    x = np.arange(len(labels))  # the label locations
    width = 0.35  # the width of the bars

    # One bar pair per label: df1 to the left of the tick, df2 to the right
    rects1 = ax.bar(x - width/2, values_df1, width, label=label1, color=color1, alpha=1)
    rects2 = ax.bar(x + width/2, values_df2, width, label=label2, color=color2, alpha=1)

    ax.set_ylabel('Percentage (%)')
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=90)
    ax.legend(loc='upper center')

    def autolabel(rects):
        """Attach a text label above each bar in *rects*, displaying its height."""
        for rect in rects:
            height = rect.get_height()
            ax.annotate(f'{height:.1f}%',
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(0, 3),  # 3 points vertical offset
                        textcoords="offset points",
                        ha='center', va='bottom',
                        size=8)

    autolabel(rects1)
    autolabel(rects2)
# Automatic-metric rows per venue, reduced to one row per (paper, metric family).
# df1 (INLG 2023) determines the ranking of the bars in the chart.
df1 = auto_df[auto_df[CONF] == "INLG 2023"].drop_duplicates(subset=[ID, FAMILY])
df2 = auto_df[auto_df[CONF] == "ACL 2023"].drop_duplicates(subset=[ID, FAMILY])

fig, ax = plt.subplots(figsize=(8, 8))
plot_bar_chart_overlay_sorted_with_values(df1, df2, ax, title='Metric families usage across venues', color1='#bbe6ff',color2='#ec9999')
plt.show()


#### Metric per venue (top 10 + rest)

In [None]:
from collections import Counter

# One row per (paper, metric family) for each venue
df1 = auto_df[auto_df[CONF] == 'INLG 2023'].drop_duplicates(subset=[ID, FAMILY])
df2 = auto_df[auto_df[CONF] == 'ACL 2023'].drop_duplicates(subset=[ID, FAMILY])

# (family, high-level family) -> number of papers, per venue
df1_dict = df1[[FAMILY, 'MF2']].value_counts().to_dict()
df2_dict = df2[[FAMILY, 'MF2']].value_counts().to_dict()

# make the sorting normalized: each venue contributes its counts divided by its number of rows,
# so the larger venue does not dominate the ranking
summed = dict(Counter((df1[[FAMILY, 'MF2']].value_counts() / len(df1)).to_dict()) + Counter((df2[[FAMILY, 'MF2']].value_counts() / len(df2)).to_dict()))

# Total normalized frequency per high-level family (second element of each key).
# The values are summed with a generator, NOT a set comprehension: a set would collapse
# families that happen to have the same frequency and undercount their high-level family.
hlf_counts = {hlf: sum(v for k, v in summed.items() if k[1] == hlf) for hlf in {k[1] for k in summed}}
# high-level families sorted by total frequency
hlf_list = sorted(hlf_counts.keys(), key=lambda k: hlf_counts[k], reverse=True)

In [None]:
# hierarchical: hlf -> family
def get_hier_data(df_dict):
  """Arrange per-family counts into stackable layers, one column per high-level family.

  Reads the notebook globals ``summed`` (keys are (family, hlf) pairs) and
  ``hlf_list`` (column order). Layer ``i`` holds, for every hlf in ``hlf_list``,
  the count of that hlf's i-th most frequent family according to ``df_dict``
  (0 if the family is absent from ``df_dict`` or the hlf has fewer families).

  Returns:
  (vals, labels): two lists of layers; labels are '<hlf>-<family>' or '' for padding.
  """
  hier = {}
  for key in summed:
    family, hlf = key[0], key[1]
    hier.setdefault(hlf, {})[family] = df_dict.get(key, 0)

  depth = max(map(len, hier.values()))
  vals = [[] for _ in range(depth)]
  labels = [[] for _ in range(depth)]

  for hlf in hlf_list:
    ranked = sorted(hier[hlf].items(), key=lambda item: item[1], reverse=True)
    ranked.extend([('', 0)] * (depth - len(ranked)))  # pad so every hlf has `depth` layers
    for level, (family, count) in enumerate(ranked):
      labels[level].append(f'{hlf}-{family}' if family else '')
      vals[level].append(count)
  return vals, labels

# (vals, labels) layer pairs for each venue (df1_dict / df2_dict come from the previous cell)
df1_hier = get_hier_data(df1_dict)
df2_hier = get_hier_data(df2_dict)

In [None]:
# top-k + others split by hlf
limit = 10  # number of (family, hlf) pairs shown as individual bars
# Colours indexed by position in hlf_list; the list has 10 entries, so a colour lookup
# for an 11th high-level family would raise an IndexError.
mf_colors = ["#c7522a", "#e5c185", "#fbf2c4", "#b8cdab", "#74a892", "#008585", "#4c9eb3", "#779af5", "#a59cff", "#dbcdf0"]
# The `limit` pairs with the highest summed normalized frequency across both venues
topk_items = [k for k, _ in sorted(summed.items(), key=lambda i: i[1], reverse=True)[:limit]]

def get_topk_data(df_dict, colorscheme):
  """Build stackable layers for the top-k bar chart plus one stacked 'Other' bar.

  Reads the notebook globals ``topk_items`` ((family, hlf) pairs shown as
  individual bars) and ``hlf_list`` (position of each hlf in ``colorscheme``).

  Every returned layer has ``len(topk_items) + 1`` entries. Layer 0 holds the
  counts of the top-k items followed by the largest 'Other <hlf>' segment; each
  further layer is all zeros except for its own 'Other <hlf>' segment in the
  last position. Segments are ordered by descending count.

  If no entry of ``df_dict`` falls outside the top-k, a single layer with an
  empty (zero) 'Other' slot is returned, so the width still matches the chart.
  (Previously this case returned empty lists and silently dropped the top-k bars.)

  Parameters:
  df_dict (dict): (family, hlf) -> count.
  colorscheme (sequence): one colour per entry of ``hlf_list``.

  Returns:
  (vals, labels, colors): three parallel lists of layers.
  """
  n_top = len(topk_items)
  vals = [[df_dict.get(k, 0) for k in topk_items]]
  labels = [[k[0] for k in topk_items]]
  colors = [[colorscheme[hlf_list.index(k[1])] for k in topk_items]]

  # Sum everything outside the top-k per high-level family
  other_counts = {}
  for k, v in df_dict.items():
    if k in topk_items:
      continue
    other_counts[k[1]] = other_counts.get(k[1], 0) + v

  ranked = sorted(other_counts.items(), key=lambda i: i[1], reverse=True)
  if not ranked:
    # Nothing to put into 'Other': keep the top-k layer and pad the last slot
    vals[0].append(0)
    labels[0].append('')
    colors[0].append("#000000")
    return vals, labels, colors

  for level, (hlf, count) in enumerate(ranked):
    if level > 0:
      # every further segment gets its own layer, empty over the top-k bars
      vals.append([0] * n_top)
      labels.append([''] * n_top)
      colors.append(["#000000"] * n_top)
    vals[-1].append(count)
    labels[-1].append("Other " + hlf)
    colors[-1].append(colorscheme[hlf_list.index(hlf)])

  return vals, labels, colors

# (vals, labels, colors) layers per venue, both using the same colour scheme
df1_topk = get_topk_data(df1_dict, mf_colors)
df2_topk = get_topk_data(df2_dict, mf_colors)


In [None]:
import pandas as pd
import copy
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.patches import Patch

# Global font size; rcParams changes persist for every later figure in this kernel
plt.rcParams.update({'font.size': 12})

# Create a figure with a single Axes (named `axes`; create_barplot below draws on it)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(6, 7))

def create_barplot(hlf_list, data, labels, colors, dflen, hatch, pos):
  """Draw one venue's stacked bars onto the global ``axes``.

  hlf_list: sequence whose length is the number of bar positions
            (called with ``bars_list`` below; only ``len()`` is used).
  data, labels, colors: parallel lists of layers, stacked bottom-up.
  dflen: divisor that turns the counts into percentages.
  hatch: hatch pattern for the bars (None for solid).
  pos: 0 or 1; shifts the bars by 0.3 so the two venues sit side by side.
  """

  bottoms = np.zeros(len(hlf_list))
  for ds, ls, cs in zip(data, labels, colors):
    ds = np.array(ds) / dflen * 100  # percentage
    axes.bar(np.arange(len(hlf_list)) - 0.15 + 0.3 * pos,
             ds,
             color=cs,
             # position=pos,
             width=0.3,
             hatch=hatch,
             edgecolor='white',
             label=ls,
             bottom=bottoms,
             alpha=0.8,
             zorder=10)
    bottoms += ds

  # Total height written above each stack. The text x offsets (-0.2 / +0.2) are wider
  # than the bar offsets (-0.15 / +0.15) - presumably to keep neighbouring numbers apart.
  for x, y, s in zip(np.arange(len(hlf_list)) - 0.2 + 0.4 * pos,
                     bottoms + 0.1, bottoms):
    axes.text(x, y, f'{s:.1f}', fontsize='small', horizontalalignment='center', zorder=12)


# x positions: one per top-k family plus a final stacked 'Other' bar
bars_list = [i[0] for i in topk_items] + ['Other']

# df1 = INLG 2023 (solid bars), df2 = ACL 2023 (hatched bars); see the legend below
create_barplot(bars_list, df1_topk[0], df1_topk[1], df1_topk[2], len(df1), None, 0)
create_barplot(bars_list, df2_topk[0], df2_topk[1], df2_topk[2], len(df2), 'xxxxx', 1)

# Setting various params
axes.grid(axis='y', zorder=0)
axes.set_xlabel("Metric families")
axes.set_ylabel('% Papers Using')
axes.yaxis.set_label_coords(-0.01, -0.18)  # move y axis label down to save space
axes.set_title('Metric family use per venue')
plt.xticks(range(len(bars_list)), bars_list, rotation='vertical')  # x axis labels: metric families


# Hand-built legend: two grey patches explain the venue encoding (solid vs. hatched),
# followed by one colour patch per high-level family
legend = [Patch(facecolor='#808080', hatch=None, label='INLG 2023'),
          Patch(facecolor='#808080', hatch='xxxxx', edgecolor='white', label='ACL 2023')]
legend += [Patch(facecolor=c, label=hlf) for hlf, c in zip(hlf_list, mf_colors)]
plt.legend(handles=legend, loc='upper left', ncol=2)


plt.tight_layout()
plt.show()



#### Correlations

In [None]:
# Per venue: how often each value of the CORRELATED column occurs,
# counting every (paper, value) pair once
inlg_corr = auto_df[auto_df[CONF] == "INLG 2023"].drop_duplicates(subset=[ID, CORRELATED])
inlg_corr_counts = inlg_corr[CORRELATED].value_counts()


acl_corr = auto_df[auto_df[CONF] == "ACL 2023"].drop_duplicates(subset=[ID, CORRELATED])
acl_corr_counts = acl_corr[CORRELATED].value_counts()

# Rows = venues, columns = CORRELATED values; each row is normalized to sum to 100 %
comparison = pd.DataFrame({'INLG 2023': inlg_corr_counts, 'ACL 2023': acl_corr_counts}).fillna(0).T
comparison_normalized = comparison.div(comparison.sum(axis=1), axis=0) * 100
palette = sns.color_palette(["#c7522a","#e5c185","#fbf2c4","#b8cdab","#74a892","#008585","#4c9eb3","#779af5","#a59cff","#dbcdf0"], n_colors=4)
ax = comparison_normalized.plot(kind='bar', stacked=True, color=palette, ylim=(0,100))

# NOTE(review): these legend labels replace the real column names purely by position,
# so they are only correct if comparison_normalized's columns come in exactly this order.
# The heatmap cell at the end of this section lists the same categories in a different
# order - confirm which one matches the data.
labels = [
    'No Correlation',
    'No Human Evaluation',
    'Qualitative Correlation',
    'Quantitative Correlation',
          ]

plt.ylabel('Percentage')

# Shrink current axis's height by 10% on the bottom
# NOTE(review): plt.tight_layout() is called further down and may override this
# manual position - confirm the shrink still has an effect.
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.1,
                 box.width, box.height * 0.9])

#plt.title('Relative Makeup of Correlation with Human Evaluation', loc='center')
plt.legend(title='Correlated with Human Evaluation?', bbox_to_anchor=(0.5, -0.1), loc='upper center',
           labels=labels,
           ncols=2)
plt.xticks(rotation='horizontal')
plt.tight_layout()

# Annotate percentages on the bars (segments of 5 % or less stay unlabelled)
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    if height > 5:
      ax.text(x + width / 2, y + height / 2, f'{height:.1f}%', ha='center', va='center')

plt.show()

#### Top 20 metrics per conference

In [None]:
# 20 most frequently used automatic metrics at ACL 2023
acl_metric_rows = auto_df.loc[auto_df[CONF] == "ACL 2023", METRIC_NAME]
top_metrics = acl_metric_rows.value_counts().nlargest(20)

top_metrics.plot.barh()

In [None]:
# 20 most frequently used automatic metrics at INLG 2023
inlg_metric_rows = auto_df.loc[auto_df[CONF] == "INLG 2023", METRIC_NAME]
top_metrics = inlg_metric_rows.value_counts().nlargest(20)

top_metrics.plot.barh()

#### Appendix-only metrics

In [None]:
# Metric rows whose APPENDIX column is 'Yes', compared against all metric rows per venue.
# Note: df_all is used here, so human metrics are included in both numerator and denominator.
app = df_all[df_all[APPENDIX] == 'Yes']
all_inlg = len(df_all[df_all[CONF] == "INLG 2023"])
all_acl = len(df_all[df_all[CONF] == "ACL 2023"])
app_inlg = len(app[app[CONF] == "INLG 2023"])
app_acl = len(app[app[CONF] == "ACL 2023"])
# The percentages raise ZeroDivisionError if a venue has no metric rows at all
print(f'{app_inlg} metrics ({app_inlg / all_inlg * 100:.2f} %) were reported in the Appendix at INLG 2023.')
print(f'{app_acl} metrics ({app_acl / all_acl * 100:.2f} %) were reported in the Appendix at ACL 2023')

In [None]:
# Show the appendix-only metric rows (renders the whole frame)
app

#### Implementation details provided?

In [None]:
# We noted down that there were two papers (P110 - 8 metrics, P312 - 5 metrics) that used multiple implementations
# That is not enough for a graph, so we will merge them to "Implementation details provided"

# This overwrites the IMPL column of auto_df in place, so every later cell sees the merged category
auto_df[IMPL] = auto_df[IMPL].replace("Multiple implementations used", "Implementation details provided")

# Counted per metric row (not per paper): the de-duplication is deliberately commented out
inlg_impl = auto_df[auto_df[CONF] == "INLG 2023"] #.drop_duplicates(subset=[ID, FAMILY])
inlg_impl_counts = inlg_impl[IMPL].value_counts()


acl_impl = auto_df[auto_df[CONF] == "ACL 2023"] #.drop_duplicates(subset=[ID, FAMILY])
acl_impl_counts = acl_impl[IMPL].value_counts()



# Rows = venues, columns = IMPL values; each row is normalized to sum to 100 %
comparison = pd.DataFrame({'INLG 2023': inlg_impl_counts, 'ACL 2023': acl_impl_counts}).fillna(0).T
comparison_normalized = comparison.div(comparison.sum(axis=1), axis=0) * 100
ax = comparison_normalized.plot(kind='bar', stacked=True, ylim=(0,100))

# NOTE(review): these labels replace the real IMPL values purely by position; this assumes
# exactly two columns with the "no details" category first - confirm against the data.
labels = ["No", "Yes"]

plt.ylabel('Percentage')


#plt.title('Relative Makeup of Correlation with Human Evaluation', loc='center')
plt.legend(title='Did the authors provide implementation details in the paper?', bbox_to_anchor=(0.5, -0.1), loc='upper center', ncols=2, labels=labels)
plt.xticks(rotation='horizontal')
plt.tight_layout()

# Annotate percentages on the bars (segments of 5 % or less stay unlabelled)
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    if height > 5:
      ax.text(x + width / 2, y + height / 2, f'{height:.1f}%', ha='center', va='center')

plt.show()

In [None]:
# Implementation details for the two most common families, counting each
# (paper, family, IMPL value) combination once
all_impl = auto_df
bleu_impl = auto_df[auto_df[FAMILY] == "BLEU"].drop_duplicates(subset=[ID, FAMILY, IMPL])
rouge_impl = auto_df[auto_df[FAMILY] == "ROUGE"].drop_duplicates(subset=[ID, FAMILY, IMPL])
all_impl_counts = all_impl[IMPL].value_counts()  # overall distribution, kept for reference (not plotted)
bleu_impl_counts = bleu_impl[IMPL].value_counts()
rouge_impl_counts = rouge_impl[IMPL].value_counts()

# Rows = metric families, columns = IMPL values; each row is normalized to sum to 100 %
comparison = pd.DataFrame({'BLEU': bleu_impl_counts, 'ROUGE': rouge_impl_counts}).fillna(0).T
comparison_normalized = comparison.div(comparison.sum(axis=1), axis=0) * 100
ax = comparison_normalized.plot(kind='bar', stacked=True, ylim=(0,100))

labels = []

plt.ylabel('Percentage')


# The legend entries are the IMPL values themselves, so the title must describe them.
# (It used to read 'Type of Metric Used', copied from the source/reference plot.)
plt.legend(title='Implementation details provided?', bbox_to_anchor=(0.5, -0.1), loc='upper center', ncols=2)
plt.xticks(rotation='horizontal')
plt.tight_layout()

# Annotate percentages on the bars (segments of 5 % or less stay unlabelled)
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    if height > 5:
      ax.text(x + width / 2, y + height / 2, f'{height:.1f}%', ha='center', va='center')

plt.show()

#### BLEU & ROUGE variants

In [None]:
# What variants of BLEU are used? (rows per 'Display Name', most frequent first)
bleu_rows = auto_df[auto_df[FAMILY] == "BLEU"]
bleu_types = (
    bleu_rows.groupby(['Display Name'])['Display Name']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# What variants of ROUGE are used? (rows per 'Display Name', most frequent first)
rouge_rows = auto_df[auto_df[FAMILY] == "ROUGE"]
rouge_types = (
    rouge_rows.groupby(['Display Name'])['Display Name']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# BLEU variants (left) and ROUGE variants (right) on a shared count axis
fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True)
fig.set_figwidth(10)
bleu_types.plot(kind="bar", xlabel="", ylabel="Count", color="#779af5", ax=axes[0])
rouge_types.plot(kind="bar", xlabel="", color="#c7522a", ax=axes[1])
# tight_layout only accounts for what is already drawn, so it has to come after the
# plotting calls; otherwise the rotated tick labels are not taken into account.
fig.tight_layout()

In [None]:
import json

# IMPL value -> IDs of the papers that have at least one automatic metric with that value
impl_ids = {
    impl: list(auto_df[auto_df[IMPL] == impl][ID].unique())
    for impl in auto_df[IMPL].unique()
}
with open('impl_ids.json', 'w') as f:
  json.dump(impl_ids, f)

#### Rationales

In [None]:
# Free-text rationale notes, one per automatic-metric row.
# 'Notes: Rational' is the literal column name (presumably the sheet's own spelling -
# check the sheet before "correcting" it here).
rationales = auto_df['Notes: Rational']

In [None]:
import re

# Raw strings: '\.' and '\d' are invalid escape sequences in normal string literals
# and trigger a warning on recent Python versions.
_NO_RATIONALE_RE = re.compile(r"(none given\.?|not given)")
_FOLLOWING_RE = re.compile(r"(recent|previous|earlier|following|widely|staple|commonly|conventional |20\d\d)")


def _classify_rationale(text):
  """Map one free-text rationale note to 'None', 'Correlation', 'Following' or 'Quality'.

  The checks run in this order on the stripped, lower-cased text:
  empty / starts with "none given" or "not given" -> 'None';
  contains 'correlat' -> 'Correlation';
  mentions previous work (keywords or a 20xx year) -> 'Following';
  anything else -> 'Quality'.
  """
  normalized = text.strip().lower()
  # The emptiness test uses the stripped text so whitespace-only notes count as "no rationale"
  if normalized == "" or _NO_RATIONALE_RE.match(normalized):
    return 'None'
  if 'correlat' in normalized:
    return 'Correlation'
  if _FOLLOWING_RE.search(normalized):
    return 'Following'
  return 'Quality'


# Category per metric row, plus a binary "any rationale given" flag
vals = [_classify_rationale(r) for r in rationales]
v_bin = [0 if v == 'None' else 1 for v in vals]

# Original notes that fall into the catch-all 'Quality' category (useful for manual inspection)
other = [r for r, v in zip(rationales, vals) if v == 'Quality']
empty = vals.count('None')
correlate = vals.count('Correlation')
follower_count = vals.count('Following')

print(f'{follower_count} ({follower_count/len(rationales)*100:.1f} %) metrics were used because the authors followed previous work.')
print(f'{correlate} ({correlate/len(rationales)*100:.1f} %) metrics were used because they correlate with human judgment.')
print(f'{empty} ({empty/len(rationales)*100:.1f} %) metrics had no rationale for being used.')
print(f'{len(other)} ({len(other)/len(rationales)*100:.1f} %) metrics provided a rationale (other than following previous work or previously shown correlation with human judgment).')

In [None]:
# Total number of automatic-metric rows (the denominator of the percentages above)
len(rationales)

In [None]:
# Attach the classification to auto_df: 'RATB' = 1 if any rationale was given, else 0;
# 'RAT' = rationale category. Both lists are in the row order of `rationales`.
auto_df['RATB'] = v_bin
auto_df['RAT'] = vals

In [None]:
# Per paper: number of automatic metrics, list of rationale categories,
# and a flag telling whether at least one metric came with a rationale
grouped_m = auto_df.groupby(by=ID)[METRIC_NAME].agg(lambda x: len(list(x))).to_frame()
grouped_r = auto_df.groupby(by=ID)['RAT'].agg(lambda x: list(x)).to_frame()
grouped_rb = auto_df.groupby(by=ID)['RATB'].agg(lambda x: 1 if sum(x) >=1 else 0).to_frame()

grouped = grouped_m.join(grouped_r)
# Metric count joined with the binary flag (this used to join grouped_r again by mistake)
groupedrb = grouped_m.join(grouped_rb)

In [None]:


# For every paper: the fraction of its metrics that falls into each rationale category
all_rat_values = ['None', 'Correlation', 'Following', 'Quality']
for rat in all_rat_values:
    grouped[rat] = grouped['RAT'].apply(lambda x: x.count(rat) / len(x))

# Sum the fractions over all papers that use the same number of metrics.
# The count column of `grouped` was created under the name METRIC_NAME, so the constant
# is used here instead of a hard-coded copy of its value.
df_agg = grouped.groupby(METRIC_NAME)[all_rat_values].sum().reset_index()

# Long format and back to wide: rows = number of metrics per paper, columns = rationale
df_melted = df_agg.melt(id_vars=[METRIC_NAME], value_vars=all_rat_values,
                        var_name='Rationale', value_name='Count')

heatmap_data = df_melted.pivot(index=METRIC_NAME, columns="Rationale", values="Count")

# Plot the heatmap
plt.figure(figsize=(7, 7))
palette = ["#fbf2c4","#b8cdab","#74a892","#008585", "#4c9eb3"]
cmap = mcolors.LinearSegmentedColormap.from_list("n", palette)
ax = sns.heatmap(heatmap_data, annot=True, cmap=cmap)
ax.set(xlabel="What rationale was given for using a metric?", ylabel="How many metrics were used within one paper?")
#plt.title('Heatmap of Metric Counts vs Rationales given')
plt.show()

#### Trainability

In [None]:
# Distribution of the TRAIN column per venue, counted per metric row
# (the per-paper de-duplication is deliberately commented out)
inlg = auto_df[auto_df[CONF] == "INLG 2023"] #.drop_duplicates(subset=[ID, FAMILY])
inlg_train_counts = inlg[TRAIN].value_counts()


acl = auto_df[auto_df[CONF] == "ACL 2023"] #.drop_duplicates(subset=[ID, FAMILY])
acl_train_counts = acl[TRAIN].value_counts()



# Rows = venues, columns = TRAIN values; each row is normalized to sum to 100 %
comparison = pd.DataFrame({'INLG 2023': inlg_train_counts, 'ACL 2023': acl_train_counts}).fillna(0).T
comparison_normalized = comparison.div(comparison.sum(axis=1), axis=0) * 100
ax = comparison_normalized.plot(kind='bar', stacked=True, ylim=(0,100))

# Not passed to the legend below, which therefore shows the raw TRAIN values
labels = []

plt.ylabel('Percentage')


#plt.title('Relative Makeup of Correlation with Human Evaluation', loc='center')
plt.legend(title='Is the metric trainable?', bbox_to_anchor=(0.5, -0.1), loc='upper center', ncols=2)
plt.xticks(rotation='horizontal')
plt.tight_layout()

# Annotate percentages on the bars (segments of 5 % or less stay unlabelled)
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    if height > 5:
      ax.text(x + width / 2, y + height / 2, f'{height:.1f}%', ha='center', va='center')

plt.show()

#### Source- vs. reference-based metrics

In [None]:
# The SRC and REF flags are concatenated as strings (e.g. "TRUE" + "FALSE") and
# translated into a readable category; any other combination becomes NaN.
replacements = {
    "FALSETRUE": "Reference-based metric",
    "TRUEFALSE": "Source-based metric",
    "TRUETRUE": "Metric uses both source and reference",
    "FALSEFALSE": "Metric uses output only"
}
src_ref_key = auto_df[SRC] + auto_df[REF]
auto_df[SRC_REF] = src_ref_key.map(replacements)
auto_df[SRC_REF].unique()

In [None]:
# Distribution of the SRC_REF category per venue, counted per metric row
# (the per-paper de-duplication is deliberately commented out)
inlg = auto_df[auto_df[CONF] == "INLG 2023"] #.drop_duplicates(subset=[ID, FAMILY])
inlg_srcref_counts = inlg[SRC_REF].value_counts()


acl = auto_df[auto_df[CONF] == "ACL 2023"] #.drop_duplicates(subset=[ID, FAMILY])
acl_srcref_counts = acl[SRC_REF].value_counts()

palette = sns.color_palette(["#c7522a","#e5c185","#fbf2c4","#b8cdab","#74a892","#008585","#4c9eb3","#779af5","#a59cff","#dbcdf0"], n_colors=4)

# Rows = venues, columns = SRC_REF categories; each row is normalized to sum to 100 %
comparison = pd.DataFrame({'INLG 2023': inlg_srcref_counts, 'ACL 2023': acl_srcref_counts}).fillna(0).T
comparison_normalized = comparison.div(comparison.sum(axis=1), axis=0) * 100
ax = comparison_normalized.plot(kind='bar', stacked=True, color=palette, ylim=(0,100))

# Not passed to the legend below, which therefore shows the SRC_REF category names
labels = []

plt.ylabel('Percentage')


#plt.title('Relative Makeup of Correlation with Human Evaluation', loc='center')
plt.legend(title='Type of Metric Used', bbox_to_anchor=(0.5, -0.08), loc='upper center', ncols=2)
plt.xticks(rotation='horizontal')
plt.tight_layout()

# Annotate percentages on the bars (segments of 5 % or less stay unlabelled)
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    if height > 5:
      ax.text(x + width / 2, y + height / 2, f'{height:.1f}%', ha='center', va='center')


plt.show()

#### Metric split by tasks

In [None]:
#Figure 5 aka \label{fig:metric_task_usage}
MF2 = "MF2"
MIN_PAPERS_PER_TASK = 6    # tasks with fewer papers are left out of the figure
TOP_FAMILIES_PER_TASK = 4  # families shown per task; the rest is merged into 'Other'

# One row per (metric row, task), without the two placeholder task labels
auto_tasks = auto_df.copy(deep=True)
auto_tasks[TASK] = auto_tasks[TASK].apply(lambda x: list(x))
auto_tasks = auto_tasks.explode(TASK)
auto_tasks = auto_tasks[~auto_tasks[TASK].isin(['multiple (list all)', 'other (please specify)'])]

# Shortened / line-broken task names for the y axis
task_plot_map = {
    'question answering': 'question\nanswering',
    'machine translation': 'machine\ntranslation',
    'question generation': 'question\ngeneration',
    'paraphrasing / lossless simplification': 'paraphrasing',
    'feature-controlled generation': 'feat-controlled\ngeneration',
    'end-to-end text generation': 'end-to-end\ngeneration',
    'dialogue turn generation': 'dialogue turn\ngeneration',
    'summarisation (text-to-text)': 'summarisation',
    'data-to-text generation': 'data-to-text\ngeneration',
    'story generation': 'story\ngeneration'
}
plotting_data = dict()
# Number of papers per task
task_counts = auto_tasks.drop_duplicates(subset=[ID, TASK])[TASK].value_counts()


for t in reversed(task_counts.index):
  if task_counts[t] < MIN_PAPERS_PER_TASK:
    continue
  largest = auto_tasks[auto_tasks[TASK] == t][MF2].value_counts().head(TOP_FAMILIES_PER_TASK).index.tolist()
  # The column is addressed through the MF2 constant rather than a keyword that
  # merely happens to be spelled the same
  large = auto_tasks.assign(**{MF2: np.where(auto_tasks[MF2].isin(largest), auto_tasks[MF2], 'Other')})
  # Fall back to the raw task name so a frequent task that is missing from
  # task_plot_map does not abort the figure with a KeyError
  plotting_data[task_plot_map.get(t, t)] = large[large[TASK] == t][MF2].value_counts()

# Rows = tasks, columns = metric families; each row is normalized to sum to 100 %
comparison = pd.DataFrame(plotting_data).fillna(0).T
comparison_normalized = comparison.div(comparison.sum(axis=1), axis=0) * 100

# Sort the values within each task and move 'Other' to the last position
def sort_and_move_other(df):
    """Return a frame with the same index whose rows were sorted descending, 'Other' last.

    The rows are re-assembled into one DataFrame afterwards, so all rows share a single
    column order in the result.
    """
    sorted_rows = []
    for idx, row in df.iterrows():
        sorted_row = row.sort_values(ascending=False)
        if 'Other' in sorted_row.index:
            other_value = sorted_row.pop('Other')
            sorted_row['Other'] = other_value
        sorted_rows.append(sorted_row)
    return pd.DataFrame(sorted_rows, index=df.index)

comparison_normalized = sort_and_move_other(comparison_normalized)

# Prepare the data for plotting: the task names move from the index into a 'Task' column
comparison_normalized = comparison_normalized.reset_index().rename(columns={'index': 'Task'})

# Set up the color palette (one colour per metric-family column)
palette = sns.color_palette(["#c7522a","#e5c185","#fbf2c4","#b8cdab","#74a892","#008585","#4c9eb3","#779af5","#a59cff","#dbcdf0"], n_colors=comparison_normalized.shape[1] - 1)

# Plotting
fig, ax = plt.subplots(figsize=(9, 7))

# Function to plot sorted stacked bars
def plot_sorted_bars(ax, df, palette):
    """Draw one horizontal stacked bar per row of ``df`` (column 'Task' names the bar).

    Every other column becomes one segment; 'Other' is always drawn in light gray.
    """
    bottoms = np.zeros(len(df))
    for i, column in enumerate(df.columns[1:]):
        color = palette[i] if column != 'Other' else 'lightgray'
        ax.barh(
            df['Task'], df[column],
            left=bottoms, label=column, color=color
        )
        bottoms += df[column]

plot_sorted_bars(ax, comparison_normalized, palette)

# Add annotations to the bars: segments above 5.5 % get the normal font size,
# segments between 2 % and 5.5 % a smaller one, anything below stays unlabelled
for i, row in comparison_normalized.iterrows():
    cumulative_percentage = 0
    for column, value in row.items():
        if column == 'Task':
            continue
        if value > 2:
            ax.text(
                cumulative_percentage + value / 2,
                i,
                f'{value:.1f}%',
                ha='center', va='center',
                fontsize=10 if value > 5.5 else 8, color='black'
            )
        cumulative_percentage += value

ax.set_xlabel('Percentage')
ax.set_xlim([0, 100])
ax.legend(title='Metric', bbox_to_anchor=(0.43, -0.12), loc='upper center', borderaxespad=0., ncols=5)
plt.tight_layout()
plt.show()

In [None]:
# The new version with absolute values and minimal vertical space to be display over the page width
# Figure 5 aka \label{fig:metric_task_usage}
from matplotlib.ticker import FuncFormatter

# Absolute counts, transposed: rows = metric families, columns = tasks
comparison_counts = comparison.T

# Function to plot sorted stacked bars with absolute counts and percentages
# (replaces the three-argument plot_sorted_bars defined in the previous cell)
def plot_sorted_bars(ax, df, counts_df, palette):
    """Draw horizontal stacked bars from ``df`` and annotate each segment.

    ``df`` holds percentages with the bar name in column 'Task'; ``counts_df`` holds the
    matching absolute counts, indexed by family (rows) and task (columns). Segments above
    2 % are labelled with percentage and count, using a smaller font up to 5.5 %.
    """
    left_edges = np.zeros(len(df))
    for position, family in enumerate(df.columns[1:]):
        segment_color = 'lightgray' if family == 'Other' else palette[position]
        ax.barh(df['Task'], df[family], left=left_edges, label=family, color=segment_color)
        left_edges += df[family]

    # Add annotations to the bars
    for bar_index, bar in df.iterrows():
        running_total = 0
        for family, share in bar.items():
            if family == 'Task':
                continue
            absolute = counts_df.loc[family, bar['Task']]
            if share > 2:
                ax.text(
                    running_total + share / 2,
                    bar_index,
                    f'{share:.1f}%\n{int(absolute)}',
                    ha='center', va='center',
                    fontsize=10 if share > 5.5 else 8, color='black'
                )
            running_total += share

# Plotting
fig, ax = plt.subplots(figsize=(12, 5))
plot_sorted_bars(ax, comparison_normalized, comparison_counts, palette)


# Formatter function to add percent signs
def percent_formatter(x, pos):
    return '%d%%' % int(x)

ax.set_xlim([0, 100])
ax.xaxis.set_major_formatter(FuncFormatter(percent_formatter))  # Apply the percent formatter


def modify_labels(labels):  # Modify labels to include newline character
    return ['\n'.join(label.split(' ')) for label in labels]

handles, labels = ax.get_legend_handles_labels()
labels = modify_labels(labels)
ax.legend(handles, labels, title='Metric', loc='center right', bbox_to_anchor=(0.8, 0.3, 0.4, 0.4), )
plt.tight_layout()
plt.show()

#### Heatmaps

In [None]:
# To analyze the number of metrics used per paper
acl = auto_df.loc[auto_df[CONF] == "ACL 2023"]
inlg = auto_df.loc[auto_df[CONF] == "INLG 2023"]

# Paper ID -> number of automatic-metric rows
ag = acl.groupby(by=ID)[METRIC_NAME].apply(len)
ig = inlg.groupby(by=ID)[METRIC_NAME].apply(len)

# INLG: number of metrics -> number of papers, ordered by number of metrics
ig_vals = ig.value_counts().sort_index()

In [None]:
# ACL: number of metrics -> number of papers, ordered by number of metrics
ag_vals = ag.value_counts().sort_index()

In [None]:
# Set of metric families used by each paper
grouped = auto_df.groupby(by=ID)[FAMILY].agg((lambda x: set(x)))

df_agg = grouped.reset_index()

# Transform sets to binary matrix: one row per paper, one 0/1 column per family
all_items = sorted(set().union(*df_agg[FAMILY]))
binary_matrix = df_agg[FAMILY].apply(lambda x: [1 if item in x else 0 for item in all_items])
binary_df = pd.DataFrame(binary_matrix.tolist(), columns=all_items)

# Calculate co-occurrence matrix: entry (i, j) is the number of papers using both
# family i and family j; the diagonal is the number of papers using that family
co_occurrence_matrix = np.dot(binary_df.T, binary_df)
# Plot the heatmap
plt.figure(figsize=(10, 8))
palette = ["#fbf2c4","#b8cdab","#74a892","#008585", "#4c9eb3"]
cmap = mcolors.LinearSegmentedColormap.from_list("n", palette)
sns.heatmap(co_occurrence_matrix, annot=True, cmap=cmap, xticklabels=all_items, yticklabels=all_items)
plt.title('Co-occurrence Heatmap')
plt.show()

In [None]:
# Same co-occurrence analysis as for the metric families, but on the MF2 column
MF2 = "MF2"
# Set of MF2 values used by each paper
grouped = auto_df.groupby(by=ID)[MF2].agg((lambda x: set(x)))

df_agg = grouped.reset_index()

# Transform sets to binary matrix: one row per paper, one 0/1 column per MF2 value
all_items = sorted(set().union(*df_agg[MF2]))
binary_matrix = df_agg[MF2].apply(lambda x: [1 if item in x else 0 for item in all_items])
binary_df = pd.DataFrame(binary_matrix.tolist(), columns=all_items)

# Calculate co-occurrence matrix: entry (i, j) is the number of papers using both
# values; the diagonal is the number of papers using that value
co_occurrence_matrix = np.dot(binary_df.T, binary_df)
# Plot the heatmap
plt.figure(figsize=(10, 8))
palette = ["#fbf2c4","#b8cdab","#74a892","#008585", "#4c9eb3"]
cmap = mcolors.LinearSegmentedColormap.from_list("n", palette)
sns.heatmap(co_occurrence_matrix, annot=True, cmap=cmap, xticklabels=all_items, yticklabels=all_items)
plt.show()

In [None]:
def plot_heatmap(df, col1, col2, yname=None, xname=None, title=None, labels=None):
    """
    Plots a heatmap showing the relationship between two string columns.

    The cells are the counts of ``pd.crosstab(df[col1], df[col2])``: the values of
    ``col1`` form the rows (y axis), the values of ``col2`` the columns (x axis).

    Parameters:
    df (pd.DataFrame): The input dataframe.
    col1 (str): The name of the first string column (rows / y axis).
    col2 (str): The name of the second string column (columns / x axis).
    yname (str, optional): Name of the row variable. Accepted for backward compatibility;
        the y-axis label itself is left blank and the tick labels describe the rows.
    xname (str, optional): x-axis label. Defaults to ``col2``, the column that is
        actually shown on the x axis.
    title (str, optional): Figure title. Nothing is drawn when omitted.
    labels (list of str, optional): Replacement y tick labels, applied by position,
        so they must follow the row order of the crosstab.
    """
    if xname is None:
        xname = col2

    # Create a contingency table (cross-tabulation)
    contingency_table = pd.crosstab(df[col1], df[col2])

    # Plot the heatmap
    plt.figure(figsize=(8, 6))
    palette = ["#fbf2c4","#b8cdab","#74a892","#008585", "#4c9eb3"]
    cmap = mcolors.LinearSegmentedColormap.from_list("n", palette)
    ax = sns.heatmap(contingency_table, annot=True, cmap=cmap, fmt='d')
    ax.set(xlabel=xname, ylabel=None)
    if labels is not None:
        ax.set_yticklabels(labels)
    if title:
        ax.set_title(title)
    plt.show()

In [None]:
# Rows = CORRELATED values, columns = rationale categories ('RAT').
# NOTE(review): the y tick labels are applied by position. This list puts
# 'No Human Evaluation' last, whereas the legend labels of the correlation bar chart
# earlier in this section list it second - confirm which order matches the data.
labels=['No Correlation with\nHuman Evaluation', 'Qualitative Correlation\nwith Human Evaluation', 'Quantitative Correlation\nwith Human Evaluation', 'No Human Evaluation']
plot_heatmap(auto_df, CORRELATED, 'RAT', 'Correlation with Human Evaluation', 'Rationale', labels=labels)