Import the necessary libraries and authenticate the user to access the Google Sheet:

In [13]:
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
creds, _ = default()



gc = gspread.authorize(creds)

Define functions and set columns as variable names:

In [14]:
# Sheet names to analyze
SHEETS = ["INLG 2023", "ACL 2023"]

# List the column names as constants to prevent typos
ID = "ACL Paper ID"
METRIC_NAME = "Metric name"
NEWLY = "Newly introduced?"
APPENDIX = "Appendix"
TASK = "Updated Task"
#TASK_OLD = "Task"
#INK_TO_METRIC = "Link to the Metric Paper"
#PAPER_LINK = "Link to the Paper"
CORRELATED = "Corrleated w/ Human Evaluation?"
ANNOTATOR = "Annotator"
METRIC_IMPL = "Metric Implementations"
RATIONALE = "Notes: Rational"
COMMENTS = "Comments"
CONF = "Conf"
SURVEY = "Survey"

#CLOSED_CLASS_COLUMNS = [NEWLY, APPENDIX, CORRELATED]
#OPEN_CLASS_COLUMNS = [METRIC_NAME, TASK, LINK_TO_METRIC, PAPER_LINK]
# Metric needs to be evaluated separately and then we need to merge it to the paper ID to have a unique key and match corresponding lines
COLUMNS_TO_EVALUATE = [NEWLY, APPENDIX, TASK, METRIC_IMPL, CORRELATED, RATIONALE, COMMENTS]

In [15]:
import pandas as pd
import re
# The following snippet checks for the number of annotations per one paper per annotator and reports discrepancies
def print_metric_counts(df):
  counts = df.groupby([ID, ANNOTATOR]).size().reset_index(name='count')
  agreed = disagreed = 0
  for i, group in counts.groupby(ID):
      unique_counts = group['count'].nunique()
      if unique_counts > 1:
          disagreed += 1
          annotations_info = []
          for annotator, count in zip(group[ANNOTATOR], group['count']):
              annotations_info.append(f"{annotator} reported {count} metrics")
          print(f"For paper id {i}, {'; '.join(annotations_info)}")
      else:
        agreed += 1
  print(f'With {agreed} agreements and {disagreed} disagreements, annotators agreed in {100 * agreed / float(agreed + disagreed)}% cases.')

# Normalize the metric string
def normalize_metric(metric):
  metric = re.sub('[- +@]+', '', metric) # Remove spaces and other special symbols that might occur - keeping parentheses deliberately
  metric = metric.lower() # Lowercase everything
  metric = re.sub('(#survey|\(corpus\))', '', metric)
  if metric in metric_mapping.keys():
    return metric_mapping[metric]
  return metric

# Some metrics were reported as several metrics in one line, split them to keep them consistent
def split_grouped_metrics(df):



# Normalize URLs
def normalize_urls(url):
  return re.sub('(/|\.pdf)$', '', url)

def normalize_task(taskstring):
  if taskstring is None:
    return frozenset()
  if isinstance(taskstring, frozenset):
    return taskstring
  subtasks = re.split('[:;,]', taskstring)
  updated = []
  for t in subtasks:
    task = t.strip().lower()
    if task != "":
      if task in task_mapping.keys():
        task = task_mapping[task]
      updated.append(task)

  return frozenset(updated)

Open the worksheet and make it into a DataFrame + Normalization

In [16]:
dfs = []
for sheet in SHEETS:
  worksheet = gc.open_by_url('https://docs.google.com/spreadsheets/d/1NU6IlxhYg515RLjsVxNW5FS0ChWbIKi3yrrdfqEvsRM/').worksheet(sheet)

  # get_all_values gives a list of rows.
  rows = worksheet.get_all_values()

  # Convert to a DataFrame and render.
  df = pd.DataFrame.from_records(rows[1:], columns=rows[0])
  # Sometimes there will be blank rows with "Updated Task"
  df = df[df[ID] != ""]
  df[CONF] = sheet
  df.reset_index(inplace=True, drop=True)
  dfs.append(df)
df = pd.concat(dfs, ignore_index=True)

In [26]:
names_to_split = {
    "accuracy/p/r/f1": ["accuracy", "precision", "recall", "f1"],
    "bleu{1,2}": ["bleu1", "bleu2"],
    "distinctngrams(dist{1,2,3})": ['distinctunigrams', 'distinctbigrams', 'distincttrigrams'],
    "dist{1,2,3}": ['distinctunigrams', 'distinctbigrams', 'distincttrigrams'],
    "repnmetricsforn=2,3,4": ['bigramrepetition', 'trigramrepetition', '4gramrepetition'],
    "rouge{1,2,l}": ["rouge1", "rouge2", "rougel"],
    "rouge{1,2}": ["rouge1", "rouge2"]
}

In [25]:
metric_mapping = {
    'harmonicmean(hmean)between(1−pbleu)andbleu': 'harmonicmean(pbleubleu)',
    'harmonicmeanof1pbleuandbleu': 'harmonicmean(pbleubleu)',
    'hmeanbetween(1pbleu)andbleu':'harmonicmean(pbleubleu)',
    'harmonicmeanofbleu4andstyleaccuracy': 'harmonicmean(bleu4styleaccuracy)',
    'pairwisebleu': 'pbleu',
    'pbleu(selfbleu)': 'pbleu',
    'em': 'exactmatch',
    'exactmatch(em)': 'exactmatch',
    'inform(rate)': 'inform',
    'success(rate)': 'success',
    'combinescore(informandrate)': 'combinedscore(informandrate)',
    'bleu(4)': 'bleu4',
    'accuracy(?)': 'accuracy',
    'macroaveragedf1score(f1)': 'f1',
    'sensitivity': 'demetrbenchmarksensitivityscores',
    'bleurtbase': 'bleurt',
    'allmpnetbasev2': 'mpnetcosinesimilarity',
    'negmpnet': 'negmpnetcosinesimilarity',
    'distinct1': 'distinctunigrams',
    'distinct2': 'distinctbigrams',
    'distinct4': 'distinct4grams',
    'dist1': 'distinctunigrams',
    'dist2': 'distinctbigrams',
    'dist3': 'distincttrigrams',
    'distinct3': 'distincttrigrams',
    'sacrebleu': 'bleu',
    'bleuscore': 'bleu',
    'rquge': 'rouge',
    'bertscorefscore': 'bertscoref1',
    'bertscorep': 'bertscoreprecision',
    'bertscorer': 'bertscorerecall',
    'beatf1': 'bertscoref1',
    'bertscorefmeasure': 'bertscoref1',
    'bleurtscore': 'bleurt'
}

In [18]:
task_mapping = {
    "natural language entailment)": "natural language inference",
    "data-text generation": "data-to-text generation",
    "data-to-text": "data-to-text generation",
    "dialogue generation": "dialogue turn generation",
    "dialogue response": "dialogue turn generation",
    "dialouge": "dialogue turn generation",
    "open-ended dialogue": "dialogue turn generation",
    "task-oriented dialouge": "dialogue turn generation",
    "paraphrase generation": "paraphrasing / lossless simplification",
    "paraphrasing/lossless simplification": "paraphrasing / lossless simplification",
    "text simplification": "compression / lossy simplification",
    "question-generation": "question generation",
    "quora question pairs": "question answering",
    "and question answering": "question answering",
    "simile generation": "simile generation (text-to-text)",
    "story-generation": "story generation",
    "text summarization": "summarisation (text-to-text)",
    "summarisation": "summarisation (text-to-text)",
    "summarization": "summarisation (text-to-text)",
    "summarization (text-to-text)": "summarisation (text-to-text)",
    "evaluate semantic diversity between two natural language \ngeneration": "evaluate semantic diversity between two natural language generation",
    "updated task": ""
}

In [19]:
# Normalize
df[SURVEY] = df[METRIC_NAME].str.contains("#survey")
df[METRIC_NAME] = df[METRIC_NAME].apply(normalize_metric)
df[TASK] = df[TASK].apply(normalize_task)

In [24]:
print(sorted(df[METRIC_NAME].unique()))

['', 'accuracy', 'accuracy(sentiment)', 'accuracy(tense)', 'accuracy(topic)', 'accuracy/p/r/f1', 'accuracyofcomparator', 'accuracyofkeywordinclusion', 'accuracyofkeywordinclusionatspecifiedposition', 'accuracyofpredictingpvalue', 'accuracyofvehicle', 'actclassificationaccuracy(aacc)roberta', 'actmultipleattributeevaluation(amae)', 'add', 'agreementthenumberofquestionsgeneratedbygpt2(#q)matchesthenumberofgpt3annotatedquestionsforagivenproblem', 'alignscore', 'anli', 'attrauto', 'auc', 'auroc', 'average', 'averageddistinctiveness', 'averagelength', 'averageofrouge1,rouge2,androugel', 'averagesentencelength', 'backwardbleu', 'bartscore', 'bartscorefaithfulness', 'bartscorefscore', 'bartscoreprecision', 'bartscorerecall', 'beatf1', 'bertscore', 'bertscoref1', 'bertscorefmeasure', 'bertscoreprecision', 'bertscorerecall', 'bias', 'bias(absolutevalueofrelevance50)', 'bigramttr', 'blanc', 'bleu', 'bleu1', 'bleu2', 'bleu3', 'bleu4', 'bleualc(areaunderlearningcurve)', 'bleun', 'bleurt', 'bleurts

In [21]:
uni = set ()
for fs in df[TASK].unique():
  uni.update(fs)
uni

{'',
 'abductive nlg',
 'abductive nli',
 'classification as generation',
 'commonsense reasoning',
 'compare probability distribution for one step text generation (surface realisation)',
 'compression / lossy simplification',
 'content selection/determination',
 'data-to-text generation',
 'dialogue turn generation',
 'end-to-end text generation',
 'evaluate semantic diversity between two natural language generation',
 'evaluation',
 'feature-controlled generation',
 'lexicalisation',
 'machine translation',
 'mixinstruct (instruction-based nlg)',
 'multi-user dialogues',
 'multilingual semantic textual similarity',
 'multimodal-to-text',
 'multiple (list all)',
 'next action generation',
 'next word prediction',
 'open-ended text generation (lm sampling)',
 'other (please specify)',
 'paraphrasing / lossless simplification',
 'prompt continuation',
 'question answering',
 'question classification',
 'question generation',
 'referring expression generation',
 'semantic textual similar

In [22]:
# First take a look at papers with no metrics at all
no_metrics = df[df[METRIC_NAME] == ""]
num_no_metrics = len(no_metrics[ID].unique())
df_all = df[df[METRIC_NAME] != ""]
num_with_all = len(df_all[ID].unique())
print(f"There are {num_no_metrics} papers with no metrics, {num_with_all} papers remain for analysis.")

# Now exclude papers with only human metrics, but also report how many papers uses human metrics
hum_df = df[df[METRIC_NAME].str.contains('human')]
papers_hum = len(hum_df[ID].unique())
num_h = len(hum_df)
print(f"{papers_hum} out of {len(df)} papers use human evaluation. In total, there were {num_h} human metrics used.")

There are 2 papers with no metrics, 102 papers remain for analysis.
57 out of 746 papers use human evaluation. In total, there were 131 human metrics used.
