Import the necessary libraries and authenticate the user to access the Google Sheet:

In [None]:
# Authenticate the current user; `google.colab` is only importable inside a
# Colab runtime, so this cell is not expected to run elsewhere.
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
# Pick up the credentials made available by the authentication step above.
# The second element of the returned pair is not needed here.
creds, _ = default()


# nltk supplies the agreement coefficient (AnnotationTask) and the set distances.
# NOTE(review): `edit_distance` is not referenced in the cells shown here.
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics import edit_distance, jaccard_distance, masi_distance

# gspread client used below to open the annotation spreadsheet.
gc = gspread.authorize(creds)

Define the column-name constants and the helper functions:

In [None]:
# List the column names as constants to prevent typos.
# Each value is used as a DataFrame column label, so it has to match the
# spreadsheet's header row character for character.
ID = "ACL Paper ID"
METRIC_NAME = "Metric name"
NEWLY = "Newly introduced?"
APPENDIX = "Appendix"
TASK = "Updated Task"
TASK_OLD = "Task"
LINK_TO_METRIC = "Link to the Metric Paper"
PAPER_LINK = "Link to the Paper"
# NOTE(review): "Corrleated" looks like a typo; presumably it mirrors the
# header spelling in the sheet -- verify against the sheet before changing it.
CORRELATED = "Corrleated w/ Human Evaluation?"
ANNOTATOR = "Annotator"
METRIC_IMPL = "Metric Implementations"

# Groupings of the columns by answer type.
# NOTE(review): neither list is referenced in the cells shown here.
CLOSED_CLASS_COLUMNS = [NEWLY, APPENDIX, CORRELATED]
OPEN_CLASS_COLUMNS = [METRIC_NAME, TASK, LINK_TO_METRIC, PAPER_LINK]
# METRIC_NAME is deliberately absent: metric agreement is computed on its own.
# For the columns below, the metric name is appended to the paper ID so that
# every row gets a unique key that lines up across annotators.
COLUMNS_TO_EVALUATE = [NEWLY, APPENDIX, TASK, METRIC_IMPL, PAPER_LINK, CORRELATED]

In [None]:
import pandas as pd
import re
# The following snippet checks for the number of annotations per one paper per annotator and reports discrepancies
def print_metric_counts(df):
    """Report papers for which annotators listed a different number of metrics.

    Rows are counted per (paper ID, annotator) pair. A paper counts as a
    disagreement when its annotators do not all have the same row count; one
    line naming each annotator's count is printed for it. A summary line with
    the share of agreeing papers is printed at the end.

    Note that a paper annotated by a single annotator has only one count and
    is therefore tallied as an agreement.

    Args:
        df: Annotation table with at least the ``ID`` and ``ANNOTATOR`` columns.
    """
    counts = df.groupby([ID, ANNOTATOR]).size().reset_index(name='count')
    agreed = disagreed = 0
    for paper_id, group in counts.groupby(ID):
        if group['count'].nunique() > 1:
            disagreed += 1
            annotations_info = '; '.join(
                f"{annotator} reported {count} metrics"
                for annotator, count in zip(group[ANNOTATOR], group['count'])
            )
            print(f"For paper id {paper_id}, {annotations_info}")
        else:
            agreed += 1

    total = agreed + disagreed
    if total == 0:
        # An empty table has no papers to compare; the percentage below would
        # otherwise divide by zero.
        print('No annotated papers found, so there is nothing to compare.')
        return
    print(f'With {agreed} agreements and {disagreed} disagreements, annotators agreed in {100 * agreed / total}% cases.')

# Normalize the metric string
def normalize_metric(metric):
    """Return the canonical spelling of a metric name.

    Runs of spaces, ``-``, ``+`` and ``@`` are removed (parentheses are kept
    deliberately) and the result is lowercased. If that string is a key of
    ``metric_mapping`` the mapped name is returned, otherwise the string itself.
    """
    collapsed = re.sub('[- +@]+', '', metric).lower()
    return metric_mapping.get(collapsed, collapsed)

# Normalize URLs
def normalize_urls(url):
    """Return *url* without a single trailing ``/`` or ``.pdf``.

    Only one suffix is removed, so ``".../paper.pdf/"`` becomes
    ``".../paper.pdf"``. Strings with neither suffix are returned unchanged.
    """
    # Raw string: in a plain literal "\." is an invalid escape sequence.
    return re.sub(r'(/|\.pdf)$', '', url)

def normalize_task(taskstring):
    """Turn a free-text task cell into a frozenset of canonical task names.

    The cell is split on ``:``, ``;`` and ``,``. Each piece is stripped and
    lowercased, empty pieces are dropped, and pieces that are keys of
    ``task_mapping`` are replaced by their mapped name.
    """
    pieces = (raw.strip().lower() for raw in re.split('[:;,]', taskstring))
    return frozenset(task_mapping.get(name, name) for name in pieces if name != "")

"""
  The functions below were taken from the 20 years repo - iaa utilities and modified
"""
def extract_iaa_df_by_column_name(annotation_df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Return the ANNOTATOR, ID and `column_name` columns, ready for agreement scoring.

    For the TASK column the three columns are returned as they are. For any
    other column, all values sharing an (ANNOTATOR, ID) pair are collapsed
    into one frozenset, giving one row per pair.
    """
    selected = annotation_df[[ANNOTATOR, ID, column_name]]
    if column_name != TASK:
        per_pair = selected.groupby([ANNOTATOR, ID])[column_name]
        return per_pair.apply(frozenset).reset_index()
    return selected


def extract_records_for_nltk(iaa_df: pd.DataFrame):
    """Convert a three-column frame into the list of 3-tuples that `nltk` loads.

    `to_records()` puts the index first in every record; it is dropped here
    because `nltk` does not need it.
    """
    triples = []
    for record in iaa_df.to_records():
        _index, first, second, third = record
        triples.append((first, second, third))
    return triples


def pretty_print_iaa_by_column(iaa_by_column_dict, values=("alpha_jaccard", "alpha_masi")):
    """Print one tab-separated line per column with the requested scores.

    The header line lists the names in `values`. Every following line shows a
    column name and its scores formatted to two decimals.
    """
    header = '  '.join(values)
    print(f"column\t{header}")
    for column, scores in iaa_by_column_dict.items():
        formatted = [format(scores[value], ".2f") for value in values]
        values_string = '    '.join(formatted)
        print(f"{column}\t{values_string}")

def run_closed_class_jaccard_and_masi(df: pd.DataFrame, columns):
    """Compute the alpha agreement coefficient per column under two set distances.

    Returns a dict keyed by column name. Each value is a dict holding the
    per-column frame under "df" and the `AnnotationTask.alpha()` results under
    "alpha_jaccard" (Jaccard distance) and "alpha_masi" (MASI distance).
    """
    iaa_by_column = {}
    for column in columns:
        iaa_by_column[column] = {"df": extract_iaa_df_by_column_name(df, column)}

    distances = (("alpha_jaccard", jaccard_distance), ("alpha_masi", masi_distance))
    for entry in iaa_by_column.values():
        for score_name, distance in distances:
            # A fresh task per distance; the records are rebuilt for each one.
            task = AnnotationTask(distance=distance)
            task.load_array(extract_records_for_nltk(entry['df']))
            entry[score_name] = task.alpha()
    return iaa_by_column

def print_absolute_agreement(dataframe: pd.DataFrame, iaa_by_column_dict=None, columns=(METRIC_NAME,)):
    """Print a pairwise annotator agreement matrix for each column.

    Agreement between two annotators is the mean Jaccard similarity
    (``1 - jaccard_distance``) of their values, paired by position. The
    pairing is only meaningful when both annotators cover the same papers in
    the same order. The last figure on each row is the mean agreement with
    the other annotators.

    Args:
        dataframe: Annotation table; its ``ANNOTATOR`` column defines the
            annotators and their order in the matrix.
        iaa_by_column_dict: Optional result of
            ``run_closed_class_jaccard_and_masi``. When ``None`` it is computed
            from ``dataframe`` and ``columns`` and its summary is printed first.
        columns: Column names to report on.
    """
    if iaa_by_column_dict is None:
        iaa_by_column_dict = run_closed_class_jaccard_and_masi(dataframe, columns)
        pretty_print_iaa_by_column(iaa_by_column_dict)
        print()

    annotator_list = dataframe[ANNOTATOR].unique()
    for column in columns:
        df = iaa_by_column_dict[column]['df']
        print(f"Interannotator agreement for {column}")
        print(" \t" + "\t".join([str(annotator) for annotator in annotator_list]))

        # Filter once per annotator instead of once per annotator pair.
        values_by_annotator = [
            list(df[df[ANNOTATOR] == annotator][column]) for annotator in annotator_list
        ]
        for row, a1 in enumerate(annotator_list):
            a1_vals = values_by_annotator[row]
            print(f"{a1}", end="\t")
            pairwise_agreements = []
            for a2_vals in values_by_annotator:
                agreement_sum = sum(
                    1 - jaccard_distance(a1_val, a2_val)
                    for a1_val, a2_val in zip(a1_vals, a2_vals)
                )
                compared = min(len(a1_vals), len(a2_vals))
                # No values to compare means the agreement is undefined, not an error.
                pairwise_agreements.append(agreement_sum / compared if compared else float("nan"))
                print(f"{pairwise_agreements[-1]:.2f}", end="\t")
            # Average over the other annotators only, leaving out the diagonal entry.
            others = pairwise_agreements[:row] + pairwise_agreements[row + 1:]
            mean_with_others = sum(others) / len(others) if others else float("nan")
            print(f"\t{mean_with_others:.2f}")
        print()
        print()

def print_absolute_agreement_by_id(dataframe: pd.DataFrame, iaa_by_column_dict=None, columns=(METRIC_NAME,)):
    """Print a pairwise annotator agreement matrix per column, matching rows by ID.

    For each pair of annotators, values are compared only for IDs both of them
    have; IDs missing from the second annotator are skipped. The Jaccard
    similarities (``1 - jaccard_distance``) are summed and divided by the
    smaller of the two annotators' ID counts. The last figure on each row is
    the mean agreement with the other annotators.

    Args:
        dataframe: Annotation table; its ``ANNOTATOR`` column defines the
            annotators and their order in the matrix.
        iaa_by_column_dict: Optional result of
            ``run_closed_class_jaccard_and_masi``. When ``None`` it is computed
            from ``dataframe`` and ``columns`` and its summary is printed first.
        columns: Column names to report on.
    """
    if iaa_by_column_dict is None:
        iaa_by_column_dict = run_closed_class_jaccard_and_masi(dataframe, columns)
        pretty_print_iaa_by_column(iaa_by_column_dict)
        print()

    annotator_list = dataframe[ANNOTATOR].unique()
    for column in columns:
        df = iaa_by_column_dict[column]['df']
        print(f"Interannotator agreement for {column}")
        print(" \t" + "\t".join([str(annotator) for annotator in annotator_list]))

        # Build each annotator's {ID: value} lookup once, not once per pair.
        values_by_annotator = [
            df[df[ANNOTATOR] == annotator].set_index(ID)[column].to_dict()
            for annotator in annotator_list
        ]
        for row, a1 in enumerate(annotator_list):
            a1_vals = values_by_annotator[row]
            print(f"{a1}", end="\t")
            pairwise_agreements = []
            for a2_vals in values_by_annotator:
                agreement_sum = sum(
                    1 - jaccard_distance(a1_val, a2_vals[key])
                    for key, a1_val in a1_vals.items()
                    if key in a2_vals
                )
                compared = min(len(a1_vals), len(a2_vals))
                # No IDs on one side means the agreement is undefined, not an error.
                pairwise_agreements.append(agreement_sum / compared if compared else float("nan"))
                print(f"{pairwise_agreements[-1]:.2f}", end="\t")
            # Average over the other annotators only, leaving out the diagonal entry.
            others = pairwise_agreements[:row] + pairwise_agreements[row + 1:]
            mean_with_others = sum(others) / len(others) if others else float("nan")
            print(f"\t{mean_with_others:.2f}")
        print()
        print()

Open the worksheet, load it into a DataFrame, and normalize the values:

In [None]:
# Open the 'IAA' tab of the annotation spreadsheet with the authorized client.
worksheet = gc.open_by_url('https://docs.google.com/spreadsheets/d/1NU6IlxhYg515RLjsVxNW5FS0ChWbIKi3yrrdfqEvsRM/').worksheet('IAA')

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()

# Convert to a DataFrame: the first row supplies the column names and the
# remaining rows are the data.
df = pd.DataFrame.from_records(rows[1:], columns=rows[0])



In [None]:
# Canonical names for metric spellings that differ between annotators.
# normalize_metric looks a name up here only after lowercasing it and removing
# spaces, '-', '+' and '@', so every key must already be in that form.
# NOTE(review): the first key appears to contain a non-ASCII minus sign, which
# that clean-up does not remove -- presumably copied from the sheet; verify.
metric_mapping = {
    'harmonicmean(hmean)between(1−pbleu)andbleu': 'harmonicmean(pbleubleu)',
    'harmonicmeanof1pbleuandbleu': 'harmonicmean(pbleubleu)',
    'hmeanbetween(1pbleu)andbleu':'harmonicmean(pbleubleu)',
    'harmonicmeanofbleu4andstyleaccuracy': 'harmonicmean(bleu4styleaccuracy)',
    'pairwisebleu': 'pbleu',
    'pbleu(selfbleu)': 'pbleu',
    'em': 'exactmatch',
    'exactmatch(em)': 'exactmatch',
    'inform(rate)': 'inform',
    'success(rate)': 'success',
    'combinescore(informandrate)': 'combinedscore(informandrate)',
    'bleu(4)': 'bleu4',
    'accuracy(?)': 'accuracy',
    'macroaveragedf1score(f1)': 'f1',
    'sensitivity': 'demetrbenchmarksensitivityscores',
    'bleurtbase': 'bleurt',
    'allmpnetbasev2': 'mpnetcosinesimilarity',
    'negmpnet': 'negmpnetcosinesimilarity',
    'distinct1': 'distinctunigrams',
    'distinct2': 'distinctbigrams',
    'distinct4': 'distinct4grams'
}

In [None]:
# Canonical names for task labels that differ between annotators.
# normalize_task looks up each stripped, lowercased piece of a task cell here,
# so keys must be lowercase. Misspelled keys such as "dialouge" match
# spellings to be normalized and must stay as they are.
# NOTE(review): the first key ends with a stray ')' -- presumably what is left
# of a parenthesised label after splitting; verify against the sheet.
task_mapping = {
    "natural language entailment)": "natural language inference",
    "data-text generation": "data-to-text generation",
    "data-to-text": "data-to-text generation",
    "dialogue generation": "dialogue turn generation",
    "dialogue response": "dialogue turn generation",
    "dialouge": "dialogue turn generation",
    "open-ended dialogue": "dialogue turn generation",
    "task-oriented dialouge": "dialogue turn generation",
    "paraphrase generation": "paraphrasing / lossless simplification",
    "text simplification": "compression / lossy simplification",
    "question-generation": "question generation",
    "quora question pairs": "question answering",
    "story-generation": "story generation",
    "summarisation": "summarisation (text-to-text)",
    "summarization": "summarisation (text-to-text)",
}

In [None]:
# Normalize the free-text columns in place so that equivalent answers from
# different annotators compare equal.
df[METRIC_NAME] = df[METRIC_NAME].apply(normalize_metric)
df[PAPER_LINK] = df[PAPER_LINK].apply(normalize_urls)
df[LINK_TO_METRIC] = df[LINK_TO_METRIC].apply(normalize_urls)
# After this line every TASK cell is a frozenset of task names, not a string.
df[TASK] = df[TASK].apply(normalize_task)

In [None]:
# Collect every distinct task name that occurs in any TASK cell, to spot
# spellings that still need an entry in the task mapping.
uni = {task for task_set in df[TASK].unique() for task in task_set}
uni

Calculate the inter-annotator agreement (IAA):

In [None]:
# Quick glance at the number of metrics found by each annotator
print_metric_counts(df)
print()

# Make sure everyone has the same ordering of papers:
# print_absolute_agreement pairs the annotators' values by position.
df = df.sort_values(by=ID)

# Print agreement on all metrics including human
print('IAA including human metrics:')
print_absolute_agreement(df)

# Watch out, maybe there is a paper with only human metrics
# NOTE(review): such a paper would drop out of df_automatic for that
# annotator, which presumably shifts the positional pairing -- verify.

# Now exclude human metrics: drop rows whose normalized metric name contains
# 'human' or 'n/a' (the pattern is a regular expression, '|' means "or").
df_automatic = df[df[METRIC_NAME].str.contains('human|n/a') == False]
print('IAA after excluding human metrics:')
print_absolute_agreement(df_automatic)

# Create a copy of the dataframe to calculate IAA for ID + metric pairs
# The copy keeps df_automatic's ID column untouched; the new key is
# "<paper id>-<metric name>".
df_by_metrics = df_automatic.copy(deep=True)
df_by_metrics[ID] = df_by_metrics[ID] + '-' + df_by_metrics[METRIC_NAME]


# For each paper+metric, compute the agreement for the remaining columns
# Passing None makes the function compute the per-column scores itself.
print_absolute_agreement_by_id(df_by_metrics, None, COLUMNS_TO_EVALUATE)

In [None]:
# Display the distinct normalized metric names left after the human-metric filter.
df_automatic[METRIC_NAME].unique()

In [None]:
# Display the filtered annotation table for manual inspection.
df_automatic