In [None]:
# Reload imported modules automatically before each cell is executed, so that
# edits to the imported project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import itertools as it

import pandas as pd
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics.distance import jaccard_distance
from nltk.metrics.distance import masi_distance

import src.db.load
from src.coding.labels import LABELS
from src.coding.metrics import label_agreement
from src.coding.metrics import label_confusion
from src.db.connect import make_engine

In [None]:
# Notebook-wide setup: database engine and pandas display options.
engine = make_engine("DB")

# Show long text columns (up to 512 characters) and never truncate rows.
pd.options.display.max_colwidth = 512
pd.options.display.max_rows = None

In [None]:
# Label data as returned by the project loader.
df = src.db.load.label_data(engine)

# Long format: one row per single entry of "labels", without the entries whose
# name starts with "no_".
df_labels = df.explode("labels").loc[lambda frame: ~frame.labels.str.startswith("no_")]

In [None]:
class AnnoTask(AnnotationTask):
    """AnnotationTask whose loaded annotations can be replaced between runs."""

    def load_data(self, array):
        """Drop everything loaded so far, then load ``array``.

        Args:
            array: passed unchanged to ``load_array``; the cells of this
                notebook pass ``(username, sample_id, labels)`` triples.
        """
        # start from empty C / K / I sets and an empty record list so that
        # annotations from a previous call cannot leak into the next result
        self.C, self.K, self.I = set(), set(), set()
        self.data = []
        self.load_array(array)


# One task object is reused by all alpha computations below (each of them calls
# load_data first); label sets are compared with the Jaccard distance.
task = AnnoTask(distance=jaccard_distance)

# Overall Evaluation

## Krippendorff's alpha

### all

- all cases, including None
- all cases, including those only labeled by a single coder


In [None]:
# alpha over the unfiltered label data: one record per row of `df`
annotations = zip(df["username"], df["sample_id"], df["labels"])
task.load_data(annotations)
task.alpha()

0.3722236712284903

In [None]:
# Second variant: labels starting with "no_" are excluded (df_labels) and the
# remaining labels are combined into one frozenset per document and coder.
# Only the "labels" column is aggregated (as in the per-label cells of this
# notebook) instead of turning every column of the frame into a frozenset.
task_df = (
    df_labels.groupby(["sample_id", "username"])["labels"]
    .agg(frozenset)
    .reset_index()
)
task.load_data(zip(task_df.username, task_df.sample_id, task_df.labels))
task.alpha()

0.698720596365064

# Krippendorff's alpha by label


## at least 1 coder

- documents are included if at least 1 coder has set the specific label on the document (documents coded by only one coder are excluded)


In [None]:
labels = list(df_labels.labels.unique())

# Loop-invariant part, computed once instead of once per label:
# one frozenset of labels per (document, coder), restricted to documents that
# were coded by more than one coder.
coder_sets = (
    df_labels.groupby(["sample_id", "username"])["labels"]
    .agg(frozenset)
    .reset_index()
    .groupby("sample_id")
    .filter(lambda doc: doc.username.nunique() > 1)
)

rows = []
for lab in labels:
    # keep the documents on which at least one coder set `lab`
    sub = coder_sets.groupby("sample_id").filter(
        lambda doc: doc.labels.apply(lambda label_set: lab in label_set).any()
    )

    if sub.empty:
        # No data for this label: alpha is undefined. NaN is used because the
        # former sentinel -1 is itself a possible alpha value and would be
        # read as a real (very bad) agreement score.
        row = (lab, float("nan"), 0)
    else:
        task.load_data(zip(sub.username, sub.sample_id, sub.labels))
        row = (lab, round(task.alpha(), 3), sub.sample_id.nunique())

    rows.append(row)

In [None]:
# per-label alpha of the preceding cell, best value first
alpha = pd.DataFrame(rows, columns=["label", "alpha", "n_docs"])
alpha = alpha.sort_values(by="alpha", ascending=False)
alpha.style.background_gradient()

Unnamed: 0,label,alpha,n_docs
0,antielite,0.474,1433
1,pplcentr,0.452,695
3,left,0.394,504
5,pplmore,0.354,97
4,right,0.309,311
2,eliteless,0.145,55


## at least 2 coders

- documents are included if at least 2 coders have set the specific label on the document


In [None]:
labels = list(df_labels.labels.unique())

# Loop-invariant part, computed once instead of once per label:
# one frozenset of labels per (document, coder), restricted to documents that
# were coded by more than one coder.
coder_sets = (
    df_labels.groupby(["sample_id", "username"])["labels"]
    .agg(frozenset)
    .reset_index()
    .groupby("sample_id")
    .filter(lambda doc: doc.username.nunique() > 1)
)

rows = []
for lab in labels:
    # keep the documents on which more than one coder set `lab`
    sub = coder_sets.groupby("sample_id").filter(
        lambda doc: doc.labels.apply(lambda label_set: lab in label_set).sum() > 1
    )

    if sub.empty:
        # No data for this label: alpha is undefined. NaN is used because the
        # former sentinel -1 is itself a possible alpha value and would be
        # read as a real (very bad) agreement score.
        row = (lab, float("nan"), 0)
    else:
        task.load_data(zip(sub.username, sub.sample_id, sub.labels))
        row = (lab, round(task.alpha(), 3), sub.sample_id.nunique())

    rows.append(row)

In [None]:
# per-label alpha of the preceding cell, best value first
alpha_vals = pd.DataFrame(rows, columns=["label", "alpha", "n_docs"])
alpha_vals = alpha_vals.sort_values(by="alpha", ascending=False)
alpha_vals.style.background_gradient()

Unnamed: 0,label,alpha,n_docs
1,pplcentr,0.513,513
0,antielite,0.51,1314
3,left,0.478,313
4,right,0.32,198
5,pplmore,0.248,67
2,eliteless,0.125,12


# Custom Agreement Metric


In [None]:
labels = list(df_labels.labels.unique())

# Loop-invariant part, computed once instead of once per label:
# one frozenset of labels per (document, coder), restricted to documents that
# were coded by more than one coder.
coder_sets = (
    df_labels.groupby(["sample_id", "username"])["labels"]
    .agg(frozenset)
    .reset_index()
    .groupby("sample_id")
    .filter(lambda doc: doc.username.nunique() > 1)
)

rows = []
for lab in labels:
    # one label_agreement value per document, based on its coders' label sets
    agreements = coder_sets.groupby("sample_id").apply(
        lambda doc: label_agreement(doc.labels, lab)
    )

    mean_agreement = agreements.mean()
    # documents with a missing agreement value are not counted
    n_docs = len(agreements[agreements.notna()])

    rows.append((lab, mean_agreement, n_docs))

In [None]:
# per-label custom agreement, best value first
agreement = pd.DataFrame(rows, columns=["label", "agreement", "n_docs"])
agreement = agreement.sort_values(by="agreement", ascending=False)

# Join with the alpha table `alpha_vals`; the n_docs column of the agreement
# table is dropped, so n_docs in `metrics` is the one that belongs to alpha.
metrics = alpha_vals.merge(agreement.drop(columns="n_docs"), on="label")
metrics = metrics.sort_values(by="agreement", ascending=False)

metrics.style.background_gradient(axis=0)

Unnamed: 0,label,alpha,n_docs,agreement
1,antielite,0.51,1314,0.886078
0,pplcentr,0.513,513,0.66777
4,pplmore,0.248,67,0.652062
2,left,0.478,313,0.511475
3,right,0.32,198,0.499357
5,eliteless,0.125,12,0.137879


## Coherence of metrics

- alpha seems to be more dependent on number of docs


In [None]:
# linear (Pearson) correlation between alpha, n_docs and agreement
metrics.corr(method="pearson", numeric_only=True)

Unnamed: 0,alpha,n_docs,agreement
alpha,1.0,0.725956,0.754392
n_docs,0.725956,1.0,0.766712
agreement,0.754392,0.766712,1.0


In [None]:
# rank (Spearman) correlation between alpha, n_docs and agreement
metrics.corr(method="spearman", numeric_only=True)

Unnamed: 0,alpha,n_docs,agreement
alpha,1.0,0.942857,0.771429
n_docs,0.942857,1.0,0.828571
agreement,0.771429,0.828571,1.0


# Labels Pairwise

- General setting:
  - docs with fewer than 2 coders are ignored
  - docs must be coded with both labels of the pair at least once, otherwise they are ignored


In [None]:
# setup


def pivot(df, values, precision):
    """Turn a long table of label pairs into a styled label-by-label matrix.

    Args:
        df: DataFrame with the columns "label1", "label2" and ``values``
        values (Any): name of the column that fills the matrix cells
        precision (int): number of decimals shown for the cell values

    Returns:
        Styler of the matrix; rows and columns are sorted, cells without a
        value are set to 0 and one colour gradient spans the whole table.
    """
    matrix = df.pivot(index="label1", columns="label2", values=values).fillna(value=0)

    # sort columns first, then rows
    column_order = sorted(matrix.columns)
    matrix = matrix.reindex(column_order, axis=1)
    row_order = sorted(matrix.index)
    matrix = matrix.reindex(row_order, axis=0)

    # axis=None: the gradient is scaled over all cells, not per row or column
    styler = matrix.style.background_gradient(axis=None)
    return styler.format(precision=precision)

### Create Data


In [None]:
labels = list(df_labels.labels.unique())

# Loop-invariant part, computed once instead of once per label pair:
# one frozenset of labels per (document, coder), restricted to documents that
# were coded by more than one coder.
coder_sets = (
    df_labels.groupby(["sample_id", "username"])["labels"]
    .agg(frozenset)
    .reset_index()
    .groupby("sample_id")
    .filter(lambda doc: doc.username.nunique() > 1)
)

rows = []
for labs in it.combinations(labels, r=2):
    first, second = labs

    # Keep the documents on which both labels of the pair were set at least
    # once, by the same or by different coders. Membership in the label sets
    # is an exact match; the former str.contains treated the label as a
    # regular expression and also matched it as a substring of other labels.
    pair_docs = coder_sets.groupby("sample_id").filter(
        lambda doc: doc.labels.apply(lambda label_set: first in label_set).any()
        and doc.labels.apply(lambda label_set: second in label_set).any()
    )

    # one label_confusion value per remaining document
    doc_confusion = pair_docs.groupby("sample_id", group_keys=False).apply(
        lambda doc: label_confusion(doc.labels, *labs)
    )

    if doc_confusion.empty:
        mean_confusion, n_docs = 0, 0
    else:
        mean_confusion = doc_confusion.mean()
        # documents with a missing confusion value are not counted
        n_docs = len(doc_confusion[doc_confusion.notna()])

    # store the pair in both orders so that the pivoted matrix is symmetric
    rows.append((*sorted(labs, reverse=True), mean_confusion, n_docs))
    rows.append((*sorted(labs, reverse=False), mean_confusion, n_docs))

confusion = pd.DataFrame(rows, columns=["label1", "label2", "confusion", "n_docs"])

In [None]:
# correlation between mean confusion and number of documents per label pair
for method in ("pearson", "spearman"):
    print(confusion.corr(method, numeric_only=True))

           confusion   n_docs
confusion    1.00000 -0.37038
n_docs      -0.37038  1.00000
           confusion    n_docs
confusion   1.000000 -0.169795
n_docs     -0.169795  1.000000


## Pairwise Counts

- Question: How often do both labels of a pair occur on the same document?
- The two labels may have been set by the same coder or by different coders


In [None]:
# number of documents per label pair, shown without decimals
pivot(confusion, values="n_docs", precision=0)

label2,antielite,eliteless,left,pplcentr,pplmore,right
label1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
antielite,0,47,369,266,12,285
eliteless,47,0,18,12,5,11
left,369,18,0,245,16,24
pplcentr,266,12,245,0,31,80
pplmore,12,5,16,31,0,2
right,285,11,24,80,2,0


## Pairwise Confusion

- Question: How much better would the agreement have been if the worse label had been coded as the better one?
- Higher values show higher confusion


In [None]:
# mean confusion per label pair, shown with three decimals
pivot(confusion, values="confusion", precision=3)

label2,antielite,eliteless,left,pplcentr,pplmore,right
label1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
antielite,0.0,0.269,0.031,0.395,0.333,0.054
eliteless,0.269,0.0,0.241,0.25,0.625,0.25
left,0.031,0.241,0.0,0.127,0.267,0.0
pplcentr,0.395,0.25,0.127,0.0,0.583,0.333
pplmore,0.333,0.625,0.267,0.583,0.0,0.0
right,0.054,0.25,0.0,0.333,0.0,0.0


## Pairwise 2

- Change: fewer data restrictions
- Only 1 of the 2 labels has to be present for a doc to count


### Create Data


In [None]:
labels = list(df_labels.labels.unique())

# Loop-invariant part, computed once instead of once per label pair:
# one frozenset of labels per (document, coder), restricted to documents that
# were coded by more than one coder.
coder_sets = (
    df_labels.groupby(["sample_id", "username"])["labels"]
    .agg(frozenset)
    .reset_index()
    .groupby("sample_id")
    .filter(lambda doc: doc.username.nunique() > 1)
)

rows = []

for labs in it.combinations(labels, r=2):
    # keep the documents on which at least one of the two labels was set
    pair_docs = coder_sets.groupby("sample_id").filter(
        lambda doc: doc.labels.apply(lambda label_set: not label_set.isdisjoint(labs)).any()
    )

    # one label_confusion value per remaining document
    doc_confusion = pair_docs.groupby("sample_id", group_keys=False).apply(
        lambda doc: label_confusion(doc.labels, *labs)
    )

    if doc_confusion.empty:
        mean_confusion, n_docs = 0, 0
    else:
        mean_confusion = doc_confusion.mean()
        # documents with a missing confusion value are not counted
        n_docs = len(doc_confusion[doc_confusion.notna()])

    # store the pair in both orders so that the pivoted matrix is symmetric
    rows.append((*sorted(labs, reverse=True), mean_confusion, n_docs))
    rows.append((*sorted(labs, reverse=False), mean_confusion, n_docs))

confusion = pd.DataFrame(rows, columns=["label1", "label2", "confusion", "n_docs"])

In [None]:
# correlation between mean confusion and number of documents per label pair
for method in ("pearson", "spearman"):
    print(confusion.corr(method, numeric_only=True))

           confusion    n_docs
confusion   1.000000 -0.031535
n_docs     -0.031535  1.000000
           confusion   n_docs
confusion    1.00000  0.01966
n_docs       0.01966  1.00000


### Pairwise Counts 2


In [None]:
# number of documents per label pair, shown without decimals
pivot(confusion, values="n_docs", precision=0)

label2,antielite,eliteless,left,pplcentr,pplmore,right
label1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
antielite,0,376,380,419,404,374
eliteless,376,0,100,98,54,46
left,380,100,0,149,120,115
pplcentr,419,98,149,0,108,105
pplmore,404,54,120,108,0,64
right,374,46,115,105,64,0


### Pairwise Confusion 2

- Question: How much better would the agreement have been if the worse label had been coded as the better one?
- Higher values show higher confusion


In [None]:
# mean confusion per label pair, shown with three decimals
pivot(confusion, values="confusion", precision=3)

label2,antielite,eliteless,left,pplcentr,pplmore,right
label1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
antielite,0.0,0.013,0.007,0.033,0.005,0.004
eliteless,0.013,0.0,0.022,0.005,0.023,0.011
left,0.007,0.022,0.0,0.017,0.011,0.0
pplcentr,0.033,0.005,0.017,0.0,0.043,0.01
pplmore,0.005,0.023,0.011,0.043,0.0,0.0
right,0.004,0.011,0.0,0.01,0.0,0.0
