In [None]:
from collections import Counter
from collections import OrderedDict

import altair as alt
import pandas as pd
import spacy
from asciitree import LeftAligned
from asciitree import drawing
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics.distance import jaccard_distance
from sqlalchemy.orm import Session
from sqlalchemy.orm import joinedload
from tqdm import tqdm

import src
import src.db.models.doccano as m
from src.coding.labels import LABELS
from src.db.connect import make_engine
from src.db.sample import Sample

In [None]:
# Notebook-wide setup: project name, pandas display limits, database session.
project = "PBert AnnoTask 5"

# Raise both pandas display limits to 512 so long sentences and frames are shown.
for display_option in ("display.max_colwidth", "display.max_rows"):
    pd.set_option(display_option, 512)

# Engine for the "DOCCANO" connection and the session used by the queries in this notebook.
engine = make_engine("DOCCANO")
session = Session(bind=engine)

In [None]:
# Prepare the sentence tokenizer: load the German spaCy pipeline and switch off
# the components listed below.
nlp = spacy.load("de_core_news_md")

# `Language.disable_pipes` is deprecated since spaCy 3.0 (it emits a
# DeprecationWarning and forwards to `select_pipes`), so call the replacement
# directly. The result is bound to `_` only to keep it out of the cell output.
_ = nlp.select_pipes(disable=["tagger", "morphologizer", "lemmatizer", "attribute_ruler", "ner"])

# 1. Coding scheme / Intro

## Overarching Goal

We are trying to build a classifier that predicts whether a sentence contains populism --- and, if so, whether it is left-wing or right-wing populism.

## Data

We use the protocols (transcripts) of parliamentary debates in the German Bundestag.

### Sparse Data

Populism is not very frequent in our data.
To avoid coding almost exclusively negative samples, we use **existing dictionary approaches to classify sentences** and draw stratified samples.



## Problem 1: Level of coding

- We currently have a heavily nested coding schema based on theoretical considerations (apologies for the German/English mix...)
- **Question: What level is helpful for coding?**
- **Question: What level is helpful to build the classifier?**
  - Should these two align?


In [None]:
# Render the coding scheme as an ASCII tree: side ("lr") -> dimension ("dim") -> label.
tr = LeftAligned(draw=drawing.BoxStyle(gfx=drawing.BOX_HEAVY, horiz_len=10, indent=1))

data = {"labels": OrderedDict()}
order = ["neutral", "rechts", "links"]

# (label, metadata) pairs grouped by side in the sequence given by `order`;
# entries whose "lr" value is not listed in `order` are left out.
items = []
for side in order:
    items.extend((label, meta) for label, meta in LABELS.items() if meta["lr"] == side)

# Hang every label under its side and dimension, creating each branch on first use.
for label, meta in items:
    side_branch = data["labels"].setdefault(meta["lr"], OrderedDict())
    side_branch.setdefault(meta["dim"], OrderedDict())[label] = {}

print(tr(data))

labels
 ┣━━━━━━━━━━ neutral
 ┃           ┣━━━━━━━━━━ anti-elite
 ┃           ┃           ┣━━━━━━━━━━ Eliten sind korrumpiert
 ┃           ┃           ┣━━━━━━━━━━ Eliten sind schuld
 ┃           ┃           ┗━━━━━━━━━━ Eliten repräsentieren nicht das Volk
 ┃           ┣━━━━━━━━━━ people-centrism
 ┃           ┃           ┣━━━━━━━━━━ Sprecher repräsentiert Volk
 ┃           ┃           ┣━━━━━━━━━━ Volk ist tugendhaft
 ┃           ┃           ┣━━━━━━━━━━ Volk als homogene Masse
 ┃           ┃           ┗━━━━━━━━━━ Volk als Verursacher einer positiven Entwicklung
 ┃           ┗━━━━━━━━━━ volkssouveränität
 ┃                       ┣━━━━━━━━━━ Weniger Macht für Eliten
 ┃                       ┗━━━━━━━━━━ Mehr Macht für Volk
 ┣━━━━━━━━━━ rechts
 ┃           ┣━━━━━━━━━━ nativismus
 ┃           ┃           ┣━━━━━━━━━━ Exklusion von Randgruppen
 ┃           ┃           ┣━━━━━━━━━━ Nicht-Einheimische sind korrumpiert / schuld
 ┃           ┃           ┗━━━━━━━━━━ Einheimisches Volk ist tugendhaft /

## Problem 2: Context is needed

- Coders suggested that a single sentence is often not enough to decide on a label (especially when it is unclear what some elements of the sentence refer to)

- Because of this, we started to provide "context", i.e. the two surrounding sentences, in the form of:

&nbsp;

```
Context sentence 1
----------
Sentence to code
----------
Context sentence 2
```

&nbsp;

- **Question: Does this even make any sense?**
    - Can we still train a BERT-Classifier on the middle-sentence or should we give all 3?

## Problem 3: Majority voting(?)

- The theoretical definitions of Populism are very fuzzy.

- Current solution: We allow a label if at least **two coders agree** on it.
    - Is this a viable option?
    - Should we use something else like a majority vote?
    - Often, the majority would be "none".

In [None]:
# Query every example of the project together with its labels and state rows.
q = (
    session.query(m.ExamplesExample)
    .options(joinedload(m.ExamplesExample.labels), joinedload(m.ExamplesExample.state))
    .join(m.ExamplesExample.project)
    .filter(
        m.ProjectsProject.name == project,
        # keeps only examples that have at least one state row (presumably: confirmed
        # by at least one coder -- TODO confirm); comment this line out to load all examples
        m.ExamplesExample.state.any(),
    )
)

# Map each fine-grained label to its "lr" group from the coding scheme.
label_dict = {label: meta["lr"] for label, meta in LABELS.items()}
samples = [Sample(row, nlp, label_dict) for row in q]

# "Not applicable" appears as 'NICHT ZUTREFFEND' in raw label counts but as
# 'nicht zutr' once labels are collapsed to their "lr" group, as they are for
# these samples. Testing only the raw spelling therefore filtered nothing here,
# so a sample is excluded when either spelling occurs in its label counts.
not_applicable = {"NICHT ZUTREFFEND", "nicht zutr"}
df = pd.DataFrame(
    {"n_coders": [len(s.confirmed_by) for s in samples if not_applicable.isdisjoint(s.label_counts)]}
)
print(len(df))

1535


In [None]:
%%capture --no-display

alt.Chart(df).mark_bar().encode(alt.X("n_coders:O"), alt.Y("count(*):Q")).properties(
    width=800, height=400, title="Number of coders per Sample"
)

In [None]:
# Example case: print the sentences and the label counts of example 2936.
example_id = 2936
row = session.query(m.ExamplesExample).filter(m.ExamplesExample.id == example_id).one()
sample = Sample(row, nlp)

# Sentences separated by a dashed line, then a blank line, then the label counts.
separator = "\n------\n"
print(separator.join(sample.sents), "", sample.label_counts, sep="\n")

Über 60 Prozent der deutschen Waffenexporte gehen mittlerweile an Länder außerhalb der NATO, obwohl in diesen Politischen Grundsätzen steht, dass das die riesengroße Ausnahme sein soll.
------
Über 60 Prozent!
------
Das haben Sie von der CDU verbrochen.

Counter({'Eliten sind schuld': 2, 'none': 1})


# 2. Modeling

## Problem 4: Whether or not to use multilabel-classification?

There exist dimensions (and therefore phrases) that can be classified into left-wing (_links_) or right-wing (_rechts_) populism.

But there are also dimensions that can belong to either side or to neither of the two (_neutral_).

Our idea was to use multilabel-classification to distinguish between cases that are:

- neutral (either) + left
- neutral (either) + right
- neutral but neither right nor left

---

## Problem 5: Class imbalance

- We have extremely imbalanced data
- **Question: Are there (dis-)advantages in *down-sampling* vs. *down-weighting* the majority class (none)?**


--> maybe show example code...

In [None]:
# Frequency of each label set when at least two coders have to agree on a label
agreed_label_sets = [frozenset(s.labels) for s in samples]
Counter(agreed_label_sets).most_common()

[(frozenset({'none'}), 1395),
 (frozenset({'neutral'}), 84),
 (frozenset({'nicht zutr'}), 35),
 (frozenset({'links'}), 15),
 (frozenset({'links', 'neutral'}), 4),
 (frozenset({'neutral', 'rechts'}), 2)]

In [None]:
# Raw labels: frequency of each set of keys found in a sample's label_counts
raw_label_sets = Counter()
for s in samples:
    raw_label_sets[frozenset(s.label_counts)] += 1
raw_label_sets.most_common()

[(frozenset({'none'}), 1169),
 (frozenset({'neutral', 'none'}), 150),
 (frozenset({'nicht zutr', 'none'}), 117),
 (frozenset({'links', 'none'}), 54),
 (frozenset({'neutral'}), 15),
 (frozenset({'links', 'neutral', 'none'}), 12),
 (frozenset({'none', 'rechts'}), 5),
 (frozenset({'neutral', 'none', 'rechts'}), 4),
 (frozenset({'links', 'neutral'}), 2),
 (frozenset({'neutral', 'rechts'}), 2),
 (frozenset({'neutral', 'nicht zutr', 'none'}), 1),
 (frozenset({'links', 'none', 'rechts'}), 1),
 (frozenset({'links', 'nicht zutr', 'none'}), 1),
 (frozenset({'links'}), 1),
 (frozenset({'links', 'neutral', 'none', 'rechts'}), 1)]

In [None]:
# Rebuild `samples` without passing `label_dict`, then count for every label the
# number of samples whose label_counts contain it.
samples = [Sample(row, nlp) for row in q]

label_frequencies = Counter()
for sample in samples:
    # list(...) yields each key once, so a label counts once per sample
    label_frequencies.update(list(sample.label_counts))
label_frequencies.most_common()

[('none', 1515),
 ('NICHT ZUTREFFEND', 119),
 ('Eliten sind korrumpiert', 114),
 ('Eliten sind schuld', 94),
 ('Inklusion von Randgruppen', 43),
 ('Ökonomische Eliten sind korrumpiert / schuld', 26),
 ('Eliten repräsentieren nicht das Volk', 15),
 ('Sprecher repräsentiert Volk', 9),
 ('Exklusion von Randgruppen', 8),
 ('Volk ist tugendhaft', 8),
 ('Nicht-Einheimische sind korrumpiert / schuld', 4),
 ('Volk als Verursacher einer positiven Entwicklung', 4),
 ('Volk als homogene Masse', 3),
 ('Mehr Macht für Volk', 3),
 ('Arbeiterklasse ist tugendhaft / Verursacher für positive Entwicklung', 3),
 ('Einheimisches Volk ist tugendhaft / Verursacher für positive Entwicklung',
  2),
 ('Recht und Ordnung', 1),
 ('Weniger Macht für Eliten', 1)]

# Ignore for now: Current state of coding


In [None]:
# One row per sample that has no 'NICHT ZUTREFFEND' entry in its label_counts:
# the sample's labels and the number of distinct keys in label_counts.
rows = []
for s in samples:
    if "NICHT ZUTREFFEND" in s.label_counts:
        continue
    rows.append((s.labels, len(s.label_counts)))

df = pd.DataFrame(rows, columns=["labels", "n_labels"])
print(df.describe())

          n_labels
count  1535.000000
mean      1.241694
std       0.460522
min       1.000000
25%       1.000000
50%       1.000000
75%       1.000000
max       4.000000


### Agreement


In [None]:
# One row per (coder, example) pair holding the set of labels that coder assigned.
# Pairs whose labels contain 'NICHT ZUTREFFEND' are skipped.
rows = []
for s in samples:
    for coder in s.user_labels:
        labels = s.user_labels[coder]
        # The condition previously read `... not in labels not in labels`, a chained
        # comparison that additionally evaluated `labels not in labels`; only the
        # single membership test is intended.
        if "NICHT ZUTREFFEND" not in labels:
            rows.append((coder, s.example.id, frozenset(labels)))

df = pd.DataFrame(rows, columns=["coder", "example", "labels"])


def krippendorff_alpha(df):
    """Compute the agreement alpha over coders' label sets using Jaccard distance.

    Args:
        df: frame with the columns ``coder``, ``example`` and ``labels``.

    Returns:
        The value of ``AnnotationTask.alpha()`` for these annotations.
    """
    annotations = list(zip(df["coder"], df["example"], df["labels"]))
    agreement = AnnotationTask(distance=jaccard_distance)
    agreement.load_array(annotations)
    return agreement.alpha()

In [None]:
def remove_none_disagreement(d):
    """Decide whether a group of annotations is kept by ``groupby(...).filter``.

    Args:
        d: object whose ``labels`` attribute iterates over one label collection
            per coder (here: the rows of one example).

    Returns:
        False if any coder used "nicht zutr" or if "none" is the only label
        used across all coders; True otherwise.
    """
    seen = set()
    for coder_labels in d.labels:
        seen.update(coder_labels)
    return "nicht zutr" not in seen and seen != {"none"}


# Compare the agreement alpha on all annotations with the alpha on only those
# examples that pass `remove_none_disagreement`.
# (The unused, never-filled `results` frame was dropped.)
rows = [("labels", krippendorff_alpha(df))]

df_no_none = df.groupby("example").filter(remove_none_disagreement)
rows.append(("all_labels_no_none", krippendorff_alpha(df_no_none)))

# len(df_no_none) counts annotation rows while nunique() counts examples; the
# message previously joined them as "<rows> of <examples> samples", which read
# as a fraction. Report the two quantities with their own units.
print(
    f"Full DF remains: {len(df_no_none) / len(df):.2%} ({len(df_no_none)} annotations on "
    f"{df_no_none.example.nunique()} samples)"
)

pd.DataFrame(rows, columns=["type", "alpha"])

Full DF remains: 16.12% (969 of 247 samples)


Unnamed: 0,type,alpha
0,labels,0.305255
1,all_labels_no_none,0.064678


# Something is wrong with our data....

In [None]:
# Load the CoreSentences Feather file from the project's data directory into `df`.
core_sentences_path = src.PATH / "data/PBert_AnnoTask5_CoreSentences.arrow"
df = pd.read_feather(core_sentences_path)



- Example "speech": 819866

- This is the only sentence of the "speech"...


In [None]:
# All rows whose speeches_id is 819866
df[df["speeches_id"] == 819866]

Unnamed: 0,faction_id,speeches_id,sentence_no,sentence,electoral_term,abbreviation,full_name,pop_dict_score
41,-1.0,819866.0,1,Frau Zypries .,18.0,not found,not found,1.0
