In [None]:
%load_ext rpy2.ipython

In [None]:
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import src

# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes end to end (more slowly) on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Load the dataset from the project's data directory. Later cells read its
# `tags` and `words` columns.
df_mope = pd.read_json(src.PATH / "data/mope/dataset.json")

In [None]:
def count_tags(tags):
    """Count how often each entity type is opened in a sequence of tags.

    Only tags that start with ``"B-"`` are counted; every other tag is ignored.

    Args:
        tags: Iterable of tag strings (e.g. ``"B-EPER"``, ``"I-EPER"``, ``"O"``).

    Returns:
        A ``defaultdict(int)`` mapping each tag, with its ``"B-"`` prefix
        removed, to the number of times it occurred.
    """
    counter = defaultdict(int)
    for tag in tags:
        if tag.startswith("B-"):
            # Slice off exactly the two-character prefix. str.lstrip("B-")
            # would strip *any* leading run of "B" and "-" characters, so a
            # tag such as "B-BORG" would wrongly be counted as "ORG".
            counter[tag[2:]] += 1
    return counter

In [None]:
# Tally the "B-" tags of every row into a per-row counter.
df_mope["counts"] = df_mope["tags"].apply(count_tags)
# Join each row's word list into a single space-separated string.
df_mope["text"] = df_mope["words"].apply(" ".join)

In [None]:
# Load the tokenizer and the sequence-classification model published as
# "luerhard/PopBERT", then move the model onto the configured device.
tokenizer = AutoTokenizer.from_pretrained("luerhard/PopBERT")
model = AutoModelForSequenceClassification.from_pretrained("luerhard/PopBERT")
model = model.to(DEVICE)

In [None]:
def chunks(iterable, size):
    """Yield consecutive slices of ``iterable`` holding at most ``size`` items.

    Every slice except possibly the last contains exactly ``size`` items. An
    empty input yields nothing, so callers never receive an empty batch.

    Args:
        iterable: A sized, sliceable sequence (list, tuple, str, ...).
        size: Maximum number of items per slice; must be positive.

    Yields:
        Slices of ``iterable`` in their original order.

    Raises:
        ValueError: If ``size`` is not positive (raised when iteration starts,
            because this is a generator).
    """
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    # Stepping with range() covers the exact-multiple and remainder cases
    # alike and produces no iterations at all for an empty sequence.
    for start in range(0, len(iterable), size):
        yield iterable[start : start + size]

In [None]:
# Score every sentence with the model in mini-batches of 30 and collect one
# probability vector per sentence.
outs = []
with torch.inference_mode():
    for chunk in chunks(df_mope.text.tolist(), 30):
        # padding=True pads each batch to its longest sentence. truncation=True
        # cuts over-long sentences at the tokenizer's maximum length instead of
        # letting them break the forward pass.
        # NOTE(review): assumes the tokenizer declares a model_max_length — confirm.
        encodings = tokenizer(
            chunk, padding=True, truncation=True, return_tensors="pt"
        ).to(DEVICE)
        out = model(**encodings)
        # An element-wise sigmoid turns each logit into an independent
        # probability. No .detach() is needed: tensors created under
        # inference_mode never track gradients.
        probas = torch.sigmoid(out.logits).cpu().numpy()
        # extend() adds one row (one sentence) per list element.
        outs.extend(probas)

In [None]:
# Stack the per-sentence probability vectors into one frame with a named
# column per label.
# NOTE(review): the column order is hard-coded and presumably mirrors the
# model's output label order — confirm against model.config.id2label.
probs = pd.DataFrame(np.vstack(outs), columns=["elite", "centr", "left", "right"])

In [None]:
# Attach the probability columns to the source rows.
# NOTE(review): pd.concat(axis=1) aligns on index labels and `probs` has a
# default 0..n-1 index, so this assumes df_mope has the same index — confirm.
df = pd.concat([df_mope, probs], axis=1)

In [None]:
# Per-label decision thresholds: a probability strictly above the threshold is
# binarised to 1 by apply_thresh, anything else to 0.
# NOTE(review): magic numbers with no provenance in this notebook — presumably
# tuned for this model; confirm and cite the source.
thresh = {"elite": 0.415961, "centr": 0.295400, "left": 0.429109, "right": 0.302714}


def apply_thresh(row, thresh):
    """Binarise the thresholded entries of ``row``.

    For every key in ``thresh`` the entry becomes ``1`` when the row's value is
    strictly greater than the threshold and ``0`` otherwise. Entries not named
    in ``thresh`` are passed through unchanged.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) supporting ``copy()``
            and item access for every key in ``thresh``.
        thresh: Mapping of column name to decision threshold.

    Returns:
        A modified copy of ``row``; the argument itself is left untouched.
    """
    # Work on a copy: pandas documents that functions passed to
    # DataFrame.apply must not mutate the object they receive.
    result = row.copy()
    for key, val in thresh.items():
        result[key] = 1 if row[key] > val else 0
    return result


# Binarise the probability columns row by row; DataFrame.apply forwards the
# extra `thresh` keyword argument to apply_thresh.
df_bin = df.apply(apply_thresh, axis=1, thresh=thresh)

In [None]:
# Collect every entity type that occurs in any row's counter.
ent_set = {key for d in df_bin.counts.tolist() for key in d}
# Add one boolean column per entity type: True when the row's counter has that
# key. Iterating in sorted order keeps the resulting column order identical
# across runs (plain set iteration order for strings varies between sessions).
for ent in sorted(ent_set):
    df_bin[ent] = df_bin.counts.apply(lambda x: ent in x)

In [None]:
def _entity_share_by_label(frame, label):
    """Average the EORG/PPEO/EPER columns within each value of ``label``.

    Args:
        frame: DataFrame containing ``label`` and the columns ``EORG``,
            ``PPEO`` and ``EPER``.
        label: Name of the column to group by (e.g. ``"elite"``).

    Returns:
        A DataFrame with one row per value of ``label`` and the columns
        ``val`` (the group value, with 0/1 spelled ``"<label>_0"`` /
        ``"<label>_1"``), ``EORG``, ``PPEO``, ``EPER`` (group means) and
        ``group`` (the constant ``label``).
    """
    shares = frame.groupby([label])[["EORG", "PPEO", "EPER"]].mean().reset_index()
    shares = shares.rename({label: "val"}, axis=1)
    # Spell the group values out so rows from different labels stay
    # distinguishable once the summaries are stacked.
    shares["val"] = shares["val"].replace({0: f"{label}_0", 1: f"{label}_1"})
    shares["group"] = label
    return shares


# The same summary for both labels, built by one helper instead of two
# copy-pasted pipelines.
elite = _entity_share_by_label(df_bin, "elite")
centr = _entity_share_by_label(df_bin, "centr")

In [None]:
# Stack both summaries and reshape to long format: the three entity columns
# become `variable` / `value` pairs, keyed by `val` and `group`.
grouped = pd.concat([elite, centr]).melt(
    id_vars=["val", "group"],
    value_vars=["EORG", "PPEO", "EPER"],
)
grouped

Unnamed: 0,val,group,variable,value
0,elite_0,elite,EORG,0.290181
1,elite_1,elite,EORG,0.515254
2,centr_0,centr,EORG,0.335905
3,centr_1,centr,EORG,0.304348
4,elite_0,elite,PPEO,0.30554
5,elite_1,elite,PPEO,0.371751
6,centr_0,centr,PPEO,0.284946
7,centr_1,centr,PPEO,0.885375
8,elite_0,elite,EPER,0.224904
9,elite_1,elite,EPER,0.360452


In [None]:
%%R -i grouped

library(here)
library(ggplot2)
library(tidyverse)
library(ggpubr)
library(ggeffects)
theme_set(theme_ggeffects())

# Left panel: mean of each `variable` for elite = 0 vs. elite = 1.
# The legend is switched off for this panel only.
plot_elite <- grouped %>%
    filter(group == "elite") %>%
    mutate(val = recode(val, "elite_0" = 0, "elite_1" = 1)) %>%
    # Name the x variable after what this panel shows (it was previously
    # called PeopleCentrism, copied over from the other panel).
    rename(AntiElitism = val) %>%
    ggplot(aes(x=as_factor(AntiElitism), fill=variable, y=value)) +
        geom_bar(stat="identity", position="dodge2") +
        labs(title="Anti-Elitism") +
        ylim(0 ,1) +
        theme(
            legend.position = "none",
            axis.title.x = element_blank(),
            axis.title.y = element_blank(),
            axis.ticks.x = element_blank()
        ) +
        scale_fill_manual(values=c('#f77189', '#50b131', '#3ba3ec'))

# Right panel: the same summary for centr = 0 vs. centr = 1, legend kept.
plot_pplcentr <- grouped %>%
    filter(group == "centr") %>%
    mutate(val = recode(val, "centr_0" = 0, "centr_1" = 1)) %>%
    rename(PeopleCentrism = val) %>%
    ggplot(aes(x=as_factor(PeopleCentrism), fill=variable, y=value)) +
        geom_bar(stat="identity", position="dodge2") +
        labs(title="People-Centrism") +
        ylim(0, 1) +
        theme(
            axis.title.x = element_blank(),
            axis.title.y = element_blank(),
            axis.ticks.x = element_blank()
        ) +
        scale_fill_manual(values=c('#f77189', '#50b131', '#3ba3ec'))

# Keep the arranged figure in a variable and pass it to ggsave() explicitly,
# so the saved file does not depend on whatever last_plot() happens to hold.
comparison_plot <- ggarrange(plot_elite, plot_pplcentr, ncol=2, widths=c(11, 16), heights=9)
ggsave(here("klamm_et_al_comparison.pdf"), plot=comparison_plot)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Saving 6.67 x 6.67 in image


here() starts at /mnt/nvme_storage/git/bert_populism
