In [1]:
%load_ext rpy2.ipython

- Project '~/git/PopBERT' loaded. [renv 1.0.5]


In [2]:
import pandas as pd
from sklearn import preprocessing

import src
import src.pop_aggregate as metrics

In [3]:
# Raise pandas' display limits so long text cells and larger tables are not
# truncated: up to 2048 characters per column cell and up to 256 rows.
pd.set_option("display.max_colwidth", 2048)
pd.set_option("display.max_rows", 256)

In [4]:
%%R

# Plotting and data-wrangling packages used by the R cells of this notebook.
library(tidyverse)
library(ggplot2)
library(ggpattern)
# Make theme_minimal() the default theme for every ggplot created afterwards.
theme_set(theme_minimal())
library(scales)
library(here)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     


── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors



Attaching package: ‘scales’

The following object is masked from ‘package:purrr’:

    discard

The following object is masked from ‘package:readr’:

    col_factor

here() starts at /Users/lukas/git/PopBERT


# Load Data

In [5]:
# Locate the raw sentence table and the interim prediction table, read both,
# and join them on their shared "sample_id" column (pandas' default inner join).
sentences_path = src.PATH / "data/raw/sentences.parquet.gzip"
predictions_path = src.PATH / "data/interim/sentence_predictions.parquet.gzip"

preds = pd.read_parquet(predictions_path)
df = pd.read_parquet(sentences_path).merge(preds, on="sample_id")

In [6]:
# Per-dimension cut-offs: the following cell turns each same-named column of
# `df` into 0/1 by testing `value > cut-off`.
# NOTE(review): the provenance of these numbers is not shown in this notebook —
# presumably tuned during model evaluation; confirm and cite the source.
thresh = {"elite": 0.415961, "pplcentr": 0.295400, "left": 0.429109, "right": 0.302714}

In [7]:
# Binarise every column named in `thresh`: 1 where the value is strictly above
# its cut-off, 0 otherwise. The vectorised comparison replaces a per-element
# Python lambda; missing values compare as False and therefore become 0, exactly
# as they did with the lambda.
for key, val in thresh.items():
    df[key] = (df[key] > val).astype("int64")

In [8]:
# Hex colour for each party label. The insertion order of this dict is the
# order in which names and colours are exported below.
colormap = {
    "CDU/CSU": "#000000",
    "Grüne": "#1AA037",
    "DIE LINKE": "#8B008B",
    "FDP": "#FFEF00",
    "AfD": "#0489DB",
    "SPD": "#E3000F",
}
# Split the mapping into two parallel lists (same order) so that it can be
# handed to the R cells as two plain sequences.
color_names, color_vals = (list(part) for part in zip(*colormap.items()))

In [9]:
%%R -i color_names -i color_vals
# Attach the party names to the colour values so the result can be used as
# `values=` in scale_fill_manual().
colors <- color_vals
names(colors) <- color_names

# Populism by speech


In [10]:
# Keep only speeches (grouped by electoral term, session and politician) whose
# highest `sentence_no` is greater than 3; all other groups are dropped.
# NOTE(review): the previous comment said "less than 3 sentences", which does
# not match the `> 3` cut-off on max(sentence_no) — confirm the intended minimum
# length and whether `sentence_no` starts at 0 or 1.
# A grouped transform builds the row mask in one pass instead of calling a
# Python lambda once per group; the selected rows and their order are unchanged.
speaker_session_keys = ["electoral_term", "session", "politician_id"]
max_sentence_no = df.groupby(speaker_session_keys)["sentence_no"].transform("max")
df = df[max_sentence_no > 3]

In [11]:
# Collapse the sentence-level table to one row per
# (electoral_term, session, politician_id, abbreviation) by averaging every
# numeric column; the sentence-level identifiers are removed first.
sentence_level = df.drop(columns=["speech_id", "sentence_no"])
grouped_by_speech = sentence_level.groupby(
    ["electoral_term", "session", "politician_id", "abbreviation"]
)
df_speech = grouped_by_speech.mean(numeric_only=True).reset_index()

# Multiplicative index

- The anti-elitism (`elite`) and people-centrism (`pplcentr`) scores of each speech are multiplied; the product is the populism (`pop`) score.


In [12]:
# Inspect which columns are present after the per-speech aggregation.
df_speech.columns

Index(['electoral_term', 'session', 'politician_id', 'abbreviation',
       'sample_id', 'sentence_length', 'elite', 'pplcentr', 'left', 'right'],
      dtype='object')

In [13]:
# Row-wise populism index computed by the project helper.
df_speech["pop"] = df_speech.apply(metrics.multiplicative_index, axis=1)

# Ideology-specific variants: elite x pplcentr, further multiplied by the
# right / left share of the speech.
elite_times_people = df_speech["elite"] * df_speech["pplcentr"]
df_speech["pop_right"] = elite_times_people * df_speech["right"]
df_speech["pop_left"] = elite_times_people * df_speech["left"]

# Alternative aggregation rules, each applied row by row in the same order as
# before (Goertz first, then Bollen).
for target_column, index_fn in (
    ("pop_goertz", metrics.goertz_index),
    ("pop_bollen", metrics.bollen_index),
):
    df_speech[target_column] = df_speech.apply(index_fn, axis=1)

In [14]:
# Speech-level cut-offs for the Sartori index: the 75th percentile of each
# dimension across all speeches.
# NOTE(review): this rebinds `thresh`, which earlier held the sentence-level
# classification cut-offs — re-running the binarisation cell after this one
# would pick up these values instead.
thresh = {dim: df_speech[dim].quantile(0.75) for dim in ("elite", "pplcentr")}

# DataFrame.apply forwards extra keyword arguments to the applied function, so
# `threshold` reaches metrics.sartori_index for every row.
df_speech["pop_sartori"] = df_speech.apply(
    metrics.sartori_index, axis=1, threshold=thresh
)

In [15]:
# Display label for every index column of `df_speech`. The insertion order here
# defines the order of `cols`.
index_labels = {
    "pop": "(a) Populism-Index",
    "pop_left": "(a) Left-Wing Populism-Index",
    "pop_right": "(b) Right-Wing Populism-Index",
    "pop_goertz": "(b) Goertz-Index",
    "pop_bollen": "(c) Bollen-Index",
    "pop_sartori": "(d) Sartori-Index",
}
cols = list(index_labels.values())

# Average every index per electoral term and party, using the display labels
# as column names.
labelled_speeches = df_speech.rename(
    columns={"abbreviation": "Party", "electoral_term": "Term", **index_labels}
)
mean_pop = labelled_speeches.groupby(["Term", "Party"])[cols].mean().reset_index()


# Scaled copy of `mean_pop`: each index column is passed through
# preprocessing.maxabs_scale on its own; `mean_pop` itself is left untouched.
mean_pop_norm = mean_pop.assign(
    **{col: preprocessing.maxabs_scale(mean_pop[col]) for col in cols}
)


def reshape_df(df, value_vars=None):
    """Melt a wide per-term/per-party index table into long format.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table with "Term" and "Party" columns plus one column per index.
    value_vars : list of str, optional
        Columns to unpivot. Defaults to the notebook-level ``cols`` list, so
        existing ``reshape_df(frame)`` calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        Long table with the columns "Term", "Party", "variable" (the former
        column name) and "value"; "Party" is converted to a categorical column.
    """
    if value_vars is None:
        # Fall back to the index labels defined at notebook level.
        value_vars = cols
    long_df = pd.melt(
        df, id_vars=["Term", "Party"], value_vars=value_vars, var_name="variable"
    )
    long_df["Party"] = long_df["Party"].astype("category")
    return long_df


# Convert both the raw and the scaled table to long format (one row per
# Term x Party x index).
mean_pop, mean_pop_norm = (reshape_df(wide) for wide in (mean_pop, mean_pop_norm))

In [16]:
# Party names and their hex colours as two parallel lists, in `colormap` order.
# This recomputes the same lists that were built right after `colormap` was
# defined.
color_names, color_vals = list(colormap), list(colormap.values())

In [17]:
%%R -i mean_pop -i mean_pop_norm -i color_names -i color_vals
# These library loads and the theme default repeat the setup of the first %%R
# cell of this notebook.
library(here)
library(tidyverse)
library(ggplot2)
library(ggpattern)
theme_set(theme_minimal())

# Attach the party names to the colour values; used below as the `values=`
# argument of scale_fill_manual().
colors <- setNames(color_vals, color_names)

# Shared implementation behind create_plot_pop() and create_plot_pop_ideol().
# Both functions previously carried an identical copy of this body and differed
# only in the set of index names they plotted.
#
#   df          long table with the columns Party, Term, variable and value
#   index_vars  character vector of `variable` values to plot, in facet order
#
# Returns a ggplot object with one facet per index and one bar per party and
# term (fill colour = party, bar pattern = term). Reads the global `colors`
# vector for the fill scale.
create_index_plot <- function(df, index_vars) {
    df <- df %>% filter(variable %in% index_vars)

    df$Term <- as_factor(df$Term)

    # Order the facets as given in `index_vars`.
    df$variable <- fct_relevel(df$variable, index_vars)

    # Add a zero-valued row for every Party x Term x variable combination that
    # is missing from the data, so every combination is present for dodging.
    df <- complete(df,
        Party = unique(df$Party),
        Term = unique(df$Term),
        variable = unique(df$variable),
        fill=list(value=0)
    )

    ggplot(df, aes(x=Party, y=value, fill=Party, pattern=Term)) +
        geom_bar_pattern(
            position=position_dodge(preserve="single"),
            stat="identity",
            color="grey",
            pattern_fill="grey",
            pattern_angle=45,
            pattern_density=0.1,
            pattern_spacing=0.025,
            pattern_key_scale_factor=0.6
            ) +
        scale_fill_manual(values=colors) +
        # Term 18 is drawn striped, term 19 without a pattern.
        scale_pattern_manual(values=c("18"="stripe", "19"="none")) +
        theme(
            text=element_text(size=18),
            axis.text=element_text(size=14),
            # Parties are identified through the fill legend, not the x axis.
            axis.text.x=element_blank(),
            axis.title.y=element_blank(),
            axis.title.x=element_blank(),
            strip.text=element_text(size=20)
        ) +
        facet_wrap("~variable", scales="free", ncol=2) +
        # Keep the two legends independent: the Term key shows only the
        # pattern, the Party key shows only the colour.
        guides(pattern = guide_legend(title="Term", override.aes = list(fill = "white")),
            fill = guide_legend(override.aes = list(pattern = "none")))
}

# Plot the four general populism measures, one facet each.
create_plot_pop <- function(df) {
    create_index_plot(df, c(
        "(a) Populism-Index",
        "(b) Goertz-Index",
        "(c) Bollen-Index",
        "(d) Sartori-Index"
    ))
}

# Plot the left-wing and right-wing populism indices, one facet each.
create_plot_pop_ideol <- function(df) {
    create_index_plot(df, c(
        "(a) Left-Wing Populism-Index",
        "(b) Right-Wing Populism-Index"
    ))
}

# Render and save the populism-measures figure for the raw and for the
# normalised party means.
# - The plot object is passed to ggsave() explicitly instead of relying on its
#   `plot = last_plot()` default.
# - print() makes the rendering explicit instead of depending on auto-printing
#   of a bare object name inside a multi-statement cell.
# - The output directory is created first so ggsave() does not fail on a fresh
#   checkout; dir.create() does nothing if it already exists.
dir.create(here("results", "figures"), recursive=TRUE, showWarnings=FALSE)

plot <- create_plot_pop(mean_pop)
print(plot)
ggsave(here("results", "figures", "populism_by_party_measures_orig.pdf"), plot=plot, width=16, height=9)

plot <- create_plot_pop(mean_pop_norm)
print(plot)
ggsave(here("results", "figures", "populism_by_party_measures_normalized.pdf"), plot=plot, width=16, height=9)
