In [None]:
%load_ext rpy2.ipython

In [None]:
import matplotlib as mlp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sqlalchemy import case
from sqlalchemy import func
from sqlalchemy import literal
from sqlalchemy.orm import Query

import src
import src.db.models.bert_data as bm
import src.db.models.open_discourse as od

In [None]:
# Widen the notebook display so long text cells and long tables are not cut off.
pd.options.display.max_colwidth = 2048
pd.options.display.max_rows = 256

# Database engine; the queries further down open their connections from it.
engine = src.db.connect.make_engine("DB")

In [None]:
# Per-label cut-offs: further down, a prediction score at or above its cut-off
# is coded as 1 and a score below it as 0.
thresh = dict(
    elite=0.5013018,
    centr=0.5017193,
    left=0.42243505,
    right=0.38281676,
)

In [None]:
# Hex colour per party label; the order of the entries is the order in which
# names and colours are later handed to the plotting cell.
colormap = dict(
    [
        ("CDU/CSU", "#000000"),
        ("Grüne", "#1AA037"),
        ("DIE LINKE.", "#8B008B"),
        ("FDP", "#FFEF00"),
        ("AfD", "#0489DB"),
        ("SPD", "#E3000F"),
    ]
)

# Analysis: How many speeches per day per politician?


In [None]:
# One row per speech in electoral terms 18 and 19 whose politician_id is not -1
# (presumably the placeholder for unidentified speakers -- TODO confirm),
# with the highest sentence_no among the speech's samples exposed as `n_sents`.
speech_columns = (
    od.Speech.session,
    od.Speech.electoral_term,
    od.Speech.politician_id,
    od.Speech.id,
)

query = (
    Query(od.Speech)
    .join(bm.Sample)
    .filter(od.Speech.electoral_term.in_([18, 19]))
    .filter(od.Speech.politician_id != -1)
    .group_by(*speech_columns)
    .with_entities(
        *speech_columns,
        func.max(bm.Sample.sentence_no).label("n_sents"),
    )
)

# Execute the statement and pull the result into pandas.
with engine.connect() as conn:
    test_df = pd.read_sql(query.statement, conn)

In [None]:
# Split each politician's speeches within one session into "blocks": speeches
# whose ids lie close together share a block label, a larger jump in the ids
# opens a new block. Labels are numbered consecutively across all sessions.
MAX_ID_GAP = 7  # an id jump larger than this opens a new block
MIN_BLOCK_SENTS = 4  # a multi-speech block is kept only if its n_sents sum exceeds this


def label_blocks(speech, last_label, max_gap=MAX_ID_GAP):
    """Return a copy of ``speech`` sorted by ``id`` with block labels attached.

    Three columns are added: ``prev_id`` (id of the preceding row), ``gap``
    (difference to that id) and ``group`` (running block label). The first row
    always opens a block; every later row opens one when its ``gap`` is larger
    than ``max_gap``.

    Args:
        speech: rows of one (electoral_term, session, politician_id) group;
            must contain an ``id`` column.
        last_label: highest block label already handed out; the labels assigned
            here start at ``last_label + 1``.
        max_gap: largest id difference that still continues the current block.

    Returns:
        A new DataFrame. The input frame is not modified, which avoids writing
        into the sub-frames handed out by ``groupby``.
    """
    labelled = speech.reset_index(drop=True).sort_values("id", ascending=True)
    labelled["prev_id"] = labelled["id"].shift(1)
    labelled["gap"] = labelled["id"] - labelled["prev_id"]
    # The first row has no predecessor (its gap is NaN), so mark it by position.
    opens_block = (labelled["gap"] > max_gap).to_numpy() | (np.arange(len(labelled)) == 0)
    labelled["group"] = last_label + opens_block.cumsum()
    return labelled


speeches = test_df.groupby(["electoral_term", "session", "politician_id"])

cache = []
group_ix = 0  # highest block label handed out so far

for _, speech in speeches:
    if len(speech) == 1:
        # NOTE(review): a lone speech is kept whatever its n_sents, whereas
        # multi-speech blocks below must exceed MIN_BLOCK_SENTS. Confirm that
        # this asymmetry is intended before relying on the counts.
        group_ix += 1
        cache.append(speech.assign(group=group_ix))
        continue

    labelled = label_blocks(speech, group_ix)
    # Labels only grow along the sorted rows, so the last one is the maximum.
    # Dropped blocks still consume their label, keeping the numbering stable.
    group_ix = int(labelled["group"].iloc[-1])
    cache.extend(
        block
        for _, block in labelled.groupby("group")
        if block["n_sents"].sum() > MIN_BLOCK_SENTS
    )

In [None]:
# Stack every retained block into one frame. NOTE: this rebinds `test_df`,
# which until here held the raw query result. Rows cached from single-speech
# groups have no `prev_id`/`gap` columns, so concat fills those with NaN.
test_df = pd.concat(cache)

In [None]:
# Spot check: all rows of politician 11002636 in session 3.
test_df.query("session == 3 and politician_id == 11002636")

Unnamed: 0,session,electoral_term,politician_id,id,n_sents,group,prev_id,gap
0,3,18,11002636,794237,51,46,,
1,3,18,11002636,794239,3,46,794237.0,2.0
2,3,18,11002636,794251,12,47,794239.0,12.0


In [None]:
# Number of distinct blocks per electoral term, session and politician.
session_key = ["electoral_term", "session", "politician_id"]
counts = test_df.groupby(session_key)["group"].nunique()

In [None]:
# How often each number of blocks occurs (absolute frequencies).
counts.value_counts()

1    28753
2     3049
3      392
4       57
5       11
6        3
7        1
Name: group, dtype: int64

In [None]:
# Share of politician-session combinations for each number of blocks.
counts.value_counts(normalize=True)

1    0.891124
2    0.094496
3    0.012149
4    0.001767
5    0.000341
6    0.000093
7    0.000031
Name: group, dtype: float64

# Populism by speech


In [None]:
def _binarize(score, cutoff, name):
    """Build a labelled SQL CASE: 1 if ``score >= cutoff``, 0 if ``score < cutoff``.

    There is no ELSE branch, so a NULL score yields NULL rather than 0.
    """
    return case(
        (score >= cutoff, literal(1)),
        (score < cutoff, literal(0)),
    ).label(name)


# Sentence-level rows for electoral terms 18 and 19: samples with
# sentence_length > 2 whose faction is not "Fraktionslos", joined with their
# predictions. Each prediction score is reduced to a 0/1 flag via `thresh`.
query = (
    Query(bm.Sample)
    .join(bm.Prediction)
    .join(od.Speech)
    .join(od.Faction)
    .filter(bm.Sample.sentence_length > 2)
    .filter(od.Speech.electoral_term.in_([18, 19]))
    .filter(od.Faction.abbreviation != "Fraktionslos")
    .with_entities(
        # bm.Sample.text,
        od.Speech.id.label("speech_id"),
        od.Speech.electoral_term,
        od.Speech.session,
        od.Speech.politician_id,
        bm.Sample.sentence_no,
        od.Faction.abbreviation,
        bm.Sample.pop_dict_score,
        _binarize(bm.Prediction.elite, thresh["elite"], "antielite"),
        _binarize(bm.Prediction.pplcentr, thresh["centr"], "pplcentr"),
        _binarize(bm.Prediction.left, thresh["left"], "left"),
        _binarize(bm.Prediction.right, thresh["right"], "right"),
    )
)

In [None]:
# Run the current `query` and load its result into a DataFrame.
with engine.connect() as conn:
    df_raw = pd.read_sql(sql=query.statement, con=conn)

In [None]:
# Keep only politician-session groups whose highest sentence_no exceeds 3.
# A "speech" here is everything one politician said in one session (the rows
# are grouped by electoral_term, session and politician_id, not by speech_id).
# The group maximum is broadcast back to the rows with transform(), so the
# selection is one vectorised comparison instead of a Python call per group.
speaker_session = ["electoral_term", "session", "politician_id"]
last_sentence_no = df_raw.groupby(speaker_session)["sentence_no"].transform("max")
df_raw = df_raw[last_sentence_no > 3]

In [None]:
# Collapse the sentence rows to one row per politician, session and party by
# averaging every remaining column.
speech_level_keys = ["electoral_term", "session", "politician_id", "abbreviation"]
sentence_scores = df_raw.drop(columns=["speech_id", "sentence_no"])
df_speech = sentence_scores.groupby(speech_level_keys).mean().reset_index()

# Multiplicative index

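- The populism score (`pop`) is the product of `antielite` and `pplcentr`; the left- and right-wing variants additionally multiply by `left` and `right`, respectively.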


In [None]:
# Multiplicative index: anti-elitism share times people-centrism share, and
# the same product weighted by the right / left share for the two variants.
pop_core = df_speech["antielite"] * df_speech["pplcentr"]
df_speech["pop"] = pop_core
df_speech["pop_right"] = pop_core * df_speech["right"]
df_speech["pop_left"] = pop_core * df_speech["left"]

In [None]:
# Display label of each index, keyed by its column name in df_speech.
index_labels = {
    "pop_dict_score": "(a) Gründl (2022)",
    "pop": "(b) Populism-Index",
    "pop_left": "(c) Left-Wing Populism-Index",
    "pop_right": "(d) Right-Wing Populism-Index",
}
cols = list(index_labels.values())

# Mean of every index per electoral term and party (wide layout).
mean_pop = (
    df_speech.rename(
        columns={"abbreviation": "Party", "electoral_term": "Term", **index_labels}
    )
    .groupby(["Term", "Party"])[cols]
    .mean()
    .reset_index()
)

# Same table with each index column rescaled by sklearn's maxabs_scale.
mean_pop_norm = mean_pop.assign(
    **{label: preprocessing.maxabs_scale(mean_pop[label]) for label in cols}
)


def reshape_df(df):
    """Melt the index columns in ``cols`` into long form.

    The result has the columns ``Term``, ``Party``, ``variable`` (index label)
    and ``value``; ``Party`` is converted to a categorical.
    """
    long_df = df.melt(id_vars=["Term", "Party"], value_vars=cols, var_name="variable")
    return long_df.assign(Party=long_df["Party"].astype("category"))


mean_pop = reshape_df(mean_pop)
mean_pop_norm = reshape_df(mean_pop_norm)


In [None]:
# Party labels and their colours as two parallel lists (inputs of the R cell).
color_names = [party for party in colormap]
color_vals = [colormap[party] for party in color_names]

In [None]:
%%R -i mean_pop -i mean_pop_norm -i color_names -i color_vals

library(tidyverse)
library(ggplot2)
library(ggpattern)
theme_set(theme_minimal())

# Named vector: party label -> hex colour, used for the bar fill.
colors <- setNames(color_vals, color_names)

# NOTE(review): machine-specific absolute output directory; adjust it (or make
# it configurable) before running the notebook on another machine.
fig_dir <- "/home/lukas/overleaf/bert_populism/Figures"

# Draw one bar per party and electoral term, faceted by populism index.
#
# df: long data frame with the columns Term, Party, variable and value.
# Returns the ggplot object.
create_plot <- function(df) {
    df$Term <- as_factor(df$Term)
    df$variable <- fct_relevel(df$variable, c("(a) Gründl (2022)", "(b) Populism-Index", "(c) Left-Wing Populism-Index", "(d) Right-Wing Populism-Index"))
    # Add a zero-valued row for every Party x Term x variable combination that
    # is missing (presumably so a party absent in one term keeps its bar slot).
    df <- complete(df, Party = unique(df$Party), Term = unique(df$Term), variable = unique(df$variable), fill=list(value=0))
    # aes() replaces the deprecated aes_string(); the mapping is unchanged.
    ggplot(df, aes(x=Party, y=value, fill=Party, pattern=Term)) +
        geom_bar_pattern(
            position=position_dodge(preserve="single"),
            stat="identity",
            color="grey",
            pattern_fill="grey",
            pattern_angle=45,
            pattern_density=0.1,
            pattern_spacing=0.025,
            pattern_key_scale_factor=0.6
            ) +
        scale_fill_manual(values=colors) +
        # Term 18 bars are striped, term 19 bars are plain.
        scale_pattern_manual(values=c("18"="stripe", "19"="none")) +
        theme(
            text=element_text(size=18),
            axis.text=element_text(size=14),
            axis.text.x=element_blank(),
            axis.title.y=element_blank(),
            axis.title.x=element_blank(),
            strip.text=element_text(size=20)
        ) +
        facet_wrap(~variable, scales="free") +
        guides(pattern = guide_legend(title="Term", override.aes = list(fill = "white")),
            fill = guide_legend(override.aes = list(pattern = "none")))
}

# Raw index values. The plot is passed to ggsave() explicitly instead of
# relying on last_plot().
plot <- create_plot(mean_pop)
plot
ggsave(file.path(fig_dir, "populism_by_party_orig.pdf"), plot=plot, width=16, height=9)

# Max-abs normalised index values.
plot <- create_plot(mean_pop_norm)
plot
ggsave(file.path(fig_dir, "populism_by_party_normalized.pdf"), plot=plot, width=16, height=9)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


1: In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
  libraries ‘/usr/local/lib/R/site-library’, ‘/usr/lib/R/site-library’ contain no packages
2: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
generated. 
