In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import sqlalchemy as sa
from sqlalchemy.orm import Query
from sqlalchemy.orm import Session
from sqlalchemy.orm import joinedload

import src.db.models.bert_data as bm
import src.db.models.open_discourse as od
from src.db.connect import make_engine

In [None]:
# Directory the generated LaTeX tables are written to.
# The original location stays the default; set BERT_POPULISM_TABLES_DIR to write
# somewhere else without editing the notebook (an empty value falls back to the
# default as well).
out_path = Path(
    os.environ.get("BERT_POPULISM_TABLES_DIR")
    or "/home/lukas/overleaf/bert_populism/tables"
)

In [None]:
# Build the database engine with the project helper and open an ORM session on it.
# NOTE(review): "DB" is presumably the name of a connection/config entry —
# confirm against src.db.connect.make_engine.
engine = make_engine("DB")
s = Session(engine)

In [None]:
def add_midrules(latex: str) -> str:
    r"""Insert ``\midrule`` separators into a rendered LaTeX table.

    A ``\midrule`` line is emitted directly before every line that starts with
    ``\multirow`` and before every line that starts with ``sum``; all other
    lines are passed through unchanged.

    The text is split with ``str.splitlines`` and re-joined with a single
    newline, so a trailing newline in ``latex`` is not kept.
    """
    # A line can only start with one of the two prefixes, so a single
    # tuple-based check covers both cases.
    triggers = (r"\multirow", "sum")
    out = []
    for row in latex.splitlines():
        if row.startswith(triggers):
            out.append(r"\midrule")
        out.append(row)
    return "\n".join(out)

# How many speeches / sentences?


In [None]:
# Report the faction "DIE LINKE." as "DIE LINKE"; every other abbreviation is
# passed through unchanged.
party_label = sa.case(
    (od.Faction.abbreviation == "DIE LINKE.", sa.literal("DIE LINKE")),
    else_=od.Faction.abbreviation,
).label("abbreviation")

# The highest sentence number within a group is used as its sentence count.
# NOTE(review): this assumes sentence_no is 1-based and contiguous — confirm.
sentence_count = sa.func.max(bm.Sample.sentence_no).label("n_sents")

# One result row per (electoral term, session, faction, politician); downstream
# cells count each such row as one speech.
# NOTE(review): a politician speaking more than once in a session would be
# merged into a single row — confirm this is intended.
query = (
    Query(od.Speech)
    .join(bm.Sample)
    .join(od.Faction)
    .filter(
        od.Speech.electoral_term.in_([18, 19]),
        od.Faction.id != -1,
        od.Speech.politician_id != -1,
    )
    .group_by(
        od.Speech.electoral_term,
        od.Speech.session,
        od.Faction.abbreviation,
        od.Speech.politician_id,
    )
    .with_entities(od.Speech.electoral_term, party_label, sentence_count)
)

with engine.connect() as conn:
    df_sents = pd.read_sql(query.statement, conn)

# Switch to the column names used in the published table.
df_sents = df_sents.set_axis(["Term", "Party", "sentences"], axis="columns")

In [None]:
# Preview the first rows of the per-speech sentence counts.
df_sents.head()

Unnamed: 0,Term,Party,sentences
0,18,CDU/CSU,4
1,18,CDU/CSU,31
2,18,DIE LINKE,51
3,18,Grüne,44
4,18,SPD,2


In [None]:
# Aggregate the per-speech sentence counts into one row per (Term, Party).
#
# The aggregations are given by name so pandas uses its own groupby
# implementations. Passing the callables `sum`, `np.mean` and `np.std` is
# deprecated, and once pandas calls them directly `np.std` would compute the
# population standard deviation (ddof=0) instead of the sample standard
# deviation (ddof=1) that the groupby "std" yields.
# "size" counts the rows per group, like `len` did.
table = (
    df_sents.groupby(["Term", "Party"])["sentences"]
    .agg(["size", "sum", "mean", "std"])
    .rename(
        columns={
            "size": "Speeches",
            "sum": "Sentences",
            "mean": "Avg. sentences",
            "std": "Std. sentences",
        }
    )
    # Within each term, list the parties with the most speeches first.
    .sort_values(["Term", "Speeches"], ascending=[True, False])
    .reset_index()
)

# Totals row: "sum" goes into the Term column, Party and the two per-speech
# statistics stay blank (empty strings, which makes those columns object dtype).
totals_row = pd.DataFrame(
    [["sum", "", table["Speeches"].sum(), table["Sentences"].sum(), "", ""]],
    columns=table.columns,
)

table = pd.concat([table, totals_row]).set_index(["Term", "Party"])

In [None]:
# Display the per-party summary including the trailing "sum" row.
table

Unnamed: 0_level_0,Unnamed: 1_level_0,Speeches,Sentences,Avg. sentences,Std. sentences
Term,Party,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18,CDU/CSU,4437,251109,56.59432,23.651102
18,SPD,3397,177056,52.121283,21.805571
18,Grüne,2831,116538,41.164959,27.980432
18,DIE LINKE,2444,108251,44.292553,29.498537
18,Fraktionslos,2,52,26.0,2.828427
19,CDU/CSU,5104,225145,44.111481,16.806128
19,SPD,3604,150928,41.877913,17.133557
19,AfD,3020,109504,36.259603,17.292627
19,FDP,2622,93045,35.48627,18.412118
19,Grüne,2495,88244,35.368337,18.336694


In [None]:
# Render the summary table as a booktabs-style LaTeX tabular.
tex = (
    table.style.set_table_styles(
        [
            {"selector": "toprule", "props": ":toprule;"},
            {"selector": "bottomrule", "props": ":bottomrule;"},
        ]
    )
    .format(thousands=",", precision=3)
    .to_latex(
        # Top-align the Term cell that spans several party rows.
        multirow_align="t",
        # The blank cells of the "sum" row make the two float columns object
        # dtype, which the default column format left-aligns; right-align every
        # value column explicitly instead.
        column_format="l" * table.index.nlevels + "r" * len(table.columns),
    )
)

lines = tex.splitlines()

# Locate the header by content rather than by a fixed line number.
top = next(i for i, line in enumerate(lines) if line.strip() == r"\toprule")

# The Styler writes two header rows after \toprule (column labels, then index
# names); collapse them into one combined row followed by a \midrule.
header = " & ".join(str(name) for name in [*table.index.names, *table.columns])
header += r" \\\midrule"

new = lines[: top + 1] + [header]
for line in lines[top + 3 :]:
    if line.startswith("sum"):
        # Separate the totals row from the per-party rows above it.
        new[-1] += r"\midrule"
    new.append(line)

tex = "\n".join(new)

# Party names contain non-ASCII characters, so fix the encoding instead of
# relying on the platform default.
(out_path / "n_dataset.tex").write_text(tex, encoding="utf-8")

print(tex)

\begin{tabular}{llrrll}
\toprule
Term & Party & Speeches & Sentences & Avg. sentences & Std. sentences \\\midrule
\multirow[t]{5}{*}{18} & CDU/CSU & 4,437 & 251,109 & 56.594 & 23.651 \\
 & SPD & 3,397 & 177,056 & 52.121 & 21.806 \\
 & Grüne & 2,831 & 116,538 & 41.165 & 27.980 \\
 & DIE LINKE & 2,444 & 108,251 & 44.293 & 29.499 \\
 & Fraktionslos & 2 & 52 & 26.000 & 2.828 \\
\multirow[t]{7}{*}{19} & CDU/CSU & 5,104 & 225,145 & 44.111 & 16.806 \\
 & SPD & 3,604 & 150,928 & 41.878 & 17.134 \\
 & AfD & 3,020 & 109,504 & 36.260 & 17.293 \\
 & FDP & 2,622 & 93,045 & 35.486 & 18.412 \\
 & Grüne & 2,495 & 88,244 & 35.368 & 18.337 \\
 & DIE LINKE & 2,321 & 84,132 & 36.248 & 20.005 \\
 & Fraktionslos & 71 & 2,022 & 28.479 & 7.107 \\\midrule
sum &  & 32,348 & 1,406,026 &  &  \\
\bottomrule
\end{tabular}


# How many sentences were labeled?


In [None]:
# Rebind `s` to a new session on the same engine for the label queries below.
# NOTE(review): the session opened in the setup cell is replaced here without
# being closed.
s = Session(engine)

In [None]:
# Report the faction "DIE LINKE." as "DIE LINKE"; every other abbreviation is
# passed through unchanged.
party_label = sa.case(
    (od.Faction.abbreviation == "DIE LINKE.", sa.literal("DIE LINKE")),
    else_=od.Faction.abbreviation,
).label("abbreviation")

# Samples that were assigned to a batch, together with the electoral term and
# party of the speech they belong to. The raw labels are eager-loaded so the
# loop below does not issue one query per sample.
samples = (
    s.query(bm.Sample)
    .options(joinedload(bm.Sample.raw_labels))
    .join(bm.Label)
    .join(od.Speech)
    .join(od.Faction)
    # `is_not(None)` renders the same IS NOT NULL as `!= None`, without the
    # comparison-to-None idiom that linters flag.
    .filter(bm.Sample.used_in_batch.is_not(None))
    .with_entities(od.Speech.electoral_term, party_label, bm.Sample)
)

# One row per raw label of every selected sample.
rows = [
    (
        term,
        faction,
        sample.id,
        label.pop_antielite,
        label.pop_pplcentr,
        label.souv_eliteless,
        label.souv_pplmore,
        label.ideol_left,
        label.ideol_right,
    )
    for term, faction, sample in samples
    for label in sample.raw_labels
]

df = pd.DataFrame(
    rows,
    columns=[
        "Term",
        "Party",
        "sample_id",
        "antielite",
        "pplcentr",
        "eliteless",
        "pplmore",
        "left",
        "right",
    ],
)

In [None]:
# Label columns, in the order they should appear in the table.
cols = [
    "antielite",
    "pplcentr",
    "eliteless",
    "pplmore",
    "left",
    "right",
]

# Sum each label column per (Term, Party).
pivot = pd.pivot_table(
    df,
    index=["Term", "Party"],
    values=cols,
    aggfunc="sum",
)
# Re-select the value columns in the order of `cols`, then turn the index
# levels back into ordinary columns so a totals row can be appended.
pivot = pivot[cols].reset_index()

# Grand total per label column.
# NOTE(review): the saved output of this cell was
# "ValueError: 8 columns passed, passed data had 7 columns", i.e. the positional
# totals list was one entry short; the cause is not visible in this notebook.
# The columns are therefore coerced to numeric before summing (a value that
# cannot be converted raises here with a clear message) and the totals row is
# built by column name, so it always lines up with `pivot.columns`.
totals = pivot[cols].apply(pd.to_numeric).sum(axis=0)

sum_row = pd.DataFrame(
    [{"Term": "sum", "Party": "", **totals.to_dict()}],
    columns=pivot.columns,
)

pivot = pd.concat([pivot, sum_row], ignore_index=True).set_index(["Term", "Party"])

ValueError: 8 columns passed, passed data had 7 columns

In [None]:
# Render the label counts as a booktabs-style LaTeX tabular with thousands
# separators in the label columns.
style = pivot.style.set_table_styles(
    [
        {"selector": "toprule", "props": ":toprule;"},
        {"selector": "bottomrule", "props": ":bottomrule;"},
    ]
).format(subset=cols, thousands=",")

# Top-align the Term cell that spans several party rows.
latex = style.to_latex(
    multirow_align="t",
)

# Add a \midrule before each term group and before the totals row.
latex = add_midrules(latex)

# Party names contain non-ASCII characters, so fix the encoding instead of
# relying on the platform default. The `_ =` keeps the returned character
# count out of the cell output.
_ = (out_path / "n_labels.tex").write_text(latex, encoding="utf-8")