In [None]:
from pathlib import Path

import pandas as pd
import sqlalchemy as sa
from sqlalchemy.orm import Query

import src.db.models.bert_data as bm
import src.db.models.open_discourse as od
from src.db.connect import make_engine

In [None]:
# Destination directory for the generated LaTeX tables.
# NOTE(review): absolute, machine-specific path; the notebook only runs as-is
# on a machine where this directory exists.
out_path = Path("/home/lukas/overleaf/bert_populism/tables")

# Engine used for the database queries below.
engine = make_engine("DB")

# Allow up to 1024 characters per cell when frames are displayed, so long
# sentences are not truncated.
pd.options.display.max_colwidth = 1024

In [None]:
# Per-dimension score thresholds, keyed by prediction column name.
thresh = {
    "elite": 0.5013018,
    "pplcentr": 0.5017193,
    "left": 0.42243505,
    "right": 0.38281676,
}


def create_bins(thresh, bound_range=0.15):
    """Build three score bins on [0, 1] centred on a threshold.

    The bins are ``(0, thresh - bound_range]``,
    ``(thresh - bound_range, thresh + bound_range]`` and
    ``(thresh + bound_range, 1]``. Intervals are right-closed (the pandas
    default), so a value of exactly 0 is not covered by any bin.

    Parameters
    ----------
    thresh : float
        Centre of the middle bin.
    bound_range : float, default 0.15
        Half-width of the middle bin.

    Returns
    -------
    pd.IntervalIndex
        The three intervals in ascending order.

    Raises
    ------
    ValueError
        If the middle bin would not lie within [0, 1]; pandas would otherwise
        reject the resulting intervals with a less specific message.
    """
    lower_bound = thresh - bound_range
    upper_bound = thresh + bound_range
    if not 0 <= lower_bound <= upper_bound <= 1:
        raise ValueError(
            f"bins for thresh={thresh!r} with bound_range={bound_range!r} "
            "must satisfy 0 <= thresh - bound_range <= thresh + bound_range <= 1"
        )
    bins = [(0, lower_bound), (lower_bound, upper_bound), (upper_bound, 1)]
    return pd.IntervalIndex.from_tuples(bins)


# One IntervalIndex per dimension, built with the default band width.
intervals = {k: create_bins(v) for k, v in thresh.items()}

In [None]:
# Sanity check: intervals built from tuples are right-closed, i.e. (left, right].
pd.IntervalIndex.from_tuples([(0, 1), (1, 2)], closed="right")

IntervalIndex([(0, 1], (1, 2]], dtype='interval[int64, right]')

In [None]:
def add_midrules(latex: str) -> str:
    r"""Insert a ``\midrule`` line before selected rows of a LaTeX table.

    A rule is placed in front of every line that starts with ``\multirow``
    or with ``sum``. All other lines are passed through unchanged.

    Parameters
    ----------
    latex : str
        LaTeX source, one table row per line.

    Returns
    -------
    str
        The source with the extra rules, joined by ``"\n"`` (a trailing
        newline of the input is not preserved).
    """
    rule_prefixes = (r"\multirow", "sum")
    out = []
    for row in latex.splitlines():
        if row.startswith(rule_prefixes):
            out.append(r"\midrule")
        out.append(row)
    return "\n".join(out)

# Load Data


In [None]:
# Select the sample text together with the four Prediction columns,
# joining Sample to Prediction.
query = Query(bm.Sample).join(bm.Prediction).with_entities(
    bm.Sample.text,
    *(getattr(bm.Prediction, dim) for dim in ("elite", "pplcentr", "left", "right")),
)

# Run the statement and load the result into a DataFrame; the connection is
# closed when the block exits.
with engine.connect() as conn:
    df = pd.read_sql(query.statement, conn)

In [None]:
# Prediction columns to bin; each gets a companion "<name>_bin" column.
cols = ["elite", "pplcentr", "left", "right"]

# NOTE: values that fall into none of the supplied intervals become NaN.
for col in cols:
    df[col + "_bin"] = pd.cut(df[col], bins=intervals[col])

In [None]:
# Display label for each prediction column.
groupnames = {
    "elite": "Anti-Elite",
    "pplcentr": "People-Centric",
    "left": "Host-Left",
    "right": "Host-Right",
}

# For every dimension: draw 5 rows from each of its bins (fixed seed), tag
# them with the dimension's display label and order them by ascending score.
samples_per_dim = []
for col in cols:
    sampled = df.groupby(f"{col}_bin").sample(5, random_state=12)
    sampled["select"] = groupnames[col]
    samples_per_dim.append(sampled.sort_values(col, ascending=True))

# Stack the per-dimension samples into a single frame.
selection = pd.concat(samples_per_dim)

In [None]:
# Work on a copy so the LaTeX formatting below does not modify `selection`.
table = selection.copy()


def add_parbox(text, size):
    r"""Wrap ``text`` in a top-aligned LaTeX ``\parbox`` of width ``size``.

    Both arguments must be strings. ``text`` is inserted verbatim: no LaTeX
    escaping is applied here.

    Returns the string ``\parbox[t]{<size>}{<text>}``.
    """
    return "".join([r"\parbox[t]{", size, "}{", text, "}"])


# Put every sentence into a fixed-width parbox.
table["text"] = table["text"].apply(add_parbox, size=r".55\textwidth")


def add_bold_font(text):
    r"""Return ``text`` wrapped in LaTeX ``\textbf{...}``; ``text`` must be a string."""
    return "".join((r"\textbf{", text, "}"))


# Typeset the dimension labels in bold. (Wrapping them in a .1\textwidth
# parbox as well was tried and is currently disabled.)
table["select"] = table["select"].apply(add_bold_font)


def bold_formatter_thresh(num, thresh):
    r"""Format a score with two decimals, in bold when it reaches the threshold.

    Parameters
    ----------
    num : float
        Score to format.
    thresh : float
        Threshold; the unrounded ``num`` is compared against it.

    Returns
    -------
    str
        ``num`` with exactly two decimal places, wrapped in ``\textbf{...}``
        when ``num >= thresh``.
    """
    # Fixed-width formatting keeps trailing zeros ("0.00", "0.50"), so all
    # entries of a table column show the same number of decimals.
    num_str = f"{num:.2f}"
    if num < thresh:
        return num_str
    return r"\textbf{" + num_str + "}"


# Turn every score column into text, bolding values that reach the
# dimension's threshold.
for col in cols:
    table[col] = table[col].apply(bold_formatter_thresh, args=(thresh[col],))


# Label and sentence become the row index of the rendered table.
table = table.set_index(["select", "text"])


# Emit \toprule / \bottomrule for the top and bottom table rules.
rule_styles = [
    {"selector": "toprule", "props": ":toprule;"},
    {"selector": "bottomrule", "props": ":bottomrule;"},
]
styler = table[["elite", "pplcentr", "left", "right"]].style
styler = styler.set_table_styles(rule_styles).format(precision=3)
tex = styler.to_latex()


# Rewrite the Styler output line by line into a longtable.
# Line numbers below are 1-based positions in the Styler output.
lines = tex.splitlines()

new = []
for i, line in enumerate(lines, 1):
    if i == 1:
        # First line (presumably \begin{tabular}{...}): open a longtable with
        # explicit column widths instead.
        line = r"\begin{longtable}{p{.1\textwidth}p{.55\textwidth}p{.04\textwidth}p{.04\textwidth}p{.04\textwidth}p{.04\textwidth}}\\"
    if i == len(lines):
        # Last line: close the longtable. Raw string, because "\e" is not a
        # valid escape sequence in a normal string literal.
        line = r"\end{longtable}"
    if i == 3:
        # Replace the generated column header with a hand-written one that is
        # repeated on every page (\endhead).
        line = r"""Dim & Sentence & Anti-Elite & People-Centric & Host-Left & Host-Right \\
\midrule
\endhead
"""
    if i == 4:
        # Drop the fourth line (presumably the index-name row of the Styler
        # output).
        continue

    # Align the spanning label with the top of its row group.
    line = line.replace(r"\multirow[c]", r"\multirow[t]")

    # Separate body rows with a rule; the last body row is skipped because
    # the line after it already draws one.
    if 4 < i < len(lines) - 2:
        line = line + r"\midrule"
    new.append(line)

tex = "\n".join(new)

# The sentences contain non-ASCII characters, so write UTF-8 explicitly
# instead of relying on the locale's default encoding.
(out_path / "prediction_samples.tex").write_text(tex, encoding="utf-8")

print(tex)

\begin{longtable}{p{.1\textwidth}p{.55\textwidth}p{.04\textwidth}p{.04\textwidth}p{.04\textwidth}p{.04\textwidth}}\\
\toprule
Dim & Sentence & Anti-Elite & People-Centric & Host-Left & Host-Right \\
\midrule
\endhead

\multirow[t]{15}{*}{\textbf{Anti-Elite}} & \parbox[t]{.55\textwidth}{Wir haben ungefähr 1 500 Braustätten in Deutschland – übrigens die Hälfte davon in Bayern, die meisten als mittelständische Unternehmen und Kleinunternehmen geführt –, die in dieser pandemischen Lage natürlich erhebliche Probleme haben.} & 0.0 & 0.01 & 0.0 & 0.0 \\\midrule
 & \parbox[t]{.55\textwidth}{Eine andere Aussage lautet, eine Verschärfung des nationalen Aufsichtsrechts sei nicht sinnvoll.} & 0.0 & 0.0 & 0.0 & 0.0 \\\midrule
 & \parbox[t]{.55\textwidth}{Strukturmittel in Höhe von 102 Milliarden Euro sind nicht abgerufen worden.} & 0.0 & 0.0 & 0.0 & 0.0 \\\midrule
 & \parbox[t]{.55\textwidth}{Insbesondere Kürzungen bei der Förderung von Innovationen konterkarieren doch den Aufwuchs durch die zusätz