In [None]:
import json

import pandas as pd
from sqlalchemy.orm import Session
from sqlalchemy.orm import joinedload

import src
import src.db
import src.db.models.bert_data as bm
import src.db.models.open_discourse as od

In [None]:
# Input archive holding the raw annotations, relative to the project root.
path = src.PATH / "data/bert/dataset.csv.zip"
# Do not truncate long text cells when frames are displayed.
pd.options.display.max_colwidth = None
# Database engine used further down for the metadata lookups.
engine = src.db.make_engine()

In [None]:
# Load the raw annotation table from the configured input path.
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Map every real username to a pseudonym of the form "coder_<i>".
# The usernames are sorted before numbering: iterating a set of strings has
# no stable order across kernel restarts (string hashing is randomised per
# process), so numbering the raw set would hand out different pseudonyms on
# every run and make the exported dataset irreproducible.
usernames = sorted(set(df.username))
coder_dict = {name: f"coder_{i}" for i, name in enumerate(usernames)}

In [None]:
# Display only the pseudonyms. Showing the full mapping would store the real
# usernames next to their aliases in the saved notebook output, which defeats
# the anonymisation performed in the following cells.
sorted(coder_dict.values())

In [None]:
# Swap each real username for its pseudonym from coder_dict.
df["username"] = df.username.replace(to_replace=coder_dict)

In [None]:
def create_pred_dict(group):
    """Collect the code each user assigned to one group of annotation rows.

    Args:
        group: DataFrame with a ``username`` column and the label columns
            ``elite``, ``centr``, ``left`` and ``right``.

    Returns:
        A dict keyed by label column; each value maps a username to the
        single code that user has in that column within ``group``.

    Raises:
        Exception: if a user has zero or more than one distinct code for a
            label column within ``group``.
    """
    coders = set(group.username)

    def single_code(coder, column):
        # All rows of one coder within the group must agree on the code.
        observed = set(group.loc[group.username == coder, column])
        if len(observed) != 1:
            raise Exception(f"Codes != 1: {observed} for {coder}")
        (code,) = observed
        return code

    return {
        column: {coder: single_code(coder, column) for coder in coders}
        for column in ("elite", "centr", "left", "right")
    }


# One group per (id, text) pair, kept in first-appearance order; each group is
# condensed into its nested {label: {coder: code}} dictionary.
grouped = df.groupby(by=["id", "text"], sort=False)
out = grouped.apply(create_pred_dict)

In [None]:
# Turn the (id, text) index levels back into columns and name the value
# column holding the per-sample dictionaries in the same step.
out = out.reset_index(name="prediction_dict")

In [None]:
# Sanity check: every label of every sample must carry a code from each coder.
# The expected count comes from the pseudonym mapping rather than a literal,
# and the message names the offending sample so failures can be traced.
n_coders = len(coder_dict)
for sample_id, prediction in zip(out["id"], out["prediction_dict"]):
    for var, codes_by_coder in prediction.items():
        assert len(codes_by_coder) == n_coders, (
            f"sample {sample_id}: {var!r} has codes from "
            f"{len(codes_by_coder)} of {n_coders} coders"
        )

In [None]:
# Output column name -> key inside each sample's prediction dictionary.
label_columns = {
    "anti_elitism": "elite",
    "people_centrism": "centr",
    "left_wing": "left",
    "right_wing": "right",
}
for column, key in label_columns.items():
    # Bind `key` as a default so each lambda keeps its own lookup key.
    out[column] = out["prediction_dict"].apply(lambda codes, key=key: codes[key])

In [None]:
# The nested dictionary has been unpacked into separate columns; drop it.
out = out.drop(columns="prediction_dict")

In [None]:
# Session shared by every get_metadata call below.
s = Session(engine)


def get_metadata(id_):
    """Fetch speech, faction and speaker metadata for a single sample.

    Each call runs one query on the module-level session ``s``, loading the
    sample together with its speech, the speech's politician and its faction
    relationship via joined eager loading.

    Args:
        id_: value matched against ``bm.Sample.id``.

    Returns:
        dict with the keys ``speech_id``, ``speech_date``, ``sentence_no``,
        ``faction``, ``speaker_first_name`` and ``speaker_last_name``.

    Note:
        ``.one()`` is used, so the lookup fails unless exactly one row
        matches. Only the first entry of ``sample.faction`` is reported;
        presumably that collection is never empty -- TODO confirm.
    """
    eager_loads = (
        joinedload(bm.Sample.speech).joinedload(od.Speech.politician),
        joinedload(bm.Sample.faction),
    )
    query = s.query(bm.Sample).options(*eager_loads).filter(bm.Sample.id == id_)
    sample = query.one()

    first_faction = sample.faction[0]
    speech = sample.speech
    politician = speech.politician
    return {
        "speech_id": speech.id,
        "speech_date": speech.date,
        "sentence_no": sample.sentence_no,
        "faction": first_faction.abbreviation,
        "speaker_first_name": politician.first_name,
        "speaker_last_name": politician.last_name,
    }

In [None]:
# Look up the metadata for every sample id, then release the session's
# database resources. get_metadata returns plain values (no ORM objects), so
# the results stay usable after the session is closed. The `finally` makes
# sure the session is also closed when a lookup fails.
try:
    metadata = out["id"].apply(get_metadata)
finally:
    s.close()

In [None]:
# Expand the per-sample metadata dictionaries into one column per key.
meta = pd.DataFrame(metadata.to_list())

In [None]:
# Put labels and metadata side by side; rows are aligned on the index.
df = pd.concat([out, meta], axis="columns")

In [None]:
# Final column order of the exported dataset: identifiers and metadata first,
# then the sentence text, then the four label columns.
column_order = [
    "id",
    "speech_id",
    "speech_date",
    "sentence_no",
    "faction",
    "speaker_first_name",
    "speaker_last_name",
    "text",
    "anti_elitism",
    "people_centrism",
    "left_wing",
    "right_wing",
]
df = df[column_order]

In [None]:
# Serialise each per-coder dictionary to a JSON string so it can be written
# to CSV as text.
for label_column in ("anti_elitism", "people_centrism", "left_wing", "right_wing"):
    df[label_column] = df[label_column].apply(json.dumps)

In [None]:
# Write the labelled dataset without the positional index column.
output_path = src.PATH / "data/labelled_data.csv.zip"
df.to_csv(output_path, index=False)