In [1]:
import src
from src.db.connect import make_engine
import src.db.models.bert_data as bm
import src.db.models.open_discourse as od

from sqlalchemy.orm import Session
import pandas as pd

In [2]:
# Create the SQLAlchemy engine that the cells below use for both the ORM
# session and the raw connection. "DB" is handed to the project helper
# make_engine — presumably the name of a connection/config entry; TODO confirm.
engine = make_engine("DB")

In [7]:
# Columns exported for every sentence sample, in the order they appear in the
# resulting DataFrame.
sample_columns = [
    bm.Sample.id.label("sample_id"),
    bm.Sample.speeches_id.label("speech_id"),
    od.Speech.date,
    od.Speech.session,
    od.Speech.electoral_term,
    od.Politician.id.label("politician_id"),
    od.Politician.first_name,
    od.Politician.last_name,
    od.Faction.abbreviation,
    bm.Sample.sentence_no,
    bm.Sample.sentence_length,
    bm.Sample.text,
]

# The session only serves to build the Query object here; the statement is
# executed further down through pandas on a plain connection.
with Session(engine) as s:
    query = (
        s.query(bm.Sample)
        # No explicit ON clauses: the join conditions are left to SQLAlchemy to
        # derive from the mapped models (defined outside this notebook).
        .join(od.Speech)
        .join(od.Politician)
        .join(od.Faction)
        # Keep only electoral terms 18 and 19 ...
        .filter(od.Speech.electoral_term.in_([18, 19]))
        # ... and drop rows whose faction abbreviation is "Fraktionslos".
        .filter(od.Faction.abbreviation != "Fraktionslos")
        .with_entities(*sample_columns)
        .order_by(bm.Sample.id.asc())
        .distinct()
    )

# Run the SELECT and load the full result set into a DataFrame.
with engine.connect() as conn:
    df = pd.read_sql(query.statement, conn)

# Unify the two spellings of the faction label ("DIE LINKE." -> "DIE LINKE").
df["abbreviation"] = df["abbreviation"].replace({"DIE LINKE.": "DIE LINKE"})

In [8]:
# Persist the loaded sentences as a gzip-compressed parquet file below the
# project root exposed as src.PATH.
sentences_path = src.PATH / "data/raw/sentences.parquet.gzip"
df.to_parquet(sentences_path, compression="gzip")