# Sample Examples for Annotation


In [None]:
import re

import pandas as pd
import spacy
from sqlalchemy import func
from sqlalchemy.orm import Query
from sqlalchemy.orm import Session
from sqlalchemy.orm import joinedload
from tqdm.notebook import tqdm

import src
import src.db.models.bert_data as bm
import src.db.models.open_discourse as m
from src.db.connect import make_engine

In [None]:
# setup

# notebook display options
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_colwidth", 2000)

# database connection and ORM session
engine = make_engine("DB")
session = Session(engine)

# spaCy pipeline: load de_core_news_md, switch off the listed components and
# enable "senter", which is what provides `doc.sents` further down.
nlp = spacy.load("de_core_news_md")
nlp.disable_pipes(
    "tok2vec",
    "tagger",
    "morphologizer",
    "parser",
    "lemmatizer",
    "attribute_ruler",
    "ner",
)
nlp.enable_pipe("senter")

## Preprocessing

Get Speeches


In [None]:
# Speeches of electoral terms 18 and 19 whose faction id is not -1
# (presumably the placeholder for "no faction" -- TODO confirm against the data).
# The faction relationship is loaded eagerly because the sentence loop reads
# `speech.faction` for every speech.
query = (
    session.query(m.Speech)
    .join(m.Faction)
    .options(joinedload(m.Speech.faction))
    .filter(m.Speech.electoral_term.in_([18, 19]))
    .filter(m.Faction.id != -1)
    # for debugging:
    # .order_by(func.random())
    # .limit(1200)
    # or
    # .filter(m.Speech.id == 831139)
)

### Clean Text and create sentence_df

Speeches contain many line breaks and inline references to contributions of other speakers (stored in the `contributions` table in the database). These have to be removed.


In [None]:
# takes around 3-6 min on a 4.2 GHz CPU and no WiFi


def clean_speech(text):
    """Remove interjection markers and normalise whitespace in a speech text.

    The following steps are applied in order:

    1. Every ``({...})`` marker (Zwischenrufnummer) is removed. The match is
       non-greedy, so two markers on the same line are removed individually
       and the speech text between them is kept.
    2. Newlines, together with the whitespace around them, are collapsed into
       a single space.
    3. After ".", "?" and "!" exactly one space is enforced whenever the next
       character is an uppercase letter (A-Z, Ä, Ü, Ö).

    Args:
        text: The raw speech text.

    Returns:
        The cleaned text as a single line.
    """
    # remove Zwischenrufnummern such as "({12})"; `.*?` stops at the first "})"
    text = re.sub(r"\(\{.*?\}\)", "", text)
    # replace a run of newlines at the very end of the text with one space
    text = re.sub(r"\n+$", " ", text)
    # replace all remaining newlines (plus surrounding whitespace) with one space
    text = re.sub(r"\s*\n+\s*", " ", text)

    # insert Space after PUNKT if following letter is uppercase
    # TODO: should this be done everytime? (it also turns "z.B." into "z. B.")
    text = re.sub(r"\s*\.\s*(?=[A-ZÄÜÖ])", ". ", text)
    text = re.sub(r"\s*\?\s*(?=[A-ZÄÜÖ])", "? ", text)
    text = re.sub(r"\s*!\s*(?=[A-ZÄÜÖ])", "! ", text)
    return text


# iterate over all speeches in query and collect one record per usable sentence
all_sents = []
for speech in tqdm(query.yield_per(100), total=query.count()):
    # clean the raw speech text and run the spaCy pipeline on it
    text = clean_speech(speech.speech_content)
    doc = nlp(text)

    # materialise the sentence spans once; the list is reused for counting
    # and for iteration instead of walking `doc.sents` a second time
    sents = list(doc.sents)
    n_sentences = len(sents)

    # skip very short speeches
    if n_sentences < 3:
        continue

    # the first and the last sentence are never used, so only the interior
    # sentences are visited; `sent_no` stays 1-based within the whole speech
    for sent_no, sent in enumerate(sents[1:-1], 2):
        # sentence length = number of non-punctuation tokens
        sent_length = sum(not tok.is_punct for tok in sent)
        # skip very short sentences
        if sent_length < 3:
            continue

        data = {
            "speeches_id": speech.id,
            "electoral_term": speech.electoral_term,
            "faction_id": speech.faction.id,
            "abbreviation": speech.faction.abbreviation,
            "sentence_no": sent_no,
            "sentence_length": sent_length,
            "sentence": str(sent),
        }
        all_sents.append(data)

sentence_df = pd.DataFrame(all_sents)

  0%|          | 0/53202 [00:00<?, ?it/s]

In [None]:
%%capture --no-display

# remove sents by regexes
# The patterns use non-capturing groups / character classes on purpose: with
# capturing groups `str.contains` emits pandas' "pattern has match groups"
# UserWarning. The set of matched sentences is the same as with the capturing form.
sentence_df = sentence_df[
    ~sentence_df.sentence.str.contains(
        r"^(?:(?:meine )?sehr (?:geehrt|verehrt)|liebe|(?:meine )?damen und)", case=False
    )  # introductions
    & ~sentence_df.sentence.str.contains(r"[:;]$")  # bad endings
    & ~sentence_df.sentence.str.contains(r"^[a-z-]")  # lowercase sent starts / dashes
]

In [None]:
# spot-check five random sentences that survived the filters
sentence_df.sample(5)

Unnamed: 0,speeches_id,electoral_term,faction_id,abbreviation,sentence_no,sentence_length,sentence
388891,794137,18,4,CDU/CSU,11,8,Wir konnten dies glücklicherweise Punkt für Punkt widerlegen.
1061575,844483,18,23,SPD,89,11,In den letzten Jahren und Jahrzehnten gab es eine positive Entwicklung.
715254,818740,18,4,CDU/CSU,20,7,"Sie waren zusammengekommen, um gemeinsam zu beten."
582116,808235,18,6,DIE LINKE.,46,29,"Wir als Linke werden - ich hoffe, zusammen mit allen Fraktionen dieses Hauses - immer dabei sein, wenn es darum geht, Rassismus zu bekämpfen und Flüchtlinge in unserem Land zu verteidigen."
1334480,1018760,19,23,SPD,14,8,Es gibt die Charta von Paris von 1990.


## Get Popdict Score per Sentence

Steps are:

1. Save `sentence_df` to a temp location.
2. Call the R script, which loads the DataFrame from the temp location and runs the Gründl dictionary on it.
3. The R script saves its result to a temp location.
4. Load the result from the temp location back into Python.

`run_gruendl.R` returns a binary result for each sentence. It runs for about 10 min on a 4.2 GHz CPU.


In [None]:
# locations used to exchange data with the R script
GRUENDL_SCRIPT = src.PATH / "r/run_gruendl.R"
TMPDIR = src.PATH / "tmp/gruendl"
TMPFILE = TMPDIR / "raw_sents.parquet"  # sentences handed to the R script
OUTFILE = TMPDIR / "result.parquet"  # result read back afterwards

# create the temp directory (including missing parents) if it is not there yet
TMPDIR.mkdir(parents=True, exist_ok=True)

# export only the two key columns and the sentence text, with a fresh index
(
    sentence_df.loc[:, ["speeches_id", "sentence_no", "sentence"]]
    .reset_index(drop=True)
    .to_parquet(TMPFILE)
)

In [None]:
# run the R script on the exported sentences: -f is the input file, -o the output file
!Rscript {GRUENDL_SCRIPT} -f {TMPFILE} -o {OUTFILE}

[?25h[?25h[?25h[?25h[?25h[?25h[?25h[1] "start popdictR..."
[?25h[?25h[1] "saving output..."
[?25h[?25h[1] "done."
[?25h[?25h

In [None]:
# load the result file produced by the R script
gruendl_result = pd.read_parquet(OUTFILE)

In [None]:
# remove the exchange files and then the temp directory itself;
# files that are already gone are ignored
for exchange_file in (TMPFILE, OUTFILE):
    exchange_file.unlink(missing_ok=True)
if TMPDIR.is_dir():
    TMPDIR.rmdir()

In [None]:
# spot-check two random rows for each value of the Gründl score
gruendl_result.groupby("dict_gruendl_2020").sample(2)

Unnamed: 0,speeches_id,sentence_no,dict_gruendl_2020
1112712,1018675,13,0.0
271501,1059453,28,0.0
263343,1058516,17,1.0
980702,1006815,43,1.0


In [None]:
# attach the Gründl score to every sentence row: `sentence_df` carries the
# speech / faction columns and the sentence text, `gruendl_result` the score;
# rows are matched on (speeches_id, sentence_no)

sample_df = sentence_df.merge(gruendl_result, on=["speeches_id", "sentence_no"])

In [None]:
# spot-check five random rows of the merged frame
sample_df.sample(5)

Unnamed: 0,speeches_id,electoral_term,faction_id,abbreviation,sentence_no,sentence_length,sentence,dict_gruendl_2020
957531,1004727,19,4,CDU/CSU,4,15,"Für Personen, die besonders arbeitsmarktfern sind, ziehen wir als Koalition einen sozialen Arbeitsmarkt in Betracht.",0.0
419882,803690,18,23,SPD,36,4,Zu Protokoll gegebene Reden,0.0
216283,1053789,19,23,SPD,28,9,Das will ich Ihnen noch mal ans Herz legen.,0.0
812611,837812,18,23,SPD,22,23,"Wir nehmen 800 Millionen Euro Jahr für Jahr in die Hand, um das Leben der Menschen mit Behinderung und ihrer Familien zu verbessern.",0.0
540954,813684,18,6,DIE LINKE.,6,15,Will die Bundesregierung eingegangene Kreditverpflichtungen in diesem Punkt nicht erfüllen und damit einen Präzedenzfall schaffen?,0.0


# Delete current data

# Recreate tables and upload data

In [None]:
# NOTE(review): this replaces the `sample_df` built by the merge above with the
# contents of a parquet file that is not written in the visible part of this
# notebook -- confirm where data/all_sentences.parquet comes from.
sample_df = pd.read_parquet(src.PATH / "data/all_sentences.parquet")

In [None]:
# create the tables registered on bm.Base's metadata in the connected database
bm.Base.metadata.create_all(engine)

In [None]:
# prepare the frame for the upload: drop the faction / electoral-term columns
# and rename the sentence text and the dictionary score
sample_df = sample_df.drop(
    columns=["faction_id", "abbreviation", "electoral_term"],
).rename(
    columns={
        "sentence": "text",
        "dict_gruendl_2020": "pop_dict_score",
    },
)

In [None]:
# one dict per row, keyed by column name
dicts = sample_df.to_dict(orient="records")

In [None]:
# insert one bm.Sample row per dict; the dict keys presumably have to match the
# mapped attributes of bm.Sample -- TODO confirm against the model
session.bulk_insert_mappings(bm.Sample, dicts)

In [None]:
# commit the session's pending inserts
session.commit()