In [1]:
# %load_ext autoreload
# %autoreload 2

In [4]:
import csv
import random
import re

import pandas as pd
import spacy
from pandarallel import pandarallel
from sqlalchemy import func
from sqlalchemy.orm import Query
from sqlalchemy.orm import Session
from tqdm.notebook import tqdm

import src
import src.db.models as m
from src.pop_dict import GruendlDict

  metadata.reflect(engine, schema="open_discourse", only=["speeches"])


In [6]:
# setup
# Never truncate long text columns when a DataFrame is displayed. The fully
# qualified option name is used on purpose: abbreviated names are resolved by
# pandas through regex matching and may become ambiguous in later versions.
pd.set_option("display.max_colwidth", None)
session = Session(m.engine)
# Single alternation pattern built from all dictionary expressions. It is used
# both inside the database (regexp_match) and in Python (re.compile).
regex = "|".join(GruendlDict.postgres())


# Shared csv.writer settings; QUOTE_NONNUMERIC quotes every non-numeric field.
csv_params = dict(delimiter=",", quotechar='"', quoting=csv.QUOTE_NONNUMERIC)


# Electoral term whose speeches are processed; also part of the output file names.
electoral_term = 19

# Download raw data from DB


In [7]:
# prepare functions and objects


def clean_raw_text(text):
    """Apply light normalisation to the raw text of a speech.

    Every line break is replaced by a single space, and every numeric
    placeholder of the form ``({123})`` is replaced by a single space.

    Args:
        text (str): Complete speech text.

    Returns:
        str: The lightly cleaned speech text.
    """
    # Raw string literals hand the backslashes to the regex engine unchanged.
    # In a plain literal, "\(" and "\d" are invalid string escapes that newer
    # Python versions warn about.
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\(\{\d+\}\)", " ", text)
    return text


def get_positives(text, pyregex):
    """Collect the sentences of a speech that match the dictionary pattern.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp``; each sentence is converted to a string and stripped of
    surrounding whitespace before the pattern is searched in it.

    Args:
        text (str): Speech text.
        pyregex: Compiled Python regular expression.

    Returns:
        List[str]: The stripped sentences in which the pattern was found,
        in document order.
    """
    stripped_sentences = (str(span).strip() for span in nlp(text).sents)
    return [sentence for sentence in stripped_sentences if pyregex.search(sentence)]


# Case-insensitive Python version of the dictionary pattern, used to test
# individual sentences after the database has pre-filtered whole speeches.
pyregex = re.compile(regex, flags=re.IGNORECASE)
# spaCy pipeline used for sentence splitting; the sentence helpers read it
# as the global name `nlp`.
nlp = spacy.load("de_core_news_md")

## get positive matches


In [8]:
# Output CSV for sentences that match the dictionary, one file per electoral term.
# NOTE(review): the variable name is misspelled ("postive"); it is kept as-is
# because the extraction cell opens the file through this exact name.
positives_csv_name = f"positives_{electoral_term}.csv"
postive_matches_file = src.PATH / f"data/dict_approach/{positives_csv_name}"

### get data from DB


In [9]:
# Filter criteria: speeches of the selected electoral term whose text matches
# the dictionary pattern (case-insensitive match evaluated by the database).
positive_criteria = (
    m.Speech.electoral_term == electoral_term,
    m.Speech.speech_content.regexp_match(regex, flags="i"),
)

# Only the id and the text are fetched; with_entities() narrows the SELECT list.
query = (
    session.query(m.Speech)
    .filter(*positive_criteria)
    .with_entities(m.Speech.id, m.Speech.speech_content)
)

### Extract relevant sentences


In [10]:
# newline="" is what the csv module requires for files it writes to (otherwise
# line endings can be translated a second time), and the explicit UTF-8
# encoding keeps the output independent of the platform default.
# NOTE(review): buffering=5 keeps the underlying byte buffer tiny — presumably
# so rows reach the disk early during the long run; confirm the intent.
with open(
    postive_matches_file, "w", buffering=5, newline="", encoding="utf-8"
) as csvfile:
    writer = csv.writer(csvfile, **csv_params)
    writer.writerow(["speech_id", "sentence"])

    # No `total` is passed to tqdm: the previous `total=limit` referred to a
    # name that no cell of this notebook defines, which fails on a fresh kernel.
    # The query is streamed in batches of 15 rows instead of being loaded at once.
    for row in tqdm(query.with_session(session).yield_per(15), smoothing=1):
        speech_id = row.id
        clean_text = clean_raw_text(row.speech_content)
        # One CSV row per matching sentence of the speech.
        for sent in get_positives(clean_text, pyregex):
            writer.writerow([speech_id, sent])

0it [00:00, ?it/s]

## get negative matches


In [11]:
# Output CSV for the randomly sampled non-matching sentences, one file per
# electoral term.
negatives_csv_name = f"negatives_{electoral_term}.csv"
neg_matches_file = src.PATH / f"data/dict_approach/{negatives_csv_name}"

### get data from DB


In [12]:
# Seed PostgreSQL's random() so that the 35 % sample below is reproducible.
# The seed has to be sent as its own statement: with_entities() replaces the
# whole SELECT list, so a setseed() column handed to session.query() is
# discarded and never reaches the database.
# The seed applies to the connection held by `session`; re-run this cell before
# executing the query again to obtain the same sample.
# NOTE(review): the query has no ORDER BY, so reproducibility also assumes the
# database scans the rows in the same order on every run.
session.query(func.setseed(0.1337)).all()

# Roughly 35 % of the speeches of the selected electoral term that do NOT match
# the dictionary pattern; only id and text are fetched.
query = (
    session.query(m.Speech)
    .filter(
        m.Speech.electoral_term == electoral_term,
        ~m.Speech.speech_content.regexp_match(regex, flags="i"),
        func.random() < 0.35,
    )
    .with_entities(
        m.Speech.id,
        m.Speech.speech_content,
    )
)

### Extract relevant sentences


In [13]:
# prepare functions and objects


def get_negatives(text):
    """Pick one random, sufficiently long sentence from a speech.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp``. Only sentences whose ``len()`` exceeds 7 (for a spaCy sentence
    span this is its token count) are eligible. The choice uses Python's
    ``random`` module, so seed it beforehand for reproducible results.

    Args:
        text (str): Speech text.

    Returns:
        str | None: The chosen sentence, stripped of surrounding whitespace,
        or ``None`` when fewer than two eligible sentences exist.
    """
    eligible = [span for span in nlp(text).sents if len(span) > 7]
    if len(eligible) >= 2:
        pick = random.randint(0, len(eligible) - 1)
        return str(eligible[pick]).strip()
    return None


# The same pipeline ("de_core_news_md") is already bound to `nlp` by the earlier
# "prepare functions and objects" cell. Load it only when this section is run
# on its own, so the model is not read from disk a second time.
if "nlp" not in globals():
    nlp = spacy.load("de_core_news_md")

In [16]:
# Fix the Python-side RNG so get_negatives() picks the same sentence per speech
# on every run.
random.seed(1337)

# Fixes applied to this cell:
#   * the comma between the mode "w" and `buffering=5` was missing (syntax error)
#   * newline="" is what the csv module requires for files it writes to
#   * explicit UTF-8 keeps the output independent of the platform default
# NOTE(review): buffering=5 keeps the underlying byte buffer tiny — presumably
# so rows reach the disk early during the long run; confirm the intent.
with open(
    neg_matches_file, "w", buffering=5, newline="", encoding="utf-8"
) as csvfile:
    writer = csv.writer(csvfile, **csv_params)
    writer.writerow(["speech_id", "sentence"])

    # No `total` is passed to tqdm: the previous `total=limit` referred to a
    # name that no cell of this notebook defines, which fails on a fresh kernel.
    # The query is streamed in batches of 50 rows instead of being loaded at once.
    for row in tqdm(query.with_session(session).yield_per(50), smoothing=1):
        speech_id = row.id
        clean_text = clean_raw_text(row.speech_content)
        sent = get_negatives(clean_text)
        # get_negatives() returns None for speeches without enough long
        # sentences; those speeches contribute no row.
        if sent:
            writer.writerow([speech_id, sent])

0it [00:00, ?it/s]

# Playground