# Test German NLP Pipeline

Test NLP pipeline for dependency parsing and semantic role labeling in German.

In [1]:
import spacy
import re
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

## Retrieve Stories From the Web

Build web scraper for German stories about the pandemic. The stories are published on a webpage from the HSPV NRW (University of Applied Sciences for the police and public administration) and written by students as well as staff. For a general description, see: https://www.hspv.nrw.de/services/corona-krise/corona-geschichten

In [2]:
# Define URL and story page ids

URL = "https://www.hspv.nrw.de/nachrichten/artikel/corona-geschichten"

ID_STORIES = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13"]

In [3]:
# Function to retrieve text from webpages

def get_text_from_web(url, ids):
    docs = []

    for id in ids:
        new_page = urlopen(url + "-" + id).read()

        new_soup = BeautifulSoup(new_page, 'html.parser')

        for script in new_soup(["script", "style"]):
            script.decompose()
        
        docs.append(list(new_soup.stripped_strings))
    
    return docs

  

Extract the relevant stories by removing text that is present in all documents (e.g., menu headers).

In [4]:
# Function to extract only text from stories

def extract_story_text(docs):
    new_docs = [[s for s in docs[0] if s not in docs[-1]]]

    for doc in docs[1:]:
        new_docs.append([s for s in doc if s not in docs[0]])
    
    return new_docs

In [5]:
docs = get_text_from_web(URL, ID_STORIES)

short_docs = extract_story_text(docs)

In [81]:
print(short_docs[1][:9])

['#WirmeisterndieKrise Der ganz normale Wahnsinn | HSPV NRW', 'Der ganz normale Wahnsinn', '05. Februar 2021', 'Ann-Katrin Vengels', 'Studienalltag in Zeiten von Corona', 'Ann-Katrin Vengels, Studentin der HSPV NRW am Studienort Mülheim an der Ruhr, berichtet aus ihrem Alltag zwischen Homeschooling, Kinderbetreuung und Online-Studium.', 'In den vergangenen Jahren habe ich die Frage nach meinem Befinden stets mit den Worten „Der normale Wahnsinn halt“\xa0 beantwortet. Und ja, irgendwie ist der Wahnsinn tatsächlich normal geworden. Ich bin Mutter von zwei Kindern, meine Tochter ist 13 und mein Sohn sechs. Seit zehn Jahren bin ich verheiratet. 2019 habe ich mich dazu entschlossen, meinen Beruf aufzugeben und mich weiterzuentwickeln, weshalb ich mich für ein duales Bachelorstudium an der HSPV entschieden habe. Mittlerweile studiere ich im zweiten Studienjahr Polizeivollzugsdienst und bin Kommissaranwärterin.', 'Mein Ehemann (der inzwischen auch mein Kollege ist) und ich sind über die Jahre

## Dependency Parsing and Named Entity Recognition

Before running the pipeline, the pipeline must be installed, e.g., via:

`python3 -m spacy download de_core_news_sm`

For details on the pipeline, see: https://spacy.io/models/de.


In [7]:
# Load german nlp pipeline trained on news documents (sm = small size)

nlp = spacy.load("de_core_news_sm")

In [8]:
def analyse_stories(docs):
    processed_docs = []
    
    for doc in docs:
        processed_docs.append(nlp("".join([s for s in short_docs[1]])))
        
    return processed_docs

In [19]:
processed_docs = analyse_stories(short_docs)

In [72]:
# From danlp_test notebook

def print_results(doc):
    analysis = []
    for token in doc:
        analysis.append({ "text": token.text, "lemma": token.lemma_, "head": token.head.i, "dep": token.dep_, "pos": token.pos_, "ent": token.ent_type_ })
    return pd.DataFrame(analysis)

In [74]:
nlp_analysis = print_results(processed_docs[0])
print(nlp_analysis)

Unnamed: 0,text,lemma,head,dep,pos,ent
0,#,#,1,pnc,X,
1,WirmeisterndieKrise,WirmeisterndieKrise,1,ROOT,X,ORG
2,Der,der,5,nk,DET,
3,ganz,ganz,4,mo,ADV,
4,normale,normale,5,nk,ADJ,
...,...,...,...,...,...,...
695,",",",",694,punct,PUNCT,
696,die,der,698,oa,PRON,
697,wir,ich,698,sb,PRON,
698,haben,haben,694,rc,VERB,


In [39]:
# From danlp_test notebook

def show_results(results_dict):
    results_df = pd.DataFrame([{ "key": key, "count": results_dict[key], "best value": ""} 
                               for key in sorted(results_dict, key=lambda k: results_dict[k], reverse=True)])
    best_keys = get_best_keys(results_df)
    for i, row in results_df.iterrows():
        if row["key"] in best_keys:
            results_df.at[i, "best value"] = "yes"
    return results_df

In [40]:
# From danlp_test notebook

def get_best_keys(results_df):
    best_count = -1
    best_keys = []
    for i, row in results_df.iterrows():
        if row["count"] > best_count:
            best_count = row["count"]
            best_keys = [row["key"]]
        elif row["count"] == best_count:
            best_keys.append(row["key"])
    case_is_upper = []
    for key in best_keys:
        case_is_upper.append(re.search(r"^[A-Z]", key) != None)
    if True in case_is_upper:
        best_keys = [ best_keys[i] for i in range(0, len(best_keys)) if case_is_upper[i] ] 
    return best_keys

In [57]:
# From danlp_test notebook

def get_actors(nlp_table_df):
    actors = {}
    for i, row in nlp_table_df.iterrows():
        if row["dep"] == "sb":
            actor = row["text"]
        else:
            actor = ""
        if actor != "":
            if actor in actors:
                actors[actor] += 1
            else:
                actors[actor] = 1
    return actors

In [79]:
# From danlp_test notebook

def get_locations(nlp_table_df):
    locations = {}
    for i, row in nlp_table_df.iterrows():
        if row["ent"] == "LOC":
            location = row["text"]
        else:
            location = ""
        if location != "":
            if location in locations:
                locations[location] += 1
            else:
                locations[location] = 1
    return locations

In [68]:
# From danlp_test notebook

def get_actions(nlp_table_df):
    actions = {}
    for i, row in nlp_table_df.iterrows():
        if row["pos"] == "VERB" and re.search("[a-zA-Z]", row["text"]):
            if row["text"] in actions:
                actions[row["text"]] += 1
            else:
                actions[row["text"]] = 1
    return actions

In [63]:
show_results(get_actors(nlp_analysis)).loc[:9]

Unnamed: 0,key,count,best value
0,ich,11,yes
1,wir,5,
2,man,3,
3,es,3,
4,Wahnsinn,2,
5,der,2,
6,Einschulung,2,
7,können,2,
8,VengelsStudienalltag,1,
9,Ich,1,


In [77]:
show_results(get_actions(nlp_analysis)).loc[:9]

Unnamed: 0,key,count,best value
0,gebildet,2,yes
1,gibt,2,yes
2,berichtet,1,
3,beantwortet,1,
4,verheiratet,1,
5,entschlossen,1,
6,aufzugeben,1,
7,weiterzuentwickeln,1,
8,entschieden,1,
9,studiere,1,


In [82]:
show_results(get_locations(nlp_analysis)).loc[:9]

Unnamed: 0,key,count,best value
0,Mülheim,1,yes
1,an,1,
2,der,1,
3,Ruhr,1,yes
4,Studienplan,1,yes
5,Lockdown,1,yes
6,Lerngruppen,1,yes
7,Sushi,1,yes
8,Kurs,1,yes


## Missing
1. Semantic role labeling
2. Analysis of all stories
3. Visualization