In [89]:
import os
from pathlib import Path
import pandas as pd
import spacy
import re
import codecs
import plotly.express as px

## Import text files

In [90]:
txt_data_path = os.path.join(Path().resolve().parent, "data", "cleaned")

text_data = {}

for year in ["2013", "2017", "2021"]:
    text_data[year] = {}
    for party in ["spd", "fdp", "cdu", "afd", "gruene", "linke"]:
        filename = party + "_" + year + ".txt"
        file_path = os.path.join(txt_data_path, filename)

        # read txt
        # with open(file_path, 'rb') as in_file:
        in_file = codecs.open(file_path, "r", encoding="utf-8")
        text_data[year][party] = in_file.read()
        in_file.close()

## Tokenize texts
Give every word a class (noun, verb, etc).

In [91]:

# select pipeline for German
# choose between speed and accuracy
mode = "speed" # "accuracy"
if mode == "speed":
    nlp = spacy.load("de_core_news_sm")
    nlp.max_length = 200000
else:
    nlp = spacy.load("de_dep_news_trf")

In [92]:
in_file = text_data["2013"]["gruene"]
tokens_list = []
for num_chunks in range((len(in_file)//50000)):
    # print("start: ", str(num_chunks*50000), ", end: ", str((1+num_chunks)*50000-1))
    chunk = nlp(str(in_file[(num_chunks*50000):((1+num_chunks)*50000-1)]))
    
    for token in chunk:
        tokens_list.append({
            "Text":     token.text,
            "Lemma":    token.lemma_,
            "POS":      token.pos_,
            "Tag":      token.tag_,
            "Dep":      token.dep_,
            "Shape":    token.shape_,
            "Alpha":    token.is_alpha, 
            "Stop":     token.is_stop
        })

token_df = pd.DataFrame(tokens_list)


In [93]:
pd.set_option('display.max_rows', 500)
nouns_df = token_df[token_df["POS"] == "PROPN"]
important_nouns = nouns_df["Lemma"].value_counts()[0:21]

In [94]:
important_nouns

Lemma
Deutschland              156
Europa                   121
EU                        62
                         56
GRÜNE                     49
GRÜN                      23
muss                      20
Merkel                    19
GRÜNEN                    16
Kitas                     16
EEG                       10
G                          9
USA                        8
BÜNDNIS                    8
Migrationshintergrund      8
Schluss                    7
N                          7
Brüssel                    6
Euro                       6
Schwarz-Gelb               6
Angela                     6
Name: count, dtype: int64

In [95]:
px.bar(important_nouns, y="count")

In [96]:
px.bar(important_nouns, y="Lemma")

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['count'] but received: Lemma