In [None]:
import re
from tqdm.notebook import trange, tqdm
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from stop_words import get_stop_words
from wordcloud import WordCloud

from HanTa import HanoverTagger as ht

import nltk

#from nltk.tokenize import RegexpTokenizer
#from nltk.stem.snowball import SnowballStemmer
##nltk.download('wordnet')
#from nltk.stem.wordnet import WordNetLemmatizer
##nltk.download('stopwords')
#from nltk.corpus import stopwords
#from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Import the ssz colour palette so the plots use the official colours of the
# City of Zurich.
# NOTE(review): the 'harmonic12' colormap used by the final word cloud is
# presumably made available by this register() call -- confirm.
import sszpalette
sszpalette.register()

In [None]:
# Fetch the Punkt sentence-tokenizer data used by nltk.sent_tokenize below.
# Newer NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are requested; an NLTK version that does not know
# one of the ids only reports it and carries on.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Force the output to display the full text of every column instead of
# truncating it. `None` means "no limit"; the former `-1` sentinel has been
# deprecated since pandas 1.0 and is no longer accepted by current releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Load signature, title, year and full text of every report from the local
# SQLite database into a pandas data frame.
# The connection is only needed for this single query, so it is closed again
# right away (even if the query fails) instead of staying open for the
# lifetime of the kernel. No separate cursor is required: pandas runs the
# query on the connection itself.
conn = sqlite3.connect('data.sqlite')
try:
    report_df = pd.read_sql('select signatur, titel, jahr, report_text from data', conn)
finally:
    conn.close()

# Display the top records from the data frame (without the long report text)
report_df[['signatur', 'titel', 'jahr']].head()

In [None]:
# Word count per report. The text is split on any run of whitespace, so line
# breaks and tabs separate words as well and repeated blanks do not produce
# empty "words"; a missing text counts as 0 words.
report_df['word_count'] = (
    report_df['report_text']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
)
report_df.word_count.describe()

In [None]:
# German stop words from the `stop_words` package, extended by a few custom
# stop words
stop_words = set(get_stop_words("de", cache=False)) | {'Jahr', 'Berichtsjahr', 'Fr', 'Po'}

# Show how many words are in the list of stop words, then the words themselves
print(len(stop_words))
print(stop_words)

In [None]:
# Clean every report text: lower-case it and collapse each run of digits,
# punctuation, whitespace and other non-word characters into a single space.
# One pass is enough -- replacing the non-word characters by spaces first
# (as was done before) yields exactly the same result. The pattern is a raw
# string so that `\d` / `\W` reach the regex engine without relying on
# Python's handling of unknown string escapes.
non_word_run = re.compile(r"(?:\d|\W)+")

# Iterate over the values directly so the loop does not depend on the data
# frame having a default 0..n-1 index.
clean_text = [
    non_word_run.sub(" ", text.lower())
    for text in report_df['report_text']
]

# Assign the cleaned texts to the data frame
report_df['clean_text'] = clean_text

In [None]:
# Display the last record of the data frame as a spot check
report_df.tail(1)

# Texte analysieren

Im nächsten Schritt wird jeder Text analysiert und die Nomen werden extrahiert (weitere Möglichkeiten wären Stemming oder Lemmatizing).

**ACHTUNG**: Der folgende Block dauert über den ganzen Korpus sehr lange, Laufzeit ca. 4h!

In [None]:
# Extract the nouns of every report with the Hanover Tagger.
# For each report the text is split into sentences, every sentence is
# tokenized and tagged, and the lemma of each token tagged "NN" or "NE" is
# kept. The result is one space-joined string of lemmas per report, in the
# same order as the rows of report_df.
all_nouns = []
skipped_sentences = 0  # sentences the tagger failed on (see below)
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

# Iterate over the texts themselves so the loop does not depend on the data
# frame having a default 0..n-1 index.
for report_text in tqdm(report_df['report_text'], desc="Years"):
    sentences = nltk.sent_tokenize(report_text, language='german')
    sentences_tok = [nltk.tokenize.word_tokenize(sent) for sent in sentences]

    nouns = []
    for sent in tqdm(sentences_tok, desc="Sentences", colour="#03c2fc", leave=False):
        try:
            tags = tagger.tag_sent(sent)
        except KeyError:
            # Best effort: a sentence the tagger cannot handle is left out,
            # but counted so the loss is visible afterwards.
            skipped_sentences += 1
            continue
        # keep noun lemmas only and drop single character words
        nouns.extend(
            lemma
            for (word, lemma, pos) in tags
            if pos in ("NN", "NE") and len(lemma) > 1
        )
    all_nouns.append(" ".join(nouns))

if skipped_sentences:
    print(f"Skipped {skipped_sentences} sentence(s) that the tagger could not process (KeyError).")

In [None]:
# Attach the extracted nouns (one space-joined string per report) to the
# data frame as a new column
report_df['nouns'] = all_nouns
# Display the last record to check the result
report_df.tail(1)

In [None]:
# Disk cache for the processed data frame (including the expensive 'nouns'
# column). If the cache file exists it is loaded, so the noun extraction does
# not have to be repeated; if it does not exist yet, the data frame computed
# in this session is written to it instead of failing.
# NOTE: unpickling can execute arbitrary code -- only load a pickle file that
# you created yourself.
processed_path = "./report_processed.pkl"
try:
    report_df = pd.read_pickle(processed_path)
except FileNotFoundError:
    if 'nouns' not in report_df.columns:
        # Neither a cache to load nor computed nouns to save: surface the
        # original "file not found" error.
        raise
    report_df.to_pickle(processed_path)

In [None]:
# Calculate the frequency of every noun over the whole corpus and keep the
# 1000 most frequent ones as a {word: count} dictionary
all_tokens = ' '.join(report_df['nouns']).split()
top_counts = pd.Series(all_tokens).value_counts().iloc[:1000]
word_frequency = top_counts.to_dict()
word_frequency

In [None]:
# Quick word cloud over all extracted nouns, with the stop words filtered out
# by WordCloud itself.
# The nouns are read from the data frame rather than from the `all_nouns`
# list: that list only exists in a session that ran the long noun extraction,
# whereas the 'nouns' column is also available when the processed data frame
# was loaded from disk.
wc = WordCloud(background_color='white', stopwords=stop_words, collocations=False)
wc.generate(" ".join(report_df['nouns']))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# Drop every stop word from the frequency table (exact, case-sensitive match)
clean_freq = {
    word: count
    for word, count in word_frequency.items()
    if word not in stop_words
}
clean_freq

In [None]:
# Set the word cloud parameters and build the cloud from the cleaned
# frequency table
cloud_options = dict(
    width=1800,
    height=1800,
    background_color='white',
    colormap='harmonic12',
    max_words=1000,
    min_font_size=20,
    min_word_length=2,
)
wordcloud = WordCloud(**cloud_options)
wordcloud.generate_from_frequencies(clean_freq)

# Plot the word cloud on a large figure with the axes hidden
fig = plt.figure(figsize=(20, 15), facecolor=None)
ax = fig.gca()
ax.imshow(wordcloud)
ax.axis('off')
plt.show()
#fig.savefig("wordcloud.png")