# Block 5: eigene Funktionen schreiben für Preprocessing von Texten

## Grundgesetz herunterladen

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Download the HTML page that contains the German Basic Law (Grundgesetz).
url = "https://www.bundestag.de/gg"
# A timeout keeps the cell from blocking forever if the server never answers.
response = requests.get(url, timeout=30)
# Stop on HTTP errors (4xx/5xx) instead of processing an error page as if it
# were the law text.
response.raise_for_status()
html = response.text

In [None]:
# Parse the downloaded HTML and pull the plain text out of the first element
# whose CSS class is "bt-standard-content".
soup = BeautifulSoup(html, "html.parser")
html_text = soup.find(attrs={"class": "bt-standard-content"})
if html_text is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError(
        "Kein Element mit der Klasse 'bt-standard-content' gefunden")
text = html_text.get_text()

In [None]:
# Store the extracted text as UTF-8 so the following cells can work offline.
# The handle is called `outfile` because it is opened for writing, matching
# the other cells in this notebook that write files.
with open("grundgesetz.txt", "w", encoding="utf-8") as outfile:
    outfile.write(text)

## Textdatei einlesen

In [None]:
# Manual variant: open the file, read its whole content into one string and
# close the handle explicitly. If read() raised, close() would be skipped.
infile = open("grundgesetz.txt", encoding="utf-8")
grundgesetz = infile.read()
infile.close()

Alternative:

In [None]:
# Context-manager variant: the file is closed automatically when the
# with-block ends, even if read() raises.
with open("grundgesetz.txt", encoding="utf-8") as infile:
    grundgesetz = infile.read()

## Text vorbereiten

### Interpunktion entfernen

In [None]:
def remove_punc(text, punctuation='!@#$%^&*()_-+={}[]:;"\'|<>,.?/~`'):
    """Return *text* with every character listed in *punctuation* removed.

    Args:
        text: The string to clean.
        punctuation: String whose individual characters are deleted from
            *text*. The default covers common ASCII punctuation only;
            pass a different string to also strip characters such as
            "§", "„" or "–".

    Returns:
        A new string without the given characters. Whitespace, letters and
        digits are left untouched.
    """
    # One translation table deletes all characters in a single pass instead
    # of running one replace() over the whole text per character.
    return text.translate(str.maketrans("", "", punctuation))

## Alternative aus der Programming-Historian-Lektion *Normalizing Textual Data with Python*

In [None]:
def stripNonAlphaNum(text):
    """Split *text* at every run of non-alphanumeric characters.

    "Alphanumeric" follows the Unicode definition used by the regex class
    ``\\W``, so letters such as "ä" or "ß" are kept inside words.

    Returns:
        A list of the pieces between the separators. The list can contain
        empty strings when *text* starts or ends with a separator.
    """
    import re

    separator = r'\W+'
    return re.split(separator, text, flags=re.UNICODE)

### Text säubern und Kleinschreibung

In [None]:
def clean_text(text):
    """Return *text* in lower case with punctuation removed by remove_punc."""
    lowered = text.lower()
    return remove_punc(lowered)

# Normalise the raw text; this rebinds `text` to the cleaned version.
text = clean_text(grundgesetz)

## Text in Worte aufteilen und in Liste speichern

In [None]:
# split() without arguments separates at any run of whitespace
# (spaces, tabs, line breaks) and never yields empty strings.
words = text.split()

In [None]:
# Show the total number of tokens and, after a separator line, the first ten
# tokens as a quick sanity check.
print("Anzahl aller Worte des Textes: ", (len(words)))
print("=======")
print(words[:10])

## bestimmtes Wort zählen

In [None]:
# Count how often one specific word occurs by walking over the token list.
# The comparison is exact, so only tokens identical to "gesetz" are counted.
number_of_hits = 0
item_to_count = "gesetz"
for word in words:
    if word == item_to_count:
        number_of_hits += 1

print(number_of_hits)

### Methode .count()

In [None]:
# str.count() counts substring occurrences in the raw text. It is
# case-sensitive, so "Gesetz" is not matched, but "gesetz" inside longer
# words such as "Grundgesetz" is.
grundgesetz.count("gesetz")

In [None]:
# Same substring count on the cleaned, lower-cased text: now every former
# "Gesetz" matches as well, still including hits inside longer words.
text.count("gesetz")

In [None]:
# Requiring a space on both sides excludes hits inside longer words, but it
# also misses occurrences that touch a line break or the start/end of text.
text.count(" gesetz ")

## Funktion zum Zählen eines bestimmten Wortes

In [None]:
def count_in_list(item_to_count, list_to_search):
    """Return how many elements of *list_to_search* equal *item_to_count*.

    The comparison uses ``==``, so for strings it is exact and
    case-sensitive. An empty list yields 0.
    """
    return sum(1 for candidate in list_to_search
               if candidate == item_to_count)

# Example call: number of tokens that are exactly "freiheit".
print(count_in_list("freiheit", words))

## jedes Wort zählen

In [None]:
# Print the frequency of each of the first ten tokens. Every call scans the
# whole list again, and a token that appears twice among the first ten is
# printed twice.
for word in words[:10]:
    print(word, count_in_list(word, words))

## Funktion: Zählen aller Wörter (unique)

In [None]:
def counter(list_to_search):
    """Print the number of distinct words and the frequency of each word.

    Args:
        list_to_search: Iterable of hashable items (here: word strings).

    Returns:
        None. The header line and one "word count" line per distinct word
        are printed. Words appear in order of first occurrence.
    """
    # Count everything in a single pass; rescanning the whole list once per
    # distinct word would take time proportional to len(list) * len(unique).
    frequencies = {}
    for word in list_to_search:
        frequencies[word] = frequencies.get(word, 0) + 1

    print("Anzahl der mindestens einmal vorkommenden Worte: ",
          len(frequencies))
    for word, frequency in frequencies.items():
        print(word, frequency)
    
# Print the frequency of every distinct token of the text.
counter(words)

## Wörter zählen mit einem Dictionary

In [None]:
def counter_dict(list_to_search):
    """Return a dict that maps each item of *list_to_search* to its count.

    Keys are inserted in order of first occurrence. An empty input yields
    an empty dict.
    """
    tally = {}
    for token in list_to_search:
        # get() supplies 0 for tokens that have not been seen yet.
        tally[token] = tally.get(token, 0) + 1
    return tally

# Show the word -> frequency dictionary for the whole text. The function
# defined above is called counter_dict.
counter_dict(words)

## Ergebnis der Wortverteilung in eine Datei schreiben

In [None]:
# Count every word once, then write one "word => frequency" line per entry
# in the order in which the words first appear in the text.
frequency_distribution = counter_dict(words)

# The encoding name must be exactly "utf-8"; an unknown name makes open()
# raise LookupError.
with open("grundgesetz-wortverteilung.txt", "w",
          encoding="utf-8") as outfile:
    for word, frequency in frequency_distribution.items():
        outfile.write(word + " => " + str(frequency) + '\n')
    

## Wortverteilung absteigend sortieren

In [None]:
def freq_count(list_to_search):
    """Return ``(count, word)`` tuples sorted from most to least frequent.

    Because whole tuples are compared, words with the same count are
    ordered by the word itself in descending order.
    """
    tally = counter_dict(list_to_search)
    pairs = [(count, word) for word, count in tally.items()]
    return sorted(pairs, reverse=True)

# Show the 20 most frequent tokens as (count, word) tuples.
freq_count(words)[:20]

### Variante 2: sortieren

In [None]:
def freq_count_2(list_to_search):
    """Return ``(word, count)`` tuples sorted by count, highest first.

    Only the count is used as sort key; words with equal counts keep the
    order in which they first occur in *list_to_search*.
    """
    tally = counter_dict(list_to_search)
    return sorted(tally.items(), key=lambda pair: pair[1], reverse=True)

# Show the 20 most frequent tokens as (word, count) tuples.
freq_count_2(words)[:20]

## Ergebnis der sortierten Wortverteilung in eine Datei schreiben

In [None]:
# freq_count() yields (count, word) tuples, most frequent first; write them
# as one "word => count" line each.
frequency_distribution_2 = freq_count(words)

with open("grundgesetz-wortverteilung-sortiert.txt", "w",
          encoding="utf-8") as outfile:
    for frequency, word in frequency_distribution_2:
        line = " => ".join((word, str(frequency)))
        outfile.write(line + '\n')

In [None]:
def write_frequency_distribution(
        list_to_search,
        filename="grundgesetz-wortverteilung-sortiert_f.txt"):
    """Write the word frequencies of *list_to_search* to a text file.

    Args:
        list_to_search: List of words to count.
        filename: Path of the output file. It is overwritten if it exists.
            The default keeps the file name this function always used, so
            existing calls behave as before; pass another name to keep the
            results of several runs side by side.

    Returns:
        None. The file contains one "word => count" line per distinct word,
        ordered from most to least frequent (as returned by freq_count_2).
    """
    frequency_distribution = freq_count_2(list_to_search)

    with open(filename, "w", encoding="utf-8") as outfile:
        outfile.writelines(
            word + " => " + str(frequency) + '\n'
            for word, frequency in frequency_distribution)
            
# Write the sorted distribution of all tokens to disk.
write_frequency_distribution(words)

## Entfernen von Stoppwörtern

In [None]:
def remove_stopwords(list_to_search, stopword_file="stoppwortliste_raw.txt"):
    """Return the words of *list_to_search* that are not stop words.

    Args:
        list_to_search: List of word strings to filter.
        stopword_file: Path of a UTF-8 text file with whitespace-separated
            stop words. Defaults to the raw stop-word list.

    Returns:
        A new list with the remaining words in their original order
        (duplicates are kept).

    Raises:
        FileNotFoundError: If *stopword_file* does not exist.
    """
    with open(stopword_file, encoding="utf-8") as file:
        # A set makes each membership test constant-time; with a list every
        # word of the text would be compared against every stop word.
        stopwords = set(file.read().split())
    return [w for w in list_to_search if w not in stopwords]
  
# Show the token list after filtering with the raw stop-word list.
remove_stopwords(words)

In [None]:
def remove_stopwords_prepared(list_to_search,
                              stopword_file="stoppwortliste_prepared.txt"):
    """Return the words of *list_to_search* that are not stop words.

    Works like remove_stopwords but reads the prepared stop-word list by
    default.

    Args:
        list_to_search: List of word strings to filter.
        stopword_file: Path of a UTF-8 text file with whitespace-separated
            stop words.

    Returns:
        A new list with the remaining words in their original order
        (duplicates are kept).

    Raises:
        FileNotFoundError: If *stopword_file* does not exist.
    """
    with open(stopword_file, encoding="utf-8") as file:
        # A set makes each membership test constant-time; with a list every
        # word of the text would be compared against every stop word.
        stopwords = set(file.read().split())
    return [w for w in list_to_search if w not in stopwords]
  
# Show the token list after filtering with the prepared stop-word list.
remove_stopwords_prepared(words)

## Workflow: Funktionsaufrufe

In [None]:
# Complete pipeline with the raw stop-word list:
# read file -> clean text -> tokenise -> drop stop words -> write frequencies.
with open("grundgesetz.txt", encoding="utf-8") as infile:
    grundgesetz = infile.read()

text = clean_text(grundgesetz)
words = text.split()
clean_words = remove_stopwords(words)
# Writes to the default output file of write_frequency_distribution.
write_frequency_distribution(clean_words)

In [None]:
# Same pipeline, but filtering with the prepared stop-word list.
with open("grundgesetz.txt", encoding="utf-8") as infile:
    grundgesetz = infile.read()

text = clean_text(grundgesetz)
words = text.split()
clean_words = remove_stopwords_prepared(words)
# NOTE: this call uses the same default output file as the previous workflow
# cell, so the earlier result on disk is overwritten.
write_frequency_distribution(clean_words)

## Visualisieren der Worthäufigkeiten

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# (count, word) tuples of the stop-word-free tokens, most frequent first.
data = freq_count(clean_words)

In [None]:
# Vertical bar chart of the most frequent words.
sns.set_style("white")

fig, ax = plt.subplots()
fig.dpi = 150

# Number of most frequent words to display.
wert = 25

# `data` holds (count, word) tuples. The labels get their own name so that
# the global token list `words` used by the earlier cells is not overwritten.
top_entries = data[:wert]
labels = [word for _, word in top_entries]
values = [int(count) for count, _ in top_entries]
# Derive the positions from the data so ticks and labels always match, even
# if fewer than `wert` distinct words exist.
positions = list(range(len(labels)))
mybar = ax.bar(positions, values, alpha=0.4)

ax.set_xlabel('Wort Index')
# Fix the tick positions first; set_xticklabels assigns labels to the ticks
# that exist at the time it is called.
ax.set_xticks(positions)
ax.set_xticklabels(labels, rotation=90)
ax.set_ylabel('Häufigkeiten')
ax.set_title('Worthäufigkeiten')

# To save the figure, uncomment the next line; it has to run before
# plt.show(), which may close the figure.
#plt.savefig("Barchart_Worthaeufigkeiten.png", dpi=300, bbox_inches="tight")
plt.show()

![Balkendiagramm](img_Preprocessing/Barchart_Worthaeufigkeiten.PNG)

In [None]:
# Horizontal bar chart of the most frequent words, highest count on top.
sns.set_style("white")

fig, ax = plt.subplots()
fig.dpi = 150

# Number of most frequent words to display.
wert = 25

# `data` holds (count, word) tuples. The labels get their own name so that
# the global token list `words` used by the earlier cells is not overwritten.
top_entries = data[:wert]
labels = [word for _, word in top_entries]
values = [int(count) for count, _ in top_entries]
# Derive the positions from the data so ticks and labels always match, even
# if fewer than `wert` distinct words exist.
positions = list(range(len(labels)))
mybar = ax.barh(positions, values, alpha=0.4)

ax.set_xlabel('Häufigkeiten')
# Fix the tick positions first; set_yticklabels assigns labels to the ticks
# that exist at the time it is called.
ax.set_yticks(positions)
ax.set_yticklabels(labels)
# Put the first (most frequent) word at the top of the chart.
ax.invert_yaxis()
ax.set_ylabel('Wort Index')
ax.set_title('Worthäufigkeiten')

# To save the figure, uncomment the next line; it has to run before
# plt.show(), which may close the figure.
#plt.savefig("HBarchart_Worthaeufigkeiten.png", dpi=300, bbox_inches="tight")
plt.show()

![Horizontales Balkendiagramm](img_Preprocessing/HBarchart_Worthaeufigkeiten.PNG)