# **Imports**

In [1]:
import os
import re
import csv
from collections import Counter
import nltk
from nltk.corpus import stopwords
import ssl
import shutil

# **Configs**

In [2]:
INPUT_PATH = '../1 data_preprocessing/output'
OUTPUT_PATH = './top_words'

# **Downlad nltk stopwords**

In [3]:
# using SSL to download nltk (code from https://stackoverflow.com/questions/41348621/ssl-error-downloading-nltk-data)
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
stop_words = set(stopwords.words('german'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niklas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Functions**

In [4]:
def top_words(filepath, top_n=30):
    file = open(filepath, 'r', encoding='utf-8')
    text = file.read()
    file.close()

    text = text.lower()
    text = re.sub('[^a-zäöüß]', ' ', text)

    single_words = text.split()

    counter1 = Counter()
    for word in single_words:
        if word not in stop_words:
            counter1[word] += 1

    top_words = counter1.most_common(top_n)

    return top_words

# **Process all files**

In [6]:
if not os.path.exists(OUTPUT_PATH):
    os.makedirs(OUTPUT_PATH)

for filename in os.listdir(INPUT_PATH):
    if filename.endswith('.txt'):
        file_path = os.path.join(INPUT_PATH, filename)
        top_worte_liste = top_words(file_path)

        csv_dateiname = filename.replace('.txt', '_top_worte.csv')
        csv_pfad = os.path.join(OUTPUT_PATH, csv_dateiname)

        with open(csv_pfad, 'w', newline='', encoding='utf-8') as csv_datei:
            csv_schreiber = csv.writer(csv_datei)
            csv_schreiber.writerow(['Wort', 'Anzahl'])
            for wort, anzahl in top_worte_liste:
                csv_schreiber.writerow([wort, anzahl])

# **Copy files into frontend/src**

In [10]:
# Copy folder into react frontend src folder so that react can access the data
shutil.copytree('./top_words', '../../frontend/src/pages/charts/data/top_words', dirs_exist_ok=True)

'../../frontend/src/pages/charts/data/top_words'