In [1]:
import pandas as pd
import re
import nltk
import spacy
from nltk.corpus import stopwords
from num2words import num2words

# One-time downloads of the NLTK resources used by this notebook
for _resource in ("punkt", "stopwords", "punkt_tab"):
    nltk.download(_resource)

# German stop-word set (NLTK) and the small German spaCy pipeline
stop_words = set(stopwords.words("german"))
nlp = spacy.load("de_core_news_sm")


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
def clean_url(text):
    """Remove URL-like substrings from ``text``.

    Every run of non-whitespace characters that starts with ``http`` (and has
    at least one character after it) is deleted. Surrounding whitespace is
    left as it is.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)

def fix_contraction(text):
    """Placeholder pipeline step for expanding contractions.

    Currently a no-op: no German contractions are handled, so the input is
    handed back unchanged.
    """
    return text

def clean_non_alphanumeric(text):
    """Replace every disallowed character in ``text`` with a single space.

    Allowed characters are ASCII letters, ASCII digits and the German
    characters ä, ö, ü, Ä, Ö, Ü and ß. Each other character (including
    whitespace) becomes one space, so the string length is preserved.
    """
    disallowed = re.compile(r"[^a-zA-Z0-9äöüÄÖÜß]")
    return disallowed.sub(" ", text)

def clean_lowercase(text):
    """Convert ``text`` to ``str`` and return it in lower case."""
    as_string = str(text)
    return as_string.lower()

def clean_tokenization(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize`` (German setting)."""
    tokens = nltk.word_tokenize(text, language="german")
    return tokens

def clean_stopwords(tokens):
    """Return the tokens that are not members of the module-level ``stop_words``.

    Order and duplicates of the remaining tokens are preserved.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def numbers_to_words(tokens):
    """Spell out numeric tokens as German words.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        A new list with the same length and order as ``tokens``. Tokens for
        which ``str.isnumeric()`` is true are replaced by
        ``num2words(token, lang="de")``; all other tokens are passed through
        unchanged. A numeric token that ``num2words`` cannot convert is kept
        as it is.
    """
    output = []
    for token in tokens:
        if not token.isnumeric():
            output.append(token)
            continue
        try:
            output.append(num2words(token, lang="de"))
        except Exception:
            # Best-effort conversion: keep the raw token when num2words fails.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate so a long run can still be stopped.
            output.append(token)
    return output

def clean_lemmatization(tokens):
    """Lemmatize tokens with the module-level spaCy pipeline ``nlp``.

    The tokens are joined with single spaces, the resulting string is passed
    to ``nlp``, and the ``lemma_`` of every token in the returned document is
    collected, in document order.
    """
    joined = " ".join(tokens)
    lemmas = []
    for parsed_token in nlp(joined):
        lemmas.append(parsed_token.lemma_)
    return lemmas

def clean_length(tokens):
    """Keep only the tokens that are longer than two characters."""
    return list(filter(lambda word: len(word) > 2, tokens))

def count_word_frequencies(tokens):
    """Compute relative token frequencies.

    Returns a list of ``(word, share)`` pairs in the order given by
    ``nltk.FreqDist(tokens).most_common()``, where ``share`` is the word's
    count divided by ``len(tokens)``. An empty input yields an empty list.
    """
    distribution = nltk.FreqDist(tokens)
    relative = []
    for word, count in distribution.most_common():
        relative.append((word, count / len(tokens)))
    return relative

def convert_to_string(tokens):
    """Join the tokens into one string, separated by single spaces."""
    separator = " "
    return separator.join(tokens)

def pre_processing_pipeline(text):
    """Run the full preprocessing chain on a single text value.

    Steps, in order: URL removal, contraction placeholder, replacement of
    non-alphanumeric characters, lower-casing, tokenization, number spelling,
    stop-word removal, lemmatization and removal of short tokens.

    Returns:
        A tuple ``(processed_text, tokens, freqs)``: the tokens joined into a
        string, the token list, and the list produced by
        ``count_word_frequencies``. For a missing value (``pd.isna``) or when
        any step raises, ``("", [], [])`` is returned; in the error case the
        exception is printed and not re-raised.
    """
    try:
        if pd.isna(text):
            return "", [], []

        # String-level cleanup, applied in this exact order.
        for string_step in (
            clean_url,
            fix_contraction,
            clean_non_alphanumeric,
            clean_lowercase,
        ):
            text = string_step(text)

        tokens = clean_tokenization(text)

        # Token-level steps, applied in this exact order.
        for token_step in (
            numbers_to_words,
            clean_stopwords,
            clean_lemmatization,
            clean_length,
        ):
            tokens = token_step(tokens)

        freqs = count_word_frequencies(tokens)
        return convert_to_string(tokens), tokens, freqs
    except Exception as e:
        print("❌ Fehler bei Text:", "→", e)
        return "", [], []


In [3]:
import os
import pandas as pd
from multiprocessing import Pool

# Input and output folders
input_folder = "splits_small"
output_folder = "results"
os.makedirs(output_folder, exist_ok=True)

# Collect all CSV files from the input folder, sorted by name
csv_files = sorted(
    entry for entry in os.listdir(input_folder) if entry.endswith(".csv")
)

# Worker function: process a single CSV file
def process_single_file(file):
    """Run the preprocessing pipeline over the ``text`` column of one CSV file.

    Reads ``input_folder/file``, applies ``pre_processing_pipeline`` to every
    value of the ``text`` column and writes the frame, extended by the columns
    ``processed_text``, ``tokens`` and ``freqs``, to
    ``output_folder/<name without extension>_processed.csv``.

    Problems (missing ``text`` column, any exception while reading,
    processing or writing) are reported with ``print`` and not re-raised.

    Args:
        file: File name inside ``input_folder`` (not a full path).

    Returns:
        None.
    """
    path = os.path.join(input_folder, file)
    print(f"🚀 Starte Datei: {file}")  # live feedback when a file starts

    try:
        df = pd.read_csv(path)
        if "text" not in df.columns:
            print(f"❌ Spalte 'text' fehlt in {file}")
            return

        # Each pipeline result is a (processed_text, tokens, freqs) tuple.
        results = df["text"].apply(pre_processing_pipeline)
        df["processed_text"] = results.apply(lambda x: x[0])
        df["tokens"] = results.apply(lambda x: x[1])
        df["freqs"] = results.apply(lambda x: x[2])

        # Swap only the final extension; str.replace would rewrite every
        # ".csv" occurrence inside the file name.
        stem, _ = os.path.splitext(file)
        out_path = os.path.join(output_folder, f"{stem}_processed.csv")
        df.to_csv(out_path, index=False)
        print(f"✅ Gespeichert: {out_path}")
    except Exception as e:
        print(f"💥 Fehler bei {file}: {e}")

# Start parallel processing: distribute the files over 15 worker processes
if __name__ == "__main__":
    with Pool(15) as workers:
        workers.map(process_single_file, csv_files)


🚀 Starte Datei: split_10_part14.csv🚀 Starte Datei: split_10_part4.csv🚀 Starte Datei: split_10_part1.csv🚀 Starte Datei: split_11_part13.csv🚀 Starte Datei: split_10_part9.csv🚀 Starte Datei: split_11_part18.csv
🚀 Starte Datei: split_11_part5.csv🚀 Starte Datei: split_12_part1.csv
🚀 Starte Datei: split_12_part3.csv🚀 Starte Datei: split_12_part8.csv
🚀 Starte Datei: split_13_part12.csv
🚀 Starte Datei: split_13_part4.csv

🚀 Starte Datei: split_13_part9.csv
🚀 Starte Datei: split_14_part5.csv🚀 Starte Datei: split_14_part13.csv







✅ Gespeichert: results/split_10_part14_processed.csv
🚀 Starte Datei: split_10_part15.csv
✅ Gespeichert: results/split_10_part9_processed.csv
🚀 Starte Datei: split_11_part1.csv
✅ Gespeichert: results/split_10_part1_processed.csv
🚀 Starte Datei: split_10_part10.csv
✅ Gespeichert: results/split_10_part4_processed.csv
🚀 Starte Datei: split_10_part5.csv
✅ Gespeichert: results/split_11_part18_processed.csv
🚀 Starte Datei: split_11_part19.csv
✅ Gespeichert: results/split_1