In [1]:
import os
import re
from gensim import corpora
from gensim.models import LdaMulticore
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
import random

In [2]:
# Name of the German spaCy pipeline; loaded once and shared via the global `nlp`.
SPACY_MODEL_NAME = "de_core_news_md"
nlp = spacy.load(SPACY_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Add custom, corpus-specific stopwords on top of spaCy's German defaults
custom_stopwords = {"o", "d", "D", "v", "est", "ad", "qui", "non", "ut",
                    "sed", "deß", "et", "te", "at", "quae", "cum", "sic",
                    "hoc", "m", "--", "Herr", "herr", "Gott", "gott"}

# Extend the shared default set that spaCy consults when it creates new
# vocabulary entries.
nlp.Defaults.stop_words.update(custom_stopwords)

# spaCy stores the is_stop flag on each vocabulary entry (lexeme) once it has
# been created, so words the pipeline has already seen (e.g. when this cell is
# re-run after some text was processed) would keep their old flag. Setting the
# flag explicitly makes the cell work regardless of execution order.
for word in custom_stopwords:
    nlp.vocab[word].is_stop = True


In [4]:

# Define preprocessing function
def preprocess_text_spacy(text):
    """
    Normalize, tokenize, remove stopwords, and lemmatize German text using spaCy.

    The text is lower-cased; every character other than a-z, ä, ö, ü, ß,
    whitespace, ``.``, ``!``, ``?``, slash and backslash is deleted; and
    stand-alone single-character words are removed. The cleaned text is then
    run through the module-level ``nlp`` pipeline.

    Note: no language detection or sentence filtering happens here. Latin
    material is only reduced as far as its words appear in the stopword set.

    Args:
        text (str): Raw document text.

    Returns:
        list[str]: Lemmas of all alphabetic tokens for which neither the
        token itself nor its lemma is a stopword.
    """
    text = text.lower()
    # Keep German letters, sentence punctuation and slashes (used for sentence separation)
    text = re.sub(r'[^a-zäöüß\s.!?\\/]', '', text)
    text = re.sub(r'\b\w\b', '', text)  # Remove single-character words

    # Process text with spaCy
    doc = nlp(text)

    stop_words = nlp.Defaults.stop_words
    tokens = []
    for token in doc:
        if token.is_stop or not token.is_alpha:
            continue
        lemma = token.lemma_
        # token.is_stop only looks at the surface form, so an inflected form
        # such as "gottes" would pass and be emitted as its lemma. Check the
        # lemma as well (case-insensitively) so that stopwords stay out of the
        # vocabulary in every inflection.
        if lemma.lower() in stop_words:
            continue
        tokens.append(lemma)

    return tokens

In [5]:
# Read text files from folder
def load_documents(folder_path):
    """
    Load all .txt files from the specified folder and return a list of documents.

    Files are read as UTF-8 in sorted filename order, so the resulting list
    (and everything derived from it, such as the corpus order) is the same on
    every run and platform.

    Args:
        folder_path (str): Directory containing the ``.txt`` files.

    Returns:
        list[str]: One string per file, holding the file's full contents.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)
        # Skip directories that merely happen to be named "*.txt".
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            documents.append(file.read())
    return documents


In [6]:
# Preprocess and create dictionary & corpus
def preprocess_and_create_corpus(documents, no_below=3, no_above=0.5):
    """
    Preprocess documents and create dictionary and corpus for topic modeling.

    Args:
        documents (list[str]): Raw document texts.
        no_below (int): Passed to ``Dictionary.filter_extremes``; tokens that
            occur in fewer than this many documents are dropped.
        no_above (float): Passed to ``Dictionary.filter_extremes``; tokens that
            occur in more than this fraction of the documents are dropped.

    Returns:
        tuple: ``(processed_docs, dictionary, corpus)`` where
        ``processed_docs`` is the list of token lists produced by
        ``preprocess_text_spacy``, ``dictionary`` is the filtered gensim
        dictionary and ``corpus`` is the bag-of-words form of each document.
    """
    print("Preprocessing text...")
    processed_docs = [preprocess_text_spacy(doc) for doc in documents]

    print("Creating dictionary and corpus...")
    dictionary = corpora.Dictionary(processed_docs)
    # Prune very rare and very common tokens before building the bag-of-words.
    dictionary.filter_extremes(no_below=no_below, no_above=no_above)
    corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    return processed_docs, dictionary, corpus


In [7]:

# Run preprocessing
def run_preprocessing(folder_path, num_topics=5, num_words=10):
    """
    Load the text files in ``folder_path`` and build the inputs for LDA.

    ``num_topics`` and ``num_words`` are accepted but not used by this
    function.

    Returns:
        tuple: ``(dictionary, corpus, processed_docs)``.
    """
    # Step 1: read the raw texts from disk
    print("Loading documents...")
    raw_texts = load_documents(folder_path)
    print(f"{len(raw_texts)} documents loaded.")

    # Step 2: tokenize and build dictionary + bag-of-words corpus
    tokenized, vocab, bow_corpus = preprocess_and_create_corpus(raw_texts)

    # The return order differs from the helper's: dictionary first.
    return vocab, bow_corpus, tokenized


In [8]:

# Train LDA model and display topics
def train_lda(corpus, dictionary, num_topics=5, num_words=10, passes=10, random_state=100, workers=12):
    """
    Train an LDA model using the given corpus and dictionary.

    Args:
        corpus (list): The bag-of-words representation of the corpus.
        dictionary (gensim.corpora.Dictionary): The dictionary mapping word IDs to words.
        num_topics (int): Number of topics to fit.
        num_words (int): Number of words shown per topic in the printed summary.
        passes (int): Number of training passes over the corpus.
        random_state (int): Seed passed to the model.
        workers (int): Number of worker processes for ``LdaMulticore``. The
            default of 12 keeps the previous behaviour; lower it on machines
            with fewer cores.

    Returns:
        LdaMulticore: The trained model.
    """
    print(f"Training LDA model with {num_topics} topics...")
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        random_state=random_state,
        workers=workers,
    )
    print("Num_topics ", num_topics, ", passes", passes, ", random_state ", random_state)
    print("Topics:")
    # Ask for all num_topics explicitly; print_topics() would otherwise cap
    # the listing at its own default.
    for idx, topic in lda_model.print_topics(num_topics=num_topics, num_words=num_words):
        print(f"Topic {idx}: {topic}")
    return lda_model


In [9]:
# Specify the folder path
folder_path = "corpus_norm"

# Build dictionary and corpus. run_preprocessing() does not use num_topics /
# num_words, so they are not passed here; topic counts are chosen at training time.
dictionary, corpus, processed_docs = run_preprocessing(folder_path)

Loading documents...
445 documents loaded.
Preprocessing text...
Creating dictionary and corpus...


In [10]:
def train_and_save_lda(corpus, dictionary, passes_list, topics_list, output_folder="topics_no_herrgott_filt",
                       random_state=42, workers=6, num_words=10):
    """
    Train LDA models with different passes and topic numbers, then save results.

    For every combination of ``passes`` and ``num_topics`` one model is
    trained. Its topics are written to
    ``lda_topics_{num_topics}topics_{passes}passes.txt`` and the model itself
    is saved next to it with the ``.model`` extension.

    Args:
        corpus (list): The bag-of-words representation of the corpus.
        dictionary (gensim.corpora.Dictionary): The dictionary mapping word IDs to words.
        passes_list (list): List of values for passes.
        topics_list (list): List of values for number of topics.
        output_folder (str): Folder to save the output files.
        random_state (int): Seed passed to every model.
        workers (int): Number of worker processes for ``LdaMulticore``.
        num_words (int): Number of words written per topic.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Iterate through all combinations of passes and topics
    for passes in passes_list:
        for num_topics in topics_list:
            print(f"Training LDA model with {num_topics} topics and {passes} passes...")

            # Train LDA model using multiple cores
            lda_model = LdaMulticore(
                corpus=corpus, id2word=dictionary,
                num_topics=num_topics, passes=passes,
                random_state=random_state, workers=workers
            )

            # Define file paths
            base_filename = f"lda_topics_{num_topics}topics_{passes}passes"
            txt_filepath = os.path.join(output_folder, f"{base_filename}.txt")
            model_filepath = os.path.join(output_folder, f"{base_filename}.model")

            # Save topics to a text file
            with open(txt_filepath, "w", encoding="utf-8") as file:
                file.write(f"LDA Model with {num_topics} Topics and {passes} Passes\n")
                file.write("=" * 50 + "\n")
                # print_topics() returns only 20 topics unless told otherwise;
                # request all of them so models with more than 20 topics are
                # written out completely.
                for idx, topic in lda_model.print_topics(num_topics=num_topics, num_words=num_words):
                    file.write(f"Topic {idx}: {topic}\n")
                file.write("\n")

            # Save the trained LDA model
            lda_model.save(model_filepath)

            print(f"Saved: {txt_filepath}")
            print(f"Saved model: {model_filepath}")



In [11]:
# Hyper-parameter grid: every passes value is combined with every topic count
passes_list = [10, 15, 20]
topics_list = [5, 10, 20, 30, 40, 50, 100]

train_and_save_lda(
    corpus=corpus,
    dictionary=dictionary,
    passes_list=passes_list,
    topics_list=topics_list,
)

Training LDA model with 5 topics and 10 passes...
Saved: topics_no_herrgott_filt\lda_topics_5topics_10passes.txt
Saved model: topics_no_herrgott_filt\lda_topics_5topics_10passes.model
Training LDA model with 10 topics and 10 passes...
Saved: topics_no_herrgott_filt\lda_topics_10topics_10passes.txt
Saved model: topics_no_herrgott_filt\lda_topics_10topics_10passes.model
Training LDA model with 20 topics and 10 passes...
Saved: topics_no_herrgott_filt\lda_topics_20topics_10passes.txt
Saved model: topics_no_herrgott_filt\lda_topics_20topics_10passes.model
Training LDA model with 30 topics and 10 passes...
Saved: topics_no_herrgott_filt\lda_topics_30topics_10passes.txt
Saved model: topics_no_herrgott_filt\lda_topics_30topics_10passes.model
Training LDA model with 40 topics and 10 passes...
Saved: topics_no_herrgott_filt\lda_topics_40topics_10passes.txt
Saved model: topics_no_herrgott_filt\lda_topics_40topics_10passes.model
Training LDA model with 50 topics and 10 passes...
Saved: topics_no_