In [1]:
import spacy
import os
import pandas as pd
import re

In [2]:
# Load the spaCy pipeline "en_core_web_md". Later cells take the default
# stop-word list from it and use it to lemmatise and POS-tag the titles.
nlp = spacy.load("en_core_web_md")

There's a lot of boilerplate to filter out of the raw files; see [this example document](http://localhost:8889/tree/Documents/Uni/DH/Data%20Science/Projekte/Topic%20Modelling/corpus_base/16338845__IM-PRESS__20050826-IPR-01421__EN.txt).

In [None]:
# Directory (relative to the notebook) that holds the raw .txt documents
folder_path = 'corpus_base'

# One accumulator per output column; each gets one entry per processed file
titles, categories, contents, filenames = [], [], [], []

# Regular expressions describing boilerplate found in the document bodies
date_pattern = r'\d{4}-\d{2}-\d{2} - \d{2}:\d{2}'   # timestamp, e.g. "2005-08-26 - 12:30"
dates_pattern = r'\d{2}\.\d{2}\.\d{2}'              # dotted date, e.g. "26.08.05" (defined but not applied in this cell)
phone_pattern = r'\(\+\d*\)\s\d*\s\d*\s\d*'         # phone number, e.g. "(+32) 2 28 44264"
mark_pattern = r'[A-Z]{4,}'                         # runs of four or more capital letters
ref_pattern = r'\d+\w+\d+'                          # digit-delimited reference codes
mail_pattern = r'.*\@.*\.eu'                        # everything on a line up to and including a ".eu" e-mail address

# Read every .txt file in the folder, split it into title / category / body,
# strip boilerplate from the body and append the result to the column lists.

def _is_boilerplate(line):
    """Return True for metadata lines: those starting with a digit, 'LINK' or 'EN'.

    str.startswith() only compares literal prefixes, so the "starts with a
    digit" test has to be a regular-expression match.
    """
    return re.match(r'\d', line) is not None or line.startswith(('LINK', 'EN'))


# Substitutions applied, in this order, to every body line that is kept.
_noise_patterns = [re.compile(pattern) for pattern in (
    r'\/\S*',       # slash-prefixed fragments such as paths and URL tails
    date_pattern,
    r'\S*\.jpg',    # image file names (a glob-style '*.jpg' is not a valid way to say this in a regex)
    phone_pattern,
    mark_pattern,
    ref_pattern,
    mail_pattern,
)]

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# stable, which positional look-ups such as df.iloc[n] rely on.
for file_name in sorted(os.listdir(folder_path)):
    # Only plain-text documents belong to the corpus
    if not file_name.endswith('.txt'):
        continue
    file_path = os.path.join(folder_path, file_name)

    # Read the file; the handle is released before any processing starts
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # The first two lines carry the title and the category
    if len(lines) < 2:
        print(f"Skipping {file_name}: expected at least a title line and a category line.")
        continue
    title = lines[0].strip()
    category = lines[1].strip()

    # Drop metadata lines BEFORE substituting: mark_pattern would otherwise
    # delete the 'LINK' prefix and make those lines unrecognisable.
    content_lines = [line for line in lines[2:] if not _is_boilerplate(line)]

    # Remove the remaining inline noise
    for pattern in _noise_patterns:
        content_lines = [pattern.sub('', line) for line in content_lines]

    # Join the remaining lines to form the content
    content = ' '.join(content_lines).strip()

    # Append data to lists (always all four, so the columns stay aligned)
    titles.append(title)
    categories.append(category)
    contents.append(content)
    filenames.append(file_name)

# Assemble the per-file lists into one table, one row per document
data = dict(
    title=titles,
    category=categories,
    content=contents,
    filename=filenames,
)

df = pd.DataFrame(data)
df

In [None]:
# Tokens to discard in addition to the pipeline's default stop words:
# punctuation first, then corpus-specific artefacts (markers, file extensions).
punctuation_tokens = ['%', '.', "?", "!", ",", "(", ")", ":", ";", "\"", "'", "=", "-"]
artefact_tokens = ["--en", "http", "v0", "xml", "im-", "doc", "avi", 'XML', 'DOC', 'AVI', 'IM-', '.jpg']
stopwords = [*nlp.Defaults.stop_words, *punctuation_tokens, *artefact_tokens]

# Part-of-speech tags whose tokens are discarded during tokenisation
stoptags = [
    'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ',
    'NUM', 'PART', 'PRON', 'PUNCT', 'SCONJ', 'SYM', 'SPACE',
]

In [None]:
# Display the combined stop-word list as a visual sanity check
stopwords

In [None]:
# Tokenise the document titles into lower-cased lemmas, dropping whitespace
# tokens, stop words and tokens with an unwanted part-of-speech tag.
# NOTE(review): only `title` is tokenised; the cleaned `content` column is not
# used here. Confirm that this is intended.

# Sets give constant-time membership tests; the list versions were scanned
# once per token.
stopword_set = set(stopwords)
stoptag_set = set(stoptags)


def _content_lemmas(doc):
    """Return the lower-cased lemmas of the tokens in `doc` that pass the filters.

    A token is kept when its text is not blank, its lower-cased text is not a
    stop word, and its part-of-speech tag is not in the stop-tag list.
    """
    return [
        tok.lemma_.lower()
        for tok in doc
        if tok.text.strip() != ""
        and tok.text.lower() not in stopword_set
        and tok.pos_ not in stoptag_set
    ]


# nlp.pipe processes the texts as a batched stream instead of invoking the
# pipeline once per row; the parser and NER components are not needed.
title_docs = nlp.pipe(df["title"], disable=["parser", "ner"])
df["tokens"] = [_content_lemmas(doc) for doc in title_docs]

In [None]:
# Spot-check the tokens of one document (row position 356; this raises
# IndexError if the corpus has fewer than 357 rows)
df.iloc[356].tokens

In [None]:
from gensim.corpora.dictionary import Dictionary

# Build the token <-> id mapping from the tokenised titles, then prune the
# vocabulary by document frequency (no_below=3, no_above=0.5).
dictionary = Dictionary(df['tokens'])
dictionary.filter_extremes(no_below=3, no_above=0.5)

# Print the vocabulary size, followed by the remaining (id, token) pairs
l = [*dictionary.items()]
print(len(l), l, sep='\n')

In [None]:
corpus = [dictionary.doc2bow(a) for a in df['tokens']]

In [None]:
from gensim.models import LdaMulticore

In [None]:
# Train one LDA model per candidate topic count (1 .. max_topics).
# models[k - 1] is the model fitted with k topics.
max_topics = 50
models = []

# A fixed seed keeps re-runs comparable (as far as the multi-process trainer
# allows); without it every run yields different topics.
lda_seed = 42
# Use at most 12 workers, but never more than the machine has spare cores.
lda_workers = max(1, min(12, (os.cpu_count() or 1) - 1))

for i in range(max_topics):
    print("Training LDA with " + str(i+1) + " topics.")

    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        iterations=100,
        num_topics=i+1,
        workers=lda_workers,
        random_state=lda_seed,
    )
    models.append(lda_model)

print("Done.")

In [None]:
from gensim.models import CoherenceModel

# Score every trained model under four coherence measures. `scores` ends up
# with one column per measure and one row per model (row i <-> i + 1 topics).
coherence_measures = ['c_uci', 'u_mass', 'c_v', 'c_npmi']
scores = pd.DataFrame(columns=coherence_measures)
for measure in coherence_measures:
    scores_temp = []
    for i, model in enumerate(models):
        print(f"Computing {measure} for the LDA model with {i+1} topics.")
        cm = CoherenceModel(model=model, corpus=corpus, texts = df["tokens"], dictionary=dictionary, coherence=measure)
        score = cm.get_coherence()
        scores_temp.append(score)
    # Store the whole column at once, after all models have been scored
    scores[measure] = scores_temp
    print(f"Done with {measure}")
print("Done.")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# x-axis values for the coherence plots: the topic counts 1 .. max_topics
num_topics = list(range(1, max_topics + 1))

In [None]:
def _plot_coherence(ax, measure, color, linestyle, label):
    """Draw one coherence curve on `ax`, mark the best model, return the top three scores.

    Parameters:
        ax: matplotlib axes to draw on.
        measure: column of `scores` to plot.
        color: colour of the curve.
        linestyle: line style of the vertical "best model" marker.
        label: short name used in the panel title.

    Returns:
        The three highest scores of the measure as a Series; its index holds
        the row positions in `scores`.

    Every measure is maximised, u_mass included: its values are negative and
    a value closer to zero means a more coherent model.
    """
    column = scores[measure]
    best = column.nlargest(3)
    ax.plot(num_topics, list(column), color=color)
    # Row i of `scores` belongs to the model with i + 1 topics
    ax.axvline(x=column.idxmax() + 1, color='r', linestyle=linestyle)
    # Show only the row indices; interpolating the Series itself would put
    # its multi-line repr into the title.
    ax.set_title(f'best {label} idx: {best.index.tolist()}')
    ax.set_xlabel('number of topics')
    ax.set_ylabel(measure)
    return best


# One panel per coherence measure
fig, axes = plt.subplots(nrows=2, ncols=2)

best_v = _plot_coherence(axes[0, 0], 'c_v', 'purple', 'solid', 'v')
best_npmi = _plot_coherence(axes[0, 1], 'c_npmi', 'green', 'dotted', 'npmi')
best_mass = _plot_coherence(axes[1, 0], 'u_mass', 'red', 'dashed', 'mass')
best_uci = _plot_coherence(axes[1, 1], 'c_uci', 'blue', 'dashdot', 'uci')

plt.tight_layout()
plt.show()

In [None]:
# models[24] is the model trained with 25 topics (models[i] was fitted with
# i + 1 topics). Print its topics with 6 words each; num_topics=max_topics
# requests up to max_topics of them.
models[24].print_topics(num_words=6, num_topics=max_topics)

In [None]:
# For every document in the corpus, print the topic distribution assigned by
# models[43] (the model trained with 44 topics), followed by a blank line.
selected_model = models[43]
for doc_id, doc in enumerate(corpus):
    # (topic_id, probability) pairs for this document
    topic_distribution = selected_model.get_document_topics(doc)

    report = [f"Document ID: {doc_id}"]
    report += [
        f"Topic ID: {topic_id}, Probability: {topic_prob}"
        for topic_id, topic_prob in topic_distribution
    ]
    print("\n".join(report))
    print()


In [None]:
# Show the title of the document at row position 3, e.g. to compare it with
# the topic distribution printed for document ID 3
df.iloc[3].title

In [None]:
import pyLDAvis.gensim_models
# Visualise inside a notebook
pyLDAvis.enable_notebook()

In [None]:
# Prepare the pyLDAvis data for the last model in the list
# (models[max_topics-1], i.e. the one trained with max_topics topics) and display it.
lda_display = pyLDAvis.gensim_models.prepare(models[max_topics-1], corpus, dictionary)
pyLDAvis.display(lda_display)