In [1]:
import pandas as pd
import os
import re
import nltk
import gensim
import pyLDAvis.gensim_models as gensimvis
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel

# Import directory paths from secret config file
from config import text_directory

# Download NLTK resources (this can be done outside the functions, as a setup step)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Removes stopwords, lemmatizes, and cleans document text
def preprocess(document):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = re.sub(r'\W+', ' ', document.lower()).split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and token.isalpha() and len(token) > 3]
    return tokens

# Loads and preprocesses all documents from a given directory
def load_documents(directory):
    documents = []
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            documents.append(preprocess(file.read()))
    return documents

# Creates a dictionary and corpus from preprocessed documents for LDA analysis
def create_dictionary_corpus(documents):
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(text) for text in documents]
    return dictionary, corpus

# Builds and returns an LDA model from the corpus and dictionary
def apply_lda_model(corpus, dictionary, num_topics=4):
    lda_model = gensim.models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=10, workers=2)
    return lda_model

# Extracts and formats topics and their respective words and weights from the LDA model
def extract_topics(lda_model):
    def parse_topic_words(topic_str):
        word_weight_pairs = topic_str.split(' + ')
        parsed_pairs = [pair.split('*') for pair in word_weight_pairs]
        return [(float(weight.strip()), word.strip('"')) for weight, word in parsed_pairs]

    topic_data = []
    for idx, topic in lda_model.print_topics(-1):
        for weight, word in parse_topic_words(topic):
            topic_data.append([idx, word, weight])

    topic_df = pd.DataFrame(topic_data, columns=['Topic', 'Word', 'Weight'])
    return topic_df.sort_values(by=['Topic', 'Weight'], ascending=[True, False])

# Calculates and returns the coherence score of the LDA model
def calculate_coherence(lda_model, documents, dictionary):
    coherence_model_lda = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
    return coherence_model_lda.get_coherence()

# Prepares data for visualization of the LDA model
def prepare_visualization(lda_model, corpus, dictionary):
    return gensimvis.prepare(lda_model, corpus, dictionary)


# Load and preprocess documents
documents = load_documents(text_directory)
print(f"Loaded {len(documents)} documents.\n")

# Create dictionary and corpus
dictionary, corpus = create_dictionary_corpus(documents)
print("Dictionary and corpus created.\n")

# Apply LDA model
num_topics = 4
lda_model = apply_lda_model(corpus, dictionary, num_topics)
print(f"LDA model with {num_topics} topics applied.\n")

# Extract topics
topic_df = extract_topics(lda_model)
print(topic_df.info())
topic_df

Loaded 9 documents.

Dictionary and corpus created.

LDA model with 4 topics applied.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Topic   40 non-null     int64  
 1   Word    40 non-null     object 
 2   Weight  40 non-null     float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.1+ KB
None


Unnamed: 0,Topic,Word,Weight
0,0,course,0.02
1,0,student,0.019
2,0,data,0.016
3,0,class,0.015
4,0,http,0.011
5,0,policy,0.01
6,0,university,0.009
7,0,text,0.008
8,0,work,0.008
9,0,project,0.007


In [2]:
# Group by topic
groups = topic_df.groupby('Topic')
topic_dfs = {}

for topic, group in groups:
    topic_dfs[topic] = group

for t in topic_dfs.keys():
    print(f'{topic_dfs[t]}\n')

   Topic        Word  Weight
0      0      course   0.020
1      0     student   0.019
2      0        data   0.016
3      0       class   0.015
4      0        http   0.011
5      0      policy   0.010
6      0  university   0.009
7      0        text   0.008
8      0        work   0.008
9      0     project   0.007

    Topic        Word  Weight
10      1        text   0.002
11      1      mining   0.001
12      1     student   0.001
13      1      course   0.001
14      1       class   0.001
15      1        data   0.001
16      1        work   0.001
17      1        week   0.001
18      1        http   0.001
19      1  university   0.001

    Topic        Word  Weight
20      2        http   0.020
21      2      course   0.015
22      2    language   0.012
23      2  processing   0.011
24      2     student   0.010
25      2   gutenberg   0.010
26      2        work   0.009
27      2       class   0.009
28      2        text   0.009
29      2        file   0.009

    Topic        W

In [3]:
# Calculate coherence
coherence_score = calculate_coherence(lda_model, documents, dictionary)
print(f"Coherence score: {coherence_score}\n")

Coherence score: 0.31385289848248316



In [4]:
# Apply prepare_visualization function
prepare_visualization(lda_model, corpus, dictionary)

PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
3     -0.040804  0.064185       1        1  52.387989
0     -0.078861  0.002425       2        1  33.234427
2      0.002168 -0.092229       3        1  14.363213
1      0.117497  0.025619       4        1   0.014370, topic_info=            Term        Freq       Total Category  logprob  loglift
290         http  117.000000  117.000000  Default  30.0000  30.0000
583         text   72.000000   72.000000  Default  29.0000  29.0000
329     language   39.000000   39.000000  Default  28.0000  28.0000
453   processing   39.000000   39.000000  Default  27.0000  27.0000
155         data  336.000000  336.000000  Default  26.0000  26.0000
...          ...         ...         ...      ...      ...      ...
336     learning    0.001188   47.894081   Topic4  -7.5496  -1.7571
571     syllabus    0.001206   72.336923   Topic4  -7.5345  -2.1543
1151    exercise    0