In [None]:
# Minimal LDA exploration notebook (single-cell version)
# - fewer dependencies (no NLTK)
# - simple inline preprocessing
# - TruncatedSVD for 2D visualization of sparse DTM

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD


In [None]:

# -------- Configuration --------
# Number of topics: used as n_components for the LDA model and as the tick
# range of the colorbar in the final plot.
N_TOPICS = 4
# How many of the highest-weighted words display_topics prints per topic.
N_TOP_WORDS = 8
# Plain-text corpus, read line by line; every non-blank line is one document.
FILE_PATH = '../data/medical_corpus.txt'  # adjust if needed


In [None]:
# -------- Simple preprocessing (no external libs) --------
# Stop-word lookup set, built once from scikit-learn's English list.
STOP = set(ENGLISH_STOP_WORDS)


def preprocess_text(text):
    """Reduce a raw document to a space-separated string of content tokens.

    The input is coerced to ``str`` and lower-cased, every character that is
    not ``a``-``z`` or whitespace is replaced by a space (so digits,
    punctuation and non-ASCII letters act as separators), and the result is
    split on whitespace. Tokens of two characters or fewer, and tokens found
    in ``STOP``, are discarded.

    Args:
        text: The document to clean; any object accepted by ``str()``.

    Returns:
        The surviving tokens joined by single spaces (``''`` if none remain).
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)  # keep letters and spaces

    kept = []
    for word in letters_only.split():
        # Skip very short fragments and stop words.
        if len(word) <= 2 or word in STOP:
            continue
        kept.append(word)
    return ' '.join(kept)


In [None]:
# -------- Utility to show topics --------
def display_topics(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (one weight per vocabulary entry).
        feature_names: Sequence mapping a vocabulary index to its word.
        n_top_words: Number of words to list for each topic.

    Returns:
        None. One line per topic is written to stdout in the form
        ``Topic <index>: <word> <word> ...``, heaviest word first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending: flip it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_words = []
        for position in ranked:
            top_words.append(feature_names[position])
        print(f"Topic {topic_idx}: {' '.join(top_words)}")


In [None]:
# -------- Load data --------
# One document per non-blank line. The encoding is given explicitly so the
# corpus decodes the same way on every platform instead of depending on the
# locale's default codec.
# NOTE(review): assumes the corpus file is UTF-8 - change the codec here if it
# is stored in a different encoding.
with open(FILE_PATH, 'r', encoding='utf-8') as f:
    # Strip each line once and keep only the ones that still have content.
    corpus = [stripped for stripped in map(str.strip, f) if stripped]

print(f"Loaded {len(corpus)} documents.")
if not corpus:
    # Stop here with a clear message rather than failing later in the vectorizer.
    raise ValueError(f"No non-empty lines found in {FILE_PATH!r}")
df = pd.DataFrame({'document': corpus})

# -------- Preprocess --------
# 'clean' holds the lower-cased, stop-word-free version of each document.
df['clean'] = df['document'].apply(preprocess_text)

# -------- Vectorize (DTM) --------
# max_df=0.95 drops terms present in more than 95% of documents;
# min_df=2 drops terms present in fewer than two documents.
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
dtm = vectorizer.fit_transform(df['clean'])
feature_names = vectorizer.get_feature_names_out()
print(f"DTM shape: {dtm.shape}")


In [None]:
# -------- LDA --------
# Fit a topic model with N_TOPICS components on the document-term matrix.
# random_state is fixed so repeated runs use the same seed.
lda = LatentDirichletAllocation(
    n_components=N_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=42,
)
lda.fit(dtm)

print("LDA trained. Top words per topic:")
display_topics(lda, feature_names, N_TOP_WORDS)

# -------- Document-topic assignments --------
# Each row of doc_topic holds one document's topic weights; the column with
# the largest weight is recorded as that document's main topic.
doc_topic = lda.transform(dtm)
df['main_topic'] = doc_topic.argmax(axis=1)

In [None]:
# -------- 2D visualization (TruncatedSVD for sparse matrices) --------
# Project the sparse document-term matrix onto two components for plotting.
svd = TruncatedSVD(n_components=2, random_state=42)
dtm_2d = svd.fit_transform(dtm)

fig, ax = plt.subplots(figsize=(8, 6))
sc = ax.scatter(
    dtm_2d[:, 0],
    dtm_2d[:, 1],
    c=df['main_topic'],
    cmap='tab10',
    # Pin the colour scale to the configured topic range. Without this the
    # scale is fitted to whichever topics happen to be present, so a topic
    # with no documents shifts every colour and leaves colorbar ticks outside
    # the plotted range.
    vmin=0,
    vmax=max(N_TOPICS - 1, 1),  # max(...) keeps vmin < vmax when N_TOPICS == 1
    s=50,
    alpha=0.7,
)
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')
ax.set_title('Documents colored by LDA main topic (TruncatedSVD 2D)')
fig.colorbar(sc, ax=ax, ticks=range(N_TOPICS), label='Topic index')
ax.grid(alpha=0.3)
plt.show()
print("Done.")