In [None]:
## Unsupervised Medical Topic Discovery: Exploration & Visualization

# --- Imports and Setup ---
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA


In [None]:
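# Import the custom utility functions.
# The relative path below is resolved against the kernel's working directory, so this
# only works when the notebook runs from a folder one level below the project root
# (e.g. 'notebooks/'). Otherwise, add the project's 'src/' directory to your path yourself.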
import sys
sys.path.append('../src') 
from utils import preprocess_text, display_topics

In [None]:
# --- Configuration ---
# Tunable settings used by the rest of the notebook.
N_TOPICS = 4       # number of LDA topics (K) to fit
N_TOP_WORDS = 8    # number of top words passed to display_topics for each topic
FILE_PATH = '../data/medical_corpus.txt'  # corpus file, read line by line (one document per line)

# Download NLTK resources (needed for stopwords and lemmatization).
# nltk.download() normally signals a failed download by returning False rather
# than by raising, so the return values are checked before reporting success.
print("Downloading NLTK data (if not already present)...")
try:
    failed_resources = [
        resource
        for resource in ('stopwords', 'wordnet')
        if not nltk.download(resource)
    ]
except Exception as e:
    print(f"Error downloading NLTK data: {e}. Please check your internet connection.")
else:
    if failed_resources:
        print(
            f"Error downloading NLTK data: could not fetch {', '.join(failed_resources)}. "
            "Please check your internet connection."
        )
    else:
        print("NLTK downloads complete.")

In [None]:


# # 1. Data Acquisition and Inspection
# Reads the corpus file line by line; every non-blank line is one document.
print(f"\nLoading data from: {FILE_PATH}")
try:
    # Explicit encoding so the result does not depend on the platform's locale
    # default. NOTE(review): assumes the corpus is UTF-8 encoded; adjust if not.
    with open(FILE_PATH, 'r', encoding='utf-8') as f:
        # Skip blank/whitespace-only lines so they do not become empty documents.
        corpus = [line for line in f.read().splitlines() if line.strip()]
except FileNotFoundError:
    print(f"ERROR: Data file not found at {FILE_PATH}. Please check the path.")
    # Re-raise: every later step needs `df`, and continuing would either fail
    # with a confusing NameError or silently reuse a stale `df` left in the kernel.
    raise

print(f"Loaded {len(corpus)} documents.")

# Convert to DataFrame for easier inspection
df = pd.DataFrame({'document': corpus})
display(df.head())

# # 2. Preprocessing
# Each raw document is passed through the project's `preprocess_text` helper;
# the result is stored next to the original text so both can be compared.
print("\nApplying preprocessing (cleaning, stop word removal, lemmatization)...")
cleaned_documents = df['document'].apply(preprocess_text)
df['cleaned_document'] = cleaned_documents
display(df.head())


In [None]:


# # 3. Feature Engineering (Count Vectorization)
print("\nCreating the Document-Term Matrix (DTM) with CountVectorizer...")

# Vocabulary pruning thresholds -- tweak these to refine the feature set:
#   max_df=0.95 ignores terms appearing in more than 95% of the documents,
#   min_df=2    ignores terms appearing in fewer than 2 documents.
vectorizer_params = {'max_df': 0.95, 'min_df': 2}
vectorizer = CountVectorizer(**vectorizer_params)

# Learn the vocabulary and build the document-term count matrix in one pass
data_vectorized = vectorizer.fit_transform(df['cleaned_document'])
feature_names = vectorizer.get_feature_names_out()

dtm_shape = data_vectorized.shape
print(f"DTM Shape: {dtm_shape} (Documents x Features/Words)")


# # 4. Topic Modeling (LDA)
print(f"\nTraining Latent Dirichlet Allocation (LDA) model with K={N_TOPICS} topics...")

# Estimator settings, gathered in one place for readability
lda_params = {
    'n_components': N_TOPICS,      # one component per topic
    'max_iter': 10,                # 10 passes -- the same value as scikit-learn's default
    'learning_method': 'online',
    'random_state': 42,            # fixed seed so repeated runs give the same topics
    'n_jobs': -1,                  # use all processors
}

# fit() returns the fitted estimator itself, so construction and training chain
lda = LatentDirichletAllocation(**lda_params).fit(data_vectorized)

print("Model training complete.")

In [None]:


# Interpretation: show the top N_TOP_WORDS words of every discovered topic
# (rendering is delegated to the project's `display_topics` helper)
display_topics(lda, feature_names, N_TOP_WORDS)


# # 5. Visualization using Dimensionality Reduction (PCA)

# 5.1 Document-Topic distribution: one row per document, one column per topic.
# Each document is labelled with the index of its highest-weighted topic.
doc_topic_distribution = lda.transform(data_vectorized)
df['main_topic'] = doc_topic_distribution.argmax(axis=1)

# 5.2 Project the *Document-Term Matrix* (not the topic weights) onto its first
# two principal components. PCA is used for simplicity in this exploration step;
# the sparse DTM is converted to a dense array before being handed to PCA.
print("\nReducing DTM dimensions using PCA for visualization...")
pca = PCA(n_components=2, random_state=42)
data_pca = pca.fit_transform(
    data_vectorized.toarray()
)


In [None]:





# Assemble the plotting table: the two PCA coordinates of every document plus
# its dominant-topic label, sharing the index of `df`.
plot_df = pd.DataFrame(data_pca, columns=['PC1', 'PC2'], index=df.index)
plot_df['Topic'] = df['main_topic']

# 5.3 Plotting the Topics
# Scatter of the documents in PCA space, coloured by their dominant LDA topic.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=plot_df,
    x='PC1',
    y='PC2',
    hue='Topic',
    palette='Spectral',
    s=100,
    alpha=0.7,
    ax=ax,
)

# Share of the DTM variance captured by each plotted component, in percent
pc1_share = pca.explained_variance_ratio_[0] * 100
pc2_share = pca.explained_variance_ratio_[1] * 100

ax.set_title('Document Clustering based on LDA Topic Assignment (Reduced via PCA)')
ax.set_xlabel(f'Principal Component 1 ({pc1_share:.1f}%)')
ax.set_ylabel(f'Principal Component 2 ({pc2_share:.1f}%)')
ax.legend(title='Discovered Topic Index')
ax.grid(True, linestyle='--', alpha=0.6)
plt.show()

print("\nVisualization complete. Clear separation in the plot indicates strong topic discovery!")