In [1]:
import nltk
import gensim
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Sample text documents
documents = [
    "The economy is struggling with inflation and high interest rates.",
    "The government passed a new healthcare reform bill.",
    "Scientists discovered a new planet outside the solar system.",
    "The stock market saw a significant rise today.",
    "NASA plans a new mission to Mars next year."
]

# Tokenization & Stopword Removal
# Newer NLTK releases make word_tokenize load the "punkt_tab" resource instead
# of "punkt"; fetch both so this cell runs on old and new NLTK installs alike.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)
stop_words = set(stopwords.words("english"))


def clean_tokens(doc):
    """Return the lowercased, purely alphabetic tokens of `doc` that are not English stop words."""
    tokens = []
    for word in word_tokenize(doc):
        # Skip any token containing a non-letter character (punctuation, numbers, ...).
        if not word.isalpha():
            continue
        # Lowercase once and reuse it for both the stop-word test and the result.
        lowered = word.lower()
        if lowered not in stop_words:
            tokens.append(lowered)
    return tokens


processed_docs = [clean_tokens(doc) for doc in documents]

# Create a dictionary and corpus
# The dictionary maps each unique token to an integer id; doc2bow then turns
# every tokenized document into its bag-of-words representation.
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(text) for text in processed_docs]

# Train LDA Model
# LDA training is stochastic: pin random_state so a fresh "Restart & Run All"
# reproduces the same topics instead of a different result on every run.
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Display topics
# Each entry is a (topic_id, "weight*term + ...") pair limited to the top 5 terms.
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\navin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


(0, '0.053*"new" + 0.053*"planet" + 0.053*"outside" + 0.053*"discovered" + 0.053*"solar"')
(1, '0.074*"new" + 0.044*"saw" + 0.044*"rise" + 0.044*"today" + 0.044*"stock"')


In [2]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF document-term matrix from the raw documents
# (English stop words are removed by the vectorizer itself).
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(documents)

# Apply LSA: truncated SVD of the TF-IDF matrix down to 2 components
lsa = TruncatedSVD(n_components=2, random_state=42)
lsa_matrix = lsa.fit_transform(X)

# Print topics: the 5 highest-weighted terms of each component
terms = vectorizer.get_feature_names_out()
for topic_idx, weights in enumerate(lsa.components_):
    # argsort is ascending, so the strongest term ends up last in the list.
    top_term_ids = weights.argsort()[-5:]
    top_terms = [terms[term_id] for term_id in top_term_ids]
    print(f"Topic {topic_idx}: ", top_terms)


Topic 0:  ['government', 'reform', 'passed', 'healthcare', 'new']
Topic 1:  ['economy', 'rates', 'inflation', 'struggling', 'high']
