1. Import Libraries

In [17]:
# For text preprocessing
import nltk
from nltk.corpus import stopwords 
from nltk. tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

# For topic modeling
from gensim import corpora 
from gensim.models import LdaModel

# Download the NLTK resources (stopwords, punkt, wordnet), in that order
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marsya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Marsya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Marsya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


2. Load the Data

In [6]:
# Input corpus: five short headline strings, one document per list entry.
documents = [
    "Rafael Nadal Joins Roger Federer in Missing U.S. Open",
    "Rafael Nadal Is Out of the Australian Open",
    "Biden Announces Virus Measures",
    "Biden's Virus Plans Meet Reality",
    "Where Biden's Virus Plan Stands"
]

3. Preprocess the Data

In [8]:
# WordNet lemmatizer instance reused by the preprocessing step
lemmatizer = WordNetLemmatizer()

# English stopwords from NLTK, collected into a set
stop_words = set(
    stopwords.words('english')
)

def preprocess_text(text):
    """Turn a raw text string into a list of cleaned tokens.

    The text is lowercased and tokenized into words. Tokens that are not
    alphanumeric, or that appear in ``stop_words``, are dropped, and each
    remaining token is lemmatized with ``lemmatizer``.

    Returns the resulting tokens as a list, in their original order.
    """
    words = word_tokenize(text.lower())
    return [
        lemmatizer.lemmatize(word)
        for word in words
        if word.isalnum() and word not in stop_words
    ]

# Run the preprocessing over every document, giving one token list per doc
preprocessed_documents = list(map(preprocess_text, documents))
preprocessed_documents

[['rafael', 'nadal', 'join', 'roger', 'federer', 'missing', 'open'],
 ['rafael', 'nadal', 'australian', 'open'],
 ['biden', 'announces', 'virus', 'measure'],
 ['biden', 'virus', 'plan', 'meet', 'reality'],
 ['biden', 'virus', 'plan', 'stand']]

4. Create a doc-term matrix

In [9]:
# Build a Gensim dictionary object from the preprocessed docs
dictionary = corpora.Dictionary(preprocessed_documents)
# Convert each preprocessed doc to its bag-of-words form via the dictionary
corpus = list(map(dictionary.doc2bow, preprocessed_documents))

5. Run LDA

In [13]:
# corpus: bag-of-words representation of the documents
# num_topics: number of topics to be extracted by the model
# id2word: dictionary mapping from word IDs to words
# passes: number of passes through the corpus during training
# random_state: fixed seed; without it, topic ids and weights can differ
#               from one run of the notebook to the next
# Train an LDA model on the corpus with 2 topics using Gensim's LdaModel class
lda_model = LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

6. Interpret Results

In [14]:
# Dominant topic label for each document, in the same order as the documents
article_labels = []

# iterate over each preprocessed document
for doc in preprocessed_documents:
    # convert the token list to its bag-of-words representation
    bow = dictionary.doc2bow(doc)
    # get the list of (topic_id, probability) pairs; minimum_probability=0.0
    # asks Gensim not to discard low-probability topics, so the list passed
    # to max() below is never empty
    topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
    # keep the id of the topic with the highest probability
    dominant_topic, _ = max(topics, key=lambda pair: pair[1])
    # append to the list
    article_labels.append(dominant_topic)

In [18]:
# Pair each original article with its dominant topic label
df = pd.DataFrame(dict(Article=documents, Topic=article_labels))

# Print the table, followed by one empty line
print("Table with Articles and Topic")
print(df, end="\n\n")

Table with Articles and Topic
                                             Article  Topic
0  Rafael Nadal Joins Roger Federer in Missing U....      1
1         Rafael Nadal Is Out of the Australian Open      1
2                     Biden Announces Virus Measures      0
3                   Biden's Virus Plans Meet Reality      0
4                    Where Biden's Virus Plan Stands      0



In [19]:
print("Top Terms for Each Topic:")
# formatted=False yields (topic_id, [(word, weight), ...]) pairs directly, so
# the 'weight*"word" + ...' display string does not have to be split on "+"
# and "*" (which would break for any vocabulary term containing either
# character). num_topics=-1 lists every topic of the model.
for idx, terms in lda_model.show_topics(num_topics=-1, num_words=10, formatted=False):
    print(f"Topic: {idx}:")

    for word, weight in terms:
        # same layout as the formatted string: quoted word, 3-decimal weight
        print(f'- "{word}" (weight: {float(weight):.3f})')
    print()

Top Terms for Each Topic:
Topic: 0:
- "virus" (weight: 0.166)
- "biden" (weight: 0.166)
- "plan" (weight: 0.119)
- "meet" (weight: 0.071)
- "reality" (weight: 0.071)
- "announces" (weight: 0.071)
- "measure" (weight: 0.071)
- "stand" (weight: 0.071)
- "australian" (weight: 0.024)
- "nadal" (weight: 0.024)

Topic: 1:
- "rafael" (weight: 0.131)
- "open" (weight: 0.131)
- "nadal" (weight: 0.131)
- "federer" (weight: 0.079)
- "missing" (weight: 0.079)
- "roger" (weight: 0.079)
- "join" (weight: 0.079)
- "australian" (weight: 0.079)
- "biden" (weight: 0.027)
- "virus" (weight: 0.027)

