In [50]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd

In [52]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [56]:

df = pd.read_csv('npr.csv')  
documents = df['Article'].tolist()



In [59]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalnum()]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

preprocessed_documents = [preprocess(doc) for doc in documents]

In [60]:
# Create Document-Term Matrix
dictionary = corpora.Dictionary(preprocessed_documents)
dictionary.filter_extremes(no_below=15, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]


In [63]:
# Run LDA Model
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)


In [66]:
# Assign Topics to Each Document
article_labels = []
for doc in preprocessed_documents:
    bow = dictionary.doc2bow(doc)
    topics = lda_model.get_document_topics(bow)
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    article_labels.append(dominant_topic)

In [68]:

# Create Result DataFrame
df_result = pd.DataFrame({"Article": documents, "Topic": article_labels})


In [70]:
# Display Articles with Assigned Topics
print("Table with Articles and Assigned Topics:")
print(df_result)

Table with Articles and Assigned Topics:
                                                 Article  Topic
0      In the Washington of 2016, even when the polic...      4
1        Donald Trump has used Twitter  —   his prefe...      0
2        Donald Trump is unabashedly praising Russian...      4
3      Updated at 2:50 p. m. ET, Russian President Vl...      0
4      From photography, illustration and video, to d...      2
...                                                  ...    ...
11987  The number of law enforcement officers shot an...      0
11988    Trump is busy these days with victory tours,...      4
11989  It’s always interesting for the Goats and Soda...      3
11990  The election of Donald Trump was a surprise to...      4
11991  Voters in the English city of Sunderland did s...      3

[11992 rows x 2 columns]


In [72]:
# Show Top Terms for Each Topic
print("\nTop Terms for Each Topic:")
for topic_id in range(lda_model.num_topics):
    print(f"Topic #{topic_id}:")
    top_terms = lda_model.show_topic(topic_id, topn=10)
    print([term[0] for term in top_terms])
    print()


Top Terms for Each Topic:
Topic #0:
['police', 'report', 'state', 'government', 'country', 'court', 'law', 'told', 'attack', 'official']

Topic #1:
['health', 'school', 'child', 'student', 'study', 'care', 'percent', 'woman', 'state', 'program']

Topic #2:
['know', 'think', 'thing', 'life', 'really', 'woman', 'story', 'show', 'world', 'book']

Topic #3:
['food', 'water', 'company', 'world', 'country', 'million', 'city', '000', 'area', 'percent']

Topic #4:
['trump', 'clinton', 'president', 'state', 'republican', 'campaign', 'election', 'obama', 'vote', 'house']



In [76]:

print("Top Terms with Weights:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

Top Terms with Weights:
Topic 0:
- "police" (weight: 0.007)
- "report" (weight: 0.006)
- "state" (weight: 0.005)
- "government" (weight: 0.005)
- "country" (weight: 0.005)
- "court" (weight: 0.004)
- "law" (weight: 0.004)
- "told" (weight: 0.004)
- "attack" (weight: 0.004)
- "official" (weight: 0.004)

Topic 1:
- "health" (weight: 0.010)
- "school" (weight: 0.008)
- "child" (weight: 0.006)
- "student" (weight: 0.006)
- "study" (weight: 0.006)
- "care" (weight: 0.006)
- "percent" (weight: 0.005)
- "woman" (weight: 0.004)
- "state" (weight: 0.004)
- "program" (weight: 0.004)

Topic 2:
- "know" (weight: 0.005)
- "think" (weight: 0.005)
- "thing" (weight: 0.005)
- "life" (weight: 0.005)
- "really" (weight: 0.004)
- "woman" (weight: 0.004)
- "story" (weight: 0.004)
- "show" (weight: 0.003)
- "world" (weight: 0.003)
- "book" (weight: 0.003)

Topic 3:
- "food" (weight: 0.007)
- "water" (weight: 0.005)
- "company" (weight: 0.005)
- "world" (weight: 0.004)
- "country" (weight: 0.003)
- "million