In [13]:
# For text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# For topic modeling
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd

# Download NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
df = pd.read_csv('npr.csv')
documents = df['Article'].tolist()

In [15]:
stop_words = set(stopwords.words('english')) # Create a set of English stopwords
lemmatizer = WordNetLemmatizer() # Initialize a WordNet lemmatizer
def preprocess_text(text):
    tokens = word_tokenize(text.lower()) # Tokenize the text into words and convert to lowercase
    tokens = [token for token in tokens if token.isalnum()] # Filter out non-alphanumeric tokens
    tokens = [token for token in tokens if token not in stop_words] # Remove stopwords from the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens] # Lemmatize each token
    return tokens # Return the preprocessed tokens
preprocessed_documents = [preprocess_text(doc) for doc in documents] # Preprocess each documentin the list
print(preprocessed_documents[0])

['washington', '2016', 'even', 'policy', 'bipartisan', 'politics', 'sense', 'year', 'show', 'little', 'sign', 'ending', 'president', 'obama', 'moved', 'sanction', 'russia', 'alleged', 'interference', 'election', 'concluded', 'republican', 'long', 'called', 'similar', 'severe', 'measure', 'could', 'scarcely', 'bring', 'approve', 'house', 'speaker', 'paul', 'ryan', 'called', 'obama', 'measure', 'appropriate', 'also', 'overdue', 'prime', 'example', 'administration', 'ineffective', 'foreign', 'policy', 'left', 'america', 'weaker', 'eye', 'gop', 'leader', 'sounded', 'much', 'theme', 'urging', 'president', 'obama', 'year', 'take', 'strong', 'action', 'deter', 'russia', 'worldwide', 'aggression', 'including', 'operation', 'wrote', 'devin', 'nunes', 'chairman', 'house', 'intelligence', 'committee', 'week', 'left', 'office', 'president', 'suddenly', 'decided', 'stronger', 'measure', 'indeed', 'appearing', 'cnn', 'frequent', 'obama', 'critic', 'trent', 'frank', 'called', 'much', 'tougher', 'acti

In [16]:
# Create a Gensim Dictionary object from the preprocessed documents
dictionary = corpora.Dictionary(preprocessed_documents)
# Filter out tokens that appear in less than 15 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=15, no_above=0.5)
# Convert each preprocessed document into a bag-of-words representation using the dictionary
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

In [17]:
# Run LDA
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15) # Train an LDA modelon the corpus with 2 topics using Gensim's LdaModel class

In [18]:
# empty list to store dominant topic labels for each document
article_labels = []
# iterate over each processed document
for i, doc in enumerate(preprocessed_documents):
# for each document, convert to bag-of-words representation
    bow = dictionary.doc2bow(doc)
    # get list of topic probabilities
    topics = lda_model.get_document_topics(bow)
    # determine topic with highest probability
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    # append to the list
    article_labels.append(dominant_topic)
# Create DataFrame
df_result = pd.DataFrame({"Article": documents, "Topic": article_labels})
# Print the DataFrame
print("Table with Articles and Topic:")
print(df_result)
print()

Table with Articles and Topic:
                                                 Article  Topic
0      In the Washington of 2016, even when the polic...      0
1        Donald Trump has used Twitter  —   his prefe...      0
2        Donald Trump is unabashedly praising Russian...      0
3      Updated at 2:50 p. m. ET, Russian President Vl...      0
4      From photography, illustration and video, to d...      1
...                                                  ...    ...
11987  The number of law enforcement officers shot an...      1
11988    Trump is busy these days with victory tours,...      0
11989  It’s always interesting for the Goats and Soda...      3
11990  The election of Donald Trump was a surprise to...      0
11991  Voters in the English city of Sunderland did s...      4

[11992 rows x 2 columns]



In [19]:
# Print top terms for each topic
for topic_id in range(lda_model.num_topics):
    print(f"Top terms for Topic #{topic_id}:")
    top_terms = lda_model.show_topic(topic_id, topn=10)
    print([term[0] for term in top_terms])
    print()

Top terms for Topic #0:
['trump', 'president', 'state', 'clinton', 'campaign', 'republican', 'law', 'obama', 'court', 'house']

Top terms for Topic #1:
['police', 'country', 'city', 'report', 'two', 'day', 'government', 'attack', 'world', 'war']

Top terms for Topic #2:
['know', 'think', 'thing', 'life', 'really', 'woman', 'story', 'show', 'book', 'u']

Top terms for Topic #3:
['health', 'school', 'study', 'student', 'child', 'care', 'university', 'program', 'patient', 'drug']

Top terms for Topic #4:
['percent', 'state', 'food', 'company', 'million', 'tax', 'job', 'voter', 'american', 'clinton']



In [20]:
# Print the top terms for each topic with weight
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

Top Terms for Each Topic:
Topic 0:
- "trump" (weight: 0.024)
- "president" (weight: 0.010)
- "state" (weight: 0.008)
- "clinton" (weight: 0.008)
- "campaign" (weight: 0.006)
- "republican" (weight: 0.006)
- "law" (weight: 0.005)
- "obama" (weight: 0.005)
- "court" (weight: 0.005)
- "house" (weight: 0.004)

Topic 1:
- "police" (weight: 0.006)
- "country" (weight: 0.005)
- "city" (weight: 0.005)
- "report" (weight: 0.005)
- "two" (weight: 0.004)
- "day" (weight: 0.004)
- "government" (weight: 0.004)
- "attack" (weight: 0.004)
- "world" (weight: 0.003)
- "war" (weight: 0.003)

Topic 2:
- "know" (weight: 0.005)
- "think" (weight: 0.005)
- "thing" (weight: 0.005)
- "life" (weight: 0.005)
- "really" (weight: 0.004)
- "woman" (weight: 0.004)
- "story" (weight: 0.004)
- "show" (weight: 0.003)
- "book" (weight: 0.003)
- "u" (weight: 0.003)

Topic 3:
- "health" (weight: 0.010)
- "school" (weight: 0.008)
- "study" (weight: 0.006)
- "student" (weight: 0.006)
- "child" (weight: 0.006)
- "care" (wei