In [None]:
# 1_Topic_Modeling_Enhanced.ipynb

import pandas as pd
import re
import string
import spacy
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import matplotlib.pyplot as plt

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

true_df = pd.read_csv('/Users/maryamayman/fake_news_project/data/raw/True.csv')
fake_df = pd.read_csv('/Users/maryamayman/fake_news_project/data/raw/True.csv')
true_df['label'] = 'true'
fake_df['label'] = 'fake'
data = pd.concat([true_df, fake_df], ignore_index=True)
data['article_id'] = data.index

stop_words = spacy.lang.en.stop_words.STOP_WORDS

def preprocess_and_lemmatize(text):
    text = text.lower()
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'\d+', '', text)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and len(token.text) > 3]
    return tokens

data['processed_text'] = data['text'].apply(preprocess_and_lemmatize)

documents = data['processed_text'].tolist()
dictionary = Dictionary(documents)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# For speed, we will set num_topics manually. You can uncomment the coherence code to run it later.
optimal_num_topics = 7
print(f"Using {optimal_num_topics} topics.")

lda_model = LdaModel(corpus=corpus, num_topics=optimal_num_topics, id2word=dictionary, random_state=42, passes=10)



Using 7 topics.


TypeError: DataFrame.drop() takes from 1 to 2 positional arguments but 3 were given

In [4]:
# --- Simple Text Report ---

print("--- Topic Modeling Report ---")
print("Top words for each topic:\n")

# Use the built-in function to get the topics
topics = lda_model.print_topics(num_words=10)

# Print each topic neatly
for topic in topics:
    print(topic)
    print("-" * 50) # Separator line

--- Topic Modeling Report ---
Top words for each topic:

(0, '0.020*"trump" + 0.015*"russia" + 0.014*"russian" + 0.009*"official" + 0.009*"report" + 0.008*"investigation" + 0.008*"moscow" + 0.008*"house" + 0.007*"committee" + 0.007*"campaign"')
--------------------------------------------------
(1, '0.020*"party" + 0.013*"minister" + 0.012*"government" + 0.010*"european" + 0.009*"election" + 0.008*"britain" + 0.007*"union" + 0.007*"parliament" + 0.007*"leader" + 0.007*"talk"')
--------------------------------------------------
(2, '0.029*"trump" + 0.013*"election" + 0.011*"clinton" + 0.011*"republican" + 0.011*"campaign" + 0.010*"party" + 0.010*"presidential" + 0.009*"candidate" + 0.007*"state" + 0.007*"vote"')
--------------------------------------------------
(3, '0.009*"government" + 0.009*"state" + 0.008*"people" + 0.008*"force" + 0.007*"group" + 0.006*"police" + 0.006*"attack" + 0.006*"military" + 0.006*"country" + 0.006*"kill"')
--------------------------------------------------


In [5]:
# Create a DataFrame with the dominant topic and terms for each article
data['dominant_topic'] = [lda_model.get_document_topics(doc_bow, minimum_probability=0.0)[0][0] for doc_bow in corpus]
data['topic_terms'] = data['dominant_topic'].apply(lambda topic_id: ', '.join([word for word, _ in lda_model.show_topic(topic_id, 10)]))
topics_df = data[['article_id', 'dominant_topic', 'topic_terms']]

# --- Displaying the DataFrame Head ---
print("### Head of the Topics DataFrame ###")
print(topics_df.head())
print("\n" + "="*50 + "\n")


# --- Simple Text Report of Topics ---
print("### List of All Topics and Their Top Words ###\n")

# Use the built-in function to get the topics
topics = lda_model.print_topics(num_words=10)

# Print each topic neatly
for topic in topics:
    print(topic)
    print("-" * 50) # Separator line

### Head of the Topics DataFrame ###
   article_id  dominant_topic   
0           0               0  \
1           1               0   
2           2               0   
3           3               0   
4           4               0   

                                         topic_terms  
0  trump, russia, russian, official, report, inve...  
1  trump, russia, russian, official, report, inve...  
2  trump, russia, russian, official, report, inve...  
3  trump, russia, russian, official, report, inve...  
4  trump, russia, russian, official, report, inve...  


### List of All Topics and Their Top Words ###

(0, '0.020*"trump" + 0.015*"russia" + 0.014*"russian" + 0.009*"official" + 0.009*"report" + 0.008*"investigation" + 0.008*"moscow" + 0.008*"house" + 0.007*"committee" + 0.007*"campaign"')
--------------------------------------------------
(1, '0.020*"party" + 0.013*"minister" + 0.012*"government" + 0.010*"european" + 0.009*"election" + 0.008*"britain" + 0.007*"union" + 0.007*"parli

In [6]:
data.head()

  return method()
  return method()


Unnamed: 0,title,text,subject,date,label,article_id,processed_text,dominant_topic,topic_terms
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True,0,"[washington, reuter, head, conservative, repub...",0,"trump, russia, russian, official, report, inve..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True,1,"[washington, reuters, transgender, people, all...",0,"trump, russia, russian, official, report, inve..."
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True,2,"[washington, reuter, special, counsel, investi...",0,"trump, russia, russian, official, report, inve..."
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True,3,"[washington, reuters, trump, campaign, adviser...",0,"trump, russia, russian, official, report, inve..."
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True,4,"[seattlewashington, reuters, president, donald...",0,"trump, russia, russian, official, report, inve..."


In [9]:
print("### Basic Dataset Information ###\n")

# 1. Shape of the dataset
print(f"Total articles (rows): {data.shape[0]}")
print(f"Total columns: {data.shape[1]}\n")

# 2. Check for missing values
print("--- Missing Values ---")
print(data.isnull().sum())
print("\n" + "="*30 + "\n")

# 3. Distribution of true vs. fake news
print("--- Article Distribution ---")
print(data['label'].value_counts())
print("\n" + "="*30 + "\n")

# 4. Show a preview of the combined data
print("--- Data Preview ---")
print(data.head())
print("\n" + "="*30 + "\n")

### Basic Dataset Information ###

Total articles (rows): 42834
Total columns: 9

--- Missing Values ---
title             0
text              0
subject           0
date              0
label             0
article_id        0
processed_text    0
dominant_topic    0
topic_terms       0
dtype: int64


--- Article Distribution ---
label
true    21417
fake    21417
Name: count, dtype: int64


--- Data Preview ---
                                               title   
0  As U.S. budget fight looms, Republicans flip t...  \
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject   
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews  \
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special cou

In [10]:
# --- Simple Text Report ---

print("--- Topic Modeling Report ---")
print("Displaying all topics and their top words:\n")

# Use the built-in function to get all topics
topics = lda_model.print_topics(num_topics=optimal_num_topics, num_words=10)

# Print each topic neatly
for topic in topics:
    print(topic)
    print("-" * 50) # Separator line

--- Topic Modeling Report ---
Displaying all topics and their top words:

(0, '0.020*"trump" + 0.015*"russia" + 0.014*"russian" + 0.009*"official" + 0.009*"report" + 0.008*"investigation" + 0.008*"moscow" + 0.008*"house" + 0.007*"committee" + 0.007*"campaign"')
--------------------------------------------------
(1, '0.020*"party" + 0.013*"minister" + 0.012*"government" + 0.010*"european" + 0.009*"election" + 0.008*"britain" + 0.007*"union" + 0.007*"parliament" + 0.007*"leader" + 0.007*"talk"')
--------------------------------------------------
(2, '0.029*"trump" + 0.013*"election" + 0.011*"clinton" + 0.011*"republican" + 0.011*"campaign" + 0.010*"party" + 0.010*"presidential" + 0.009*"candidate" + 0.007*"state" + 0.007*"vote"')
--------------------------------------------------
(3, '0.009*"government" + 0.009*"state" + 0.008*"people" + 0.008*"force" + 0.007*"group" + 0.006*"police" + 0.006*"attack" + 0.006*"military" + 0.006*"country" + 0.006*"kill"')
----------------------------------

In [11]:
# --- Clean and Readable Report ---

print("--- Topic Modeling Report ---")
print("Displaying all topics and their top 10 words:\n")

# Get the topics and their top words
topics = lda_model.show_topics(num_topics=optimal_num_topics, num_words=10, formatted=False)

# Print each topic neatly
for topic_num, topic_words in topics:
    # Get just the words
    words = [word for word, _ in topic_words]
    print(f"Topic {topic_num}: {', '.join(words)}")
    print("-" * 50)

--- Topic Modeling Report ---
Displaying all topics and their top 10 words:

Topic 0: trump, russia, russian, official, report, investigation, moscow, house, committee, campaign
--------------------------------------------------
Topic 1: party, minister, government, european, election, britain, union, parliament, leader, talk
--------------------------------------------------
Topic 2: trump, election, clinton, republican, campaign, party, presidential, candidate, state, vote
--------------------------------------------------
Topic 3: government, state, people, force, group, police, attack, military, country, kill
--------------------------------------------------
Topic 4: north, korea, united, china, trump, states, nuclear, iran, missile, south
--------------------------------------------------
Topic 5: court, house, bill, senate, trump, republican, state, congress, plan, vote
--------------------------------------------------
Topic 6: year, million, company, city, people, billion, cas

Topic 0: Trump-Russia Investigation 


Topic 1: European Politics & Brexit 


Topic 2: US Elections 


Topic 3: Military Conflict & Police Action


Topic 4: International Relations (US, North Korea, China)


Topic 5: US Government & Legislation


Topic 6: Business & Finance



In [13]:
import pandas as pd

print("--- Topic Distribution in True vs. Fake News ---")
print("This table shows how many articles from each category were assigned to each topic.\n")

# Create a crosstab to get the exact counts of topics for each label
topic_counts = pd.crosstab(data['dominant_topic'], data['label'])

# Print the resulting table
print(topic_counts)

print("\n" + "="*40)
print("\n--- Topic Key ---")
# Print the topics again so you can easily reference them
topics = lda_model.show_topics(num_topics=optimal_num_topics, num_words=5, formatted=False)
for topic_num, topic_words in topics:
    words = [word for word, _ in topic_words]
    print(f"Topic {topic_num}: {', '.join(words)}")

--- Topic Distribution in True vs. Fake News ---
This table shows how many articles from each category were assigned to each topic.



TypeError: unstack() takes from 2 to 3 positional arguments but 4 were given

In [15]:
import pandas as pd

# This assumes your 'data' DataFrame and 'lda_model' are already created.

print("--- Topic Distribution Report (True vs. Fake News) ---")
print("This table shows the keywords for each topic and the number of articles from each category.\n")

# 1. A more robust way to get the counts without using crosstab
# First, group by topic and label, then get the size of each group
topic_counts_series = data.groupby(['dominant_topic', 'label']).size()
# Then, unstack the series to create the 'true' and 'fake' columns
topic_counts = topic_counts_series.unstack(fill_value=0)

# 2. Get the keywords for each topic
topic_keywords = []
for i in range(num_topics):
    words = [word for word, _ in lda_model.show_topic(i, topn=10)]
    topic_keywords.append(", ".join(words))

# 3. Add the keywords to the topic counts table
topic_counts['topic_terms'] = topic_keywords

# 4. Add a total column and sort
topic_counts['total'] = topic_counts['true'] + topic_counts['fake']
topic_counts_sorted = topic_counts.sort_values(by='total', ascending=False)

# Print the final report
print(topic_counts_sorted[['topic_terms', 'true', 'fake', 'total']])

--- Topic Distribution Report (True vs. Fake News) ---
This table shows the keywords for each topic and the number of articles from each category.



TypeError: unstack() takes from 2 to 3 positional arguments but 4 were given

In [16]:
data.head()

  return method()
  return method()


Unnamed: 0,title,text,subject,date,label,article_id,processed_text,dominant_topic,topic_terms
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True,0,"[washington, reuter, head, conservative, repub...",0,"trump, russia, russian, official, report, inve..."
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True,1,"[washington, reuters, transgender, people, all...",0,"trump, russia, russian, official, report, inve..."
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True,2,"[washington, reuter, special, counsel, investi...",0,"trump, russia, russian, official, report, inve..."
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True,3,"[washington, reuters, trump, campaign, adviser...",0,"trump, russia, russian, official, report, inve..."
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True,4,"[seattlewashington, reuters, president, donald...",0,"trump, russia, russian, official, report, inve..."


In [19]:
import pandas as pd

# This assumes your 'data' DataFrame is already loaded.

print("--- Distribution of True vs. Fake News by Subject ---")
print("This table shows how many true and fake articles belong to each subject category.\n")

# Step 1: Group the data and get the counts. This part is working for you.
subject_distribution = data.groupby(['subject', 'label']).size()

# Step 2: Manually create the table to avoid using the broken unstack() function
report = {}
for (subject, label), count in subject_distribution.items():
    if subject not in report:
        report[subject] = {'true': 0, 'fake': 0}
    report[subject][label] = count

# Step 3: Convert the manually created report into a DataFrame
subject_table = pd.DataFrame.from_dict(report, orient='index')

# Step 4: Add a total and sort the table
subject_table['total'] = subject_table['true'] + subject_table['fake']
subject_table_sorted = subject_table.sort_values(by='total', ascending=False)

# Print the final report
print(subject_table_sorted)

--- Distribution of True vs. Fake News by Subject ---
This table shows how many true and fake articles belong to each subject category.

               true   fake  total
politicsNews  11272  11272  22544
worldnews     10145  10145  20290


In [20]:
df

  return method()


Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",true
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",true
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",true
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",true
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",true
...,...,...,...,...,...
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",fake
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",fake
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",fake
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",fake
