In [1]:
import pandas as pd
import numpy as np
import pickle
import spacy
!pip install gensim
import gensim
import gensim.corpora as corpora
from gensim.models import LdaModel
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from collections import Counter
import re
from sklearn.feature_extraction.text import CountVectorizer



## Load the datasets from Google Drive

In [2]:
# Direct-download links for the two CSV files hosted on Google Drive.
fake_news_url = "https://drive.google.com/uc?id=1dxQ2RZEwDdWSTlLFei8Jo493xy6Edy4B&export=download"
true_news_url = "https://drive.google.com/uc?id=1pJmlowqJ-biXiFuZ3xE9orvB4YGmjNAz&export=download"

fake_news_df = pd.read_csv(fake_news_url)
true_news_df = pd.read_csv(true_news_url)

In [3]:
# Print a caption, then let the notebook render the first rows of the fake-news frame.
print("\nFake News preview:")
fake_news_df.head()


Fake News preview:


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
# Print a caption, then let the notebook render the first rows of the true-news frame.
print("\nTrue News preview:")
true_news_df.head()


True News preview:


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [None]:
# Count the missing values in each column of the fake-news frame.
print("\nNull values in the Fake News dataset:")
fake_null_counts = fake_news_df.isnull().sum()
print(fake_null_counts)

In [None]:
# Count the missing values in each column of the true-news frame.
print("\nNull values in the True News dataset:")
true_null_counts = true_news_df.isnull().sum()
print(true_null_counts)

# Use spaCy to build the function that cleans the dataset

In [5]:
# Tag every row with its class: 0 = fake news, 1 = true news.
for frame, label_value in ((fake_news_df, 0), (true_news_df, 1)):
    frame["label"] = label_value

Let's convert the text to lowercase, replace punctuation and other non-letter characters with a space, and collapse multiple spaces into one.

In [6]:
# Only lemmas and the is_alpha flag are used by preprocess_text, so the
# dependency parser and the named-entity recognizer are switched off: they are
# the slowest pipeline components and their output is never read.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
import nltk
# Fetch the NLTK stop-word lists used by preprocess_text (no-op if already present).
nltk.download('stopwords')

# Filled on first use so the stop-word list is built once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess_text(text):
    """Clean a text and return its lemmatized, stop-word-free tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is replaced by a space, runs of whitespace are collapsed, and
    the result is tokenized with the module-level spaCy pipeline ``nlp``.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example a NaN
            produced by an empty CSV cell) is treated as an empty document.

    Returns:
        list[str]: The lemma of every alphabetic token whose surface form is
        not an English stop word; an empty list for non-string input.
    """
    # Missing values arrive as float NaN / None and have no .lower().
    if not isinstance(text, str):
        return []
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Set membership replaces the per-token rebuild and linear scan of the list.
    stop_words = _english_stopwords()
    doc = nlp(text)
    # The stop-word test uses the surface form, while the lemma is what is kept.
    words = [token.lemma_ for token in doc if token.is_alpha and token.text not in stop_words]
    return words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Surface the recurring unigram/bigram patterns in fake-news headlines.
fake_news_df["processed_title"] = fake_news_df["title"].apply(preprocess_text)
fake_titles = [" ".join(tokens) for tokens in fake_news_df["processed_title"]]

# Restrict the vocabulary to 20 unigrams/bigrams.
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=20)
X_fake = vectorizer.fit_transform(fake_titles)

# Sum every n-gram column over all headlines to get its total count.
ngram_totals = np.asarray(X_fake.sum(axis=0)).flatten()
ngram_counts = dict(zip(vectorizer.get_feature_names_out(), ngram_totals))
sorted_ngrams = sorted(ngram_counts.items(), key=lambda pair: pair[1], reverse=True)
print("Most frequent patterns in fake news headlines:", sorted_ngrams)

Pattern più frequenti nei titoli delle fake news: [('trump', 9349), ('video', 8556), ('obama', 2594), ('hillary', 2322), ('watch', 1982), ('get', 1321), ('clinton', 1173), ('president', 1165), ('go', 1110), ('break', 1106), ('say', 1094), ('make', 1091), ('black', 977), ('tweet', 969), ('new', 915), ('white', 905), ('call', 890), ('news', 883), ('donald', 848), ('muslim', 808)]


The cell above cleans up the titles (removes stopwords and punctuation, lemmatizes),
vectorizes them with CountVectorizer to find unigrams and bigrams,
then sorts and displays the 20 most common patterns.

In [8]:
# Stack the two labelled frames into one dataset with a fresh 0..n-1 index.
df = pd.concat([fake_news_df, true_news_df], axis=0, ignore_index=True)

# Let's check the balance of classes.
label_counts = df["label"].value_counts()
print(label_counts)

print(df["label"].value_counts(normalize=True))  # % of true and false news



The dataset has balanced values

In [9]:
# Clean the article bodies and the headlines of the merged dataset.
df["processed_text"] = df["text"].map(preprocess_text)
df["processed_title"] = df["title"].map(preprocess_text)

In [10]:
# Render the first rows of the merged frame, including the new processed_* columns.
df.head()

Unnamed: 0,title,text,subject,date,label,processed_title,processed_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0,"[donald, trump, send, embarrass, new, year, ev...","[donald, trump, wish, americans, happy, new, y..."
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0,"[drunk, bragging, trump, staffer, start, russi...","[house, intelligence, committee, chairman, dev..."
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0,"[sheriff, david, clarke, become, internet, jok...","[friday, reveal, former, milwaukee, sheriff, d..."
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0,"[trump, obsessed, even, obama, name, code, web...","[christmas, day, donald, trump, announce, woul..."
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0,"[pope, francis, call, donald, trump, christmas...","[pope, francis, use, annual, christmas, day, m..."


In [11]:
# Analyze the recurring words across all news headlines (fake and true).
title_words = []
for title_tokens in df["processed_title"]:
    title_words.extend(title_tokens)
title_word_counts = Counter(title_words)
print("Most frequent words in headlines:", title_word_counts.most_common(10))

Parole più frequenti nei titoli: [('trump', 14916), ('video', 8596), ('u', 5251), ('say', 4335), ('obama', 3260), ('hillary', 2369), ('watch', 2018), ('house', 2011), ('clinton', 1832), ('new', 1791)]


In [34]:
from gensim.models import CoherenceModel

# Map every token to an integer id, then encode each article as a bag of words.
dictionary = corpora.Dictionary(df["processed_text"])
corpus = list(map(dictionary.doc2bow, df["processed_text"]))

# Train the topic model; random_state pins the result.
num_topics = 15
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=30,
)

# Score the topics with the c_v coherence measure.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=df["processed_text"],
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model_lda.get_coherence()
print(f"Coherence Score del modello LDA: {coherence_score:.4f}")


Coherence Score del modello LDA: 0.5003


We create the dictionary and corpus for topic modeling.
Train the LDA model.
Calculate the coherence score to assess topic quality.
After several trials to optimize the model, I decided to use 15 topics and 30 passes to get an acceptable coherence score.

In [35]:
# Function to get the topic distribution of a document
def get_topic_features(text, dictionary, lda_model):
    """Return the LDA topic distribution of a document as a fixed-length list.

    Args:
        text: Either the raw text (``str``), which is cleaned with
            ``preprocess_text``, or an already tokenized document (an iterable
            of tokens), which is used as is so the cleaning is not repeated.
        dictionary: Object whose ``doc2bow(tokens)`` turns the tokens into the
            bag-of-words vector the model expects.
        lda_model: Topic model exposing ``num_topics`` and
            ``get_document_topics(bow, minimum_probability=...)``.

    Returns:
        list: ``lda_model.num_topics`` entries; entry ``i`` is the probability
        of topic ``i`` (0.0 for a topic the model did not report).
    """
    tokens = preprocess_text(text) if isinstance(text, str) else list(text)
    bow_vector = dictionary.doc2bow(tokens)
    topic_distribution = lda_model.get_document_topics(bow_vector, minimum_probability=0.0)
    # get_document_topics returns sparse (topic_id, probability) pairs and may
    # leave out very unlikely topics; placing each value by its id keeps the
    # vector length constant, which stacking the rows into a matrix relies on.
    features = [0.0] * lda_model.num_topics
    for topic_id, prob in topic_distribution:
        features[topic_id] = prob
    return features


In [36]:
# Create topic features.
# `corpus` already holds dictionary.doc2bow(...) of every row of
# df["processed_text"], in row order, so it is reused here instead of running
# the spaCy cleaning over every raw article a second time.
df["topic_features"] = pd.Series(
    [
        [prob for _, prob in lda_model.get_document_topics(bow_vector, minimum_probability=0.0)]
        for bow_vector in corpus
    ],
    index=df.index,
)

I decided to add this feature because integrating the topic distribution with other vectorization techniques (such as Word2Vec) can improve the performance of the classification model.

In [37]:
# Vectorization with Word2Vec
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression

# Learn 100-dimensional embeddings from the cleaned articles; words seen fewer
# than 2 times are ignored.
word2vec_model = Word2Vec(
    sentences=df["processed_text"],
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

In [38]:
def get_avg_word2vec(text, model):
    """Average the embeddings of the tokens of ``text`` known to ``model``.

    Args:
        text: Iterable of tokens.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of the known tokens, or a zero
        vector of length ``model.vector_size`` when no token is known.
    """
    keyed_vectors = model.wv
    known_vectors = []
    for token in text:
        if token in keyed_vectors:
            known_vectors.append(keyed_vectors[token])
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# One averaged embedding per article.
df["word2vec_features"] = df["processed_text"].apply(get_avg_word2vec, args=(word2vec_model,))


In [39]:
# Concatenate the Word2Vec embeddings with the LDA topic features, column-wise.
word2vec_matrix = np.vstack(df["word2vec_features"])
topic_matrix = np.vstack(df["topic_features"])
X = np.hstack((word2vec_matrix, topic_matrix))
y = df["label"].values

I now proceed to split the dataset into training and testing and train the model

In [40]:
# Hold out 20% of the rows for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier on the training portion.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions on both portions, used by the evaluation cell.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

In [41]:
# Evaluation of the model: the same four metrics on the train and test splits.
def _train_test_scores(metric):
    """Return (train score, test score) of the fitted model for one metric."""
    return metric(y_train, y_train_pred), metric(y_test, y_test_pred)


train_accuracy, test_accuracy = _train_test_scores(accuracy_score)
train_precision, test_precision = _train_test_scores(precision_score)
train_recall, test_recall = _train_test_scores(recall_score)
train_f1, test_f1 = _train_test_scores(f1_score)

print(f"Training Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")
print(f"Training Precision: {train_precision:.4f}, Test Precision: {test_precision:.4f}")
print(f"Training Recall: {train_recall:.4f}, Test Recall: {test_recall:.4f}")
print(f"Training F1-Score: {train_f1:.4f}, Test F1-Score: {test_f1:.4f}")

Training Accuracy: 0.9615, Test Accuracy: 0.9622
Training Precision: 0.9546, Test Precision: 0.9580
Training Recall: 0.9653, Test Recall: 0.9623
Training F1-Score: 0.9599, Test F1-Score: 0.9602


We compare the accuracy, precision, recall and F1-score on the training and test sets: the values are high and nearly identical on both, so we can assume the model is well trained and is not overfitting.


In [42]:
# Function to classify a new, unseen article
def predict_news(news_text, model, word2vec_model, lda_model, dictionary):
    """Classify one article as fake or true.

    Args:
        news_text: Raw text of the article.
        model: Fitted classifier whose ``predict`` returns label 0 for fake
            news and 1 for true news.
        word2vec_model: Embedding model passed to ``get_avg_word2vec``.
        lda_model: Topic model passed to ``get_topic_features``.
        dictionary: Token dictionary passed to ``get_topic_features``.

    Returns:
        str: ``"Fake News"`` when the predicted label is 0, otherwise
        ``"True News"``.
    """
    processed_text = preprocess_text(news_text)
    word2vec_features = get_avg_word2vec(processed_text, word2vec_model)
    topic_features = get_topic_features(news_text, dictionary, lda_model)
    # Same column layout as the training matrix: embedding first, topics second.
    features = np.hstack((word2vec_features, topic_features)).reshape(1, -1)
    prediction = model.predict(features)
    # predict() returns one label per input row; read the single label
    # explicitly instead of relying on the truthiness of an array comparison.
    return "Fake News" if prediction[0] == 0 else "True News"

In [49]:
# Sample texts used to spot-check the classifier: headlines copied from the
# training data followed by texts the model has never seen.
new_articles = [
  # headlines taken from the dataset (the first two resemble fake-news titles, the next two true-news titles)
    "Texas Governor Just Broke With Trump; This Is Unprecedented",
    "Trumpsters Launch Insane Conspiracy Theory About The Boot On John McCain’s Foot",
    "Companies have up to a year for new U.S. tax bill reporting: SEC",
    "Pence to preside over Senate tax bill vote, his office confirms",
    # texts not in the dataset: either made up or taken from forbes.com and wallstreetjournal.com
    "World War II saw Germany triumph in 1960",
    "President Donald Trump said he would impose a 50% tariff on all Canadian steel and aluminum products Wednesday, doubling the planned global levy for the products and deepening the trade war with the U.S.’s northern neighbor. Trump announced on Truth Social Tuesday he would impose an additional 25% tariff on steel and aluminum imports for Canada—on top of the 25% already planned for global steel and aluminum—in retaliation for Ontario’s 25% charge on electricity imports. He also said he would declare a “National Emergency on Electricity within the threatened area,” which primarily impacts Minnesota, Michigan and New York, writing that it “will allow the U.S. to quickly do what has to be done to alleviate this abusive threat from Canada.” Trump threatened to “substantially increase” tariffs on cars from Canada on April 2—when a wave of global reciprocal tariffs on imports to the U.S. is set to take effect—if Canada does not eliminate “other egregious, longtime tariffs,” warning it would “essentially, permanently shut down the automobile manufacturing business in Canada.”",
    "Election Scandal: Millions of Fake Votes Discovered in Favor of Trump A secret investigation has revealed that millions of fraudulent votes were counted in the last U.S. presidential election in favor of Trump. Leaked documents show that ballot boxes filled with fake votes were introduced in several key states to manipulate the outcome. Despite official denials, suspicions are growing that the entire electoral process has been compromised.These examples illustrate how fake news often uses sensationalist headlines and unverified claims to mislead the public.",
    "Elon Musk cofounded seven companies, including electric car maker Tesla, rocket producer SpaceX and artificial intelligence startup xAI.",
    "Bill Laden became president of the United States after making a coup.",
    "Economists are divided on the impact of this decision. Some argue that it will make economic trends more understandable to the average citizen, while others worry about the long-term effects of linking a nation’s financial stability to burger prices. Meanwhile, Burger King has announced plans to launch its own rival currency, the “WhopperCoin,” leading experts to predict an upcoming “Fast-Food Currency War.”",
    "The EU is targeting a range of American goods, including whiskey and motorcycles, with 50% tariffs, taking aim at politically sensitive products. Canada’s tariffs are set to take effect Thursday.",
    "In a government controlled by Republicans, Democrats see the funding deadline as their best chance to push back against Trump and his allies.",
    "The new system will feature weekly elimination rounds where candidates must compete in challenges such as “Escape the Lobbyists,” “Survive a Senate Hearing,” and “Who Can Fix Healthcare in 60 Seconds?” Voters will cast their ballots through an interactive app, and candidates can earn extra votes by performing viral TikTok dances or surviving a 24-hour livestream debate in a locked room."
]

# Classify every sample text, then print each text next to its predicted class.
classifications = [
    predict_news(article, model, word2vec_model, lda_model, dictionary)
    for article in new_articles
]
for article, result in zip(new_articles, classifications):
    print(f"News: {article}\nClassification: {result}\n")

News: Texas Governor Just Broke With Trump; This Is Unprecedented
Classification: Fake News

News: Trumpsters Launch Insane Conspiracy Theory About The Boot On John McCain’s Foot
Classification: Fake News

News: Companies have up to a year for new U.S. tax bill reporting: SEC
Classification: True News

News: Pence to preside over Senate tax bill vote, his office confirms
Classification: True News

News: World War II saw Germany triumph in 1960
Classification: Fake News

Classification: True News

News: Election Scandal: Millions of Fake Votes Discovered in Favor of Trump A secret investigation has revealed that millions of fraudulent votes were counted in the last U.S. presidential election in favor of Trump. Leaked documents show that ballot boxes filled with fake votes were introduced in several key states to manipulate the outcome. Despite official denials, suspicions are growing that the entire electoral process has been compromised.These examples illustrate how fake news often use

# In the tests carried out on these new articles, the trained model shows excellent results

Now let's export the model in pickle format


In [44]:
# Persist the classifier, the Word2Vec model, the LDA model and the dictionary,
# one pickle file each, in the current working directory.
artifacts = {
    "fake_news_model.pkl": model,
    "word2vec_model.pkl": word2vec_model,
    "lda_model.pkl": lda_model,
    "dictionary.pkl": dictionary,
}
for file_name, artifact in artifacts.items():
    with open(file_name, "wb") as out_file:
        pickle.dump(artifact, out_file)

print("Models and vectorizer saved correctly")


Modelli e vectorizer salvati correttamente
