01/14/2026

Author: Nelson Nishio

Fake_News_Detection_Model.ipynb

Python notebook that trains a Logistic Regression model for fake news detection and exports it (as joblib files and as JSON).

In [None]:
# Mount Google Drive so later cells can write artifacts under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import kagglehub

# Download the Kaggle dataset (or reuse a cached copy); `path` is the local
# directory that holds its files.
path = kagglehub.dataset_download("emineyetm/fake-news-detection-datasets")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'fake-news-detection-datasets' dataset.
Path to dataset files: /kaggle/input/fake-news-detection-datasets


In [None]:
# Read the fake-news articles from the directory kagglehub actually returned,
# rather than a hard-coded Colab location that may not exist elsewhere.
fake_news_df = pd.read_csv(f"{path}/News _dataset/Fake.csv")

In [None]:
# Read the genuine articles from the directory kagglehub actually returned,
# rather than a hard-coded Colab location that may not exist elsewhere.
true_news_df = pd.read_csv(f"{path}/News _dataset/True.csv")

In [None]:
# Tag every article with its class before merging: 1 = fake, 0 = true.
fake_news_df['label'], true_news_df['label'] = 1, 0

# Stack the fake articles on top of the true ones and renumber the rows.
news_df = pd.concat((fake_news_df, true_news_df), ignore_index=True)
news_df

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1
...,...,...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",0




In [None]:
# Class balance check: number of articles per label value.
news_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,23481
0,21417


# 1 Preprocessing

In [None]:
# Count missing values in each column.
news_df.isnull().sum()

Unnamed: 0,0
title,0
text,0
label,0


# separating the data & label

In [None]:
# The target is the label column alone; the features are every other column.
y = news_df['label']
X = news_df.drop(columns='label')

In [None]:
# Preview the first rows with the notebook's rich table display instead of
# printing the whole frame as wrapped plain text.
X.head()

                                                   title  \
0       Donald Trump Sends Out Embarrassing New Year’...   
1       Drunk Bragging Trump Staffer Started Russian ...   
2       Sheriff David Clarke Becomes An Internet Joke...   
3       Trump Is So Obsessed He Even Has Obama’s Name...   
4       Pope Francis Just Called Out Donald Trump Dur...   
...                                                  ...   
44893  'Fully committed' NATO backs new U.S. approach...   
44894  LexisNexis withdrew two products from Chinese ...   
44895  Minsk cultural hub becomes haven from authorities   
44896  Vatican upbeat on possibility of Pope Francis ...   
44897  Indonesia to buy $1.14 billion worth of Russia...   

                                                    text    subject  \
0      Donald Trump just couldn t wish all Americans ...       News   
1      House Intelligence Committee Chairman Devin Nu...       News   
2      On Friday, it was revealed that former Milwauk...       New

# Stemming

# Steps:
replacing non-letter characters with spaces  
lower case                 
splitting                             
removing stopwords                              
stemming                                   

In [None]:
import nltk
nltk.download('stopwords')
ps = PorterStemmer()

# Load the stopword list once and keep it as a set: the membership test below
# runs for every word of every article, so rebuilding the list per word (as the
# previous version did) dominates the runtime.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Steps: replace every character outside a-z/A-Z with a space, lowercase,
    split on whitespace, drop English stopwords, Porter-stem what remains.

    Args:
        content: The raw text to clean (a string).

    Returns:
        The surviving stemmed tokens joined by single spaces; an empty string
        when no token survives.
    """
    words = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in ENGLISH_STOPWORDS)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from tqdm import tqdm

# Make `progress_apply` available on pandas objects so this long pass reports progress.
tqdm.pandas()

# Overwrites the raw article text with its stemmed form, so re-running this cell
# stems already-stemmed text; reload the data first if a clean re-run is needed.
news_df['text'] = news_df['text'].progress_apply(stemming)
news_df['text']

100%|██████████| 44898/44898 [33:45<00:00, 22.16it/s]


Unnamed: 0,text
0,donald trump wish american happi new year leav...
1,hous intellig committe chairman devin nune go ...
2,friday reveal former milwauke sheriff david cl...
3,christma day donald trump announc would back w...
4,pope franci use annual christma day messag reb...
...,...
44893,brussel reuter nato alli tuesday welcom presid...
44894,london reuter lexisnexi provid legal regulator...
44895,minsk reuter shadow disus soviet era factori m...
44896,moscow reuter vatican secretari state cardin p...


In [None]:
# Persist the processed dataset to Drive, leaving out the row index.
news_df.to_csv("/content/drive/MyDrive/! Grade 12/news_stemmed.csv", index=False)

# separating the data and label


In [None]:
# Rebind X and y: the model input is only the `text` column, the target is the 0/1 label.
X = news_df['text'].values
y = news_df['label'].values

# converting the textual data to numerical data

In [None]:
# Learn the TF-IDF vocabulary/weights and vectorise the corpus in a single pass.
# Reading from news_df (not X) keeps this cell re-runnable after X has been
# replaced by the sparse matrix.
# NOTE(review): the vectorizer is fitted on every article, including those that
# later land in the test split, so test-set statistics leak into the features.
# Fitting on the training texts only would require splitting before this cell.
vector = TfidfVectorizer()
X = vector.fit_transform(news_df['text'].values)

In [None]:
# Show the matrix summary (format, dtype, stored elements, shape) rather than
# printing individual coordinate/value pairs.
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6792176 stored elements and shape (44898, 89633)>
  Coords	Values
  (0, 473)	0.030353709218059625
  (0, 1739)	0.05240236580522227
  (0, 1749)	0.0900274557950121
  (0, 2170)	0.023621442564174935
  (0, 2300)	0.015336876232104514
  (0, 2414)	0.031192561193349677
  (0, 2560)	0.06654066606741882
  (0, 2573)	0.03693793784788616
  (0, 2915)	0.04016842626578405
  (0, 3028)	0.038268893857811306
  (0, 3143)	0.048562349447150516
  (0, 3438)	0.03863174373467533
  (0, 5435)	0.04513133662532027
  (0, 6659)	0.024236107164094356
  (0, 7850)	0.054123995505274636
  (0, 9424)	0.03105361940271527
  (0, 10476)	0.06265812140551041
  (0, 11074)	0.06939178681994727
  (0, 11075)	0.07979215123038806
  (0, 12573)	0.046523344482246265
  (0, 13710)	0.029029336620933273
  (0, 13948)	0.03169835392055289
  (0, 14665)	0.025889069776583035
  (0, 15103)	0.024967361223590508
  (0, 15344)	0.025136891091617198
  :	:
  (44897, 72602)	0.15067642805775505
  (44897,

# Splitting the dataset to training & test data

In [None]:
# Hold out 20% of the rows for testing; `stratify=y` keeps the label ratio the
# same in both splits and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=2)

In [None]:
# (rows in the training split, number of TF-IDF features)
X_train.shape

(35918, 89633)

# Training the Model: Logistic Regression

In [None]:
# Fit a logistic regression classifier, constructed with no arguments, on the training split.
model = LogisticRegression()
model.fit(X_train,Y_train)

In [None]:
# Accuracy on the training set.
# accuracy_score is declared as (y_true, y_pred), so the ground truth goes first.
train_y_pred = model.predict(X_train)
print(accuracy_score(Y_train, train_y_pred))

0.9898936466395679


In [None]:
# Accuracy on the held-out test set.
# accuracy_score is declared as (y_true, y_pred), so the ground truth goes first.
testing_y_pred = model.predict(X_test)
print(accuracy_score(Y_test, testing_y_pred))

0.987750556792873


In [None]:
import joblib

# Persist both fitted objects to Drive: the classifier and the vectorizer that
# produced its input features.
for fitted_object, target_path in (
    # trained Logistic Regression model
    (model, "/content/drive/MyDrive/! Grade 12/logistic_regression_model.joblib"),
    # fitted TfidfVectorizer
    (vector, "/content/drive/MyDrive/! Grade 12/tfidf_vectorizer.joblib"),
):
    joblib.dump(fitted_object, target_path)

print("Model and TF-IDF vectorizer saved successfully!")

Model and TF-IDF vectorizer saved successfully!


In [None]:
"""
Export the trained logistic regression model to JSON for the browser extension.
Converts the .joblib artifacts into a plain JSON parameter file.
"""

import joblib
import json
import numpy as np

# Load the trained model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only
# load artifacts you created yourself.
model = joblib.load("/content/drive/MyDrive/! Grade 12/logistic_regression_model.joblib")

# The saved object may be a Pipeline (vectorizer + classifier) or a bare classifier.
if hasattr(model, 'named_steps'):
    # It's a Pipeline
    vectorizer = model.named_steps['tfidfvectorizer']  # Adjust name if different
    classifier = model.named_steps['logisticregression']  # Adjust name if different
else:
    # It's a standalone model - the vectorizer lives in a separate file
    print("Model is not a Pipeline. Loading vectorizer separately...")

    try:
        vectorizer = joblib.load("/content/drive/MyDrive/! Grade 12/tfidf_vectorizer.joblib")
    except FileNotFoundError as missing_file:
        # Raise instead of calling exit(): exit() is meant for interactive
        # shells and, in a notebook, stops the kernel rather than reporting a
        # failed cell with a traceback.
        raise FileNotFoundError(
            "Could not find vectorizer file.\n"
            "Please provide the vectorizer separately or specify the correct filename.\n"
            "\nIf you trained them together, you might have saved them like this:\n"
            "  X_train_tfidf = vectorizer.fit_transform(X_train)\n"
            "  model.fit(X_train_tfidf, y_train)\n"
            "\nYou need to save the vectorizer:\n"
            "  joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')"
        ) from missing_file
    else:
        print("Loaded vectorizer from tfidf_vectorizer.joblib")

    classifier = model

# Extract model parameters as plain Python lists/floats so they are JSON-serialisable.
model_params = {
    'vocabulary': vectorizer.get_feature_names_out().tolist(),
    'idf_values': vectorizer.idf_.tolist(),
    'coefficients': classifier.coef_[0].tolist(),
    'intercept': float(classifier.intercept_[0])
}

# The three per-feature lists must line up one-to-one; a mismatch means the
# vectorizer file does not belong to this classifier.
feature_counts = {
    key: len(model_params[key]) for key in ('vocabulary', 'idf_values', 'coefficients')
}
if len(set(feature_counts.values())) != 1:
    raise ValueError(f"Vectorizer and classifier disagree on feature count: {feature_counts}")

# Save to JSON
with open('model_params.json', 'w', encoding='utf-8') as f:
    json.dump(model_params, f)

print("Model exported successfully!")
print(f"Vocabulary size: {len(model_params['vocabulary'])}")
print(f"Feature count: {len(model_params['coefficients'])}")
print(f"Intercept: {model_params['intercept']}")

Model is not a Pipeline. Loading vectorizer separately...
Loaded vectorizer from tfidf_vectorizer.joblib
Model exported successfully!
Vocabulary size: 89633
Feature count: 89633
Intercept: 2.0121161981010878
