<a href="https://colab.research.google.com/github/manishasiddi24/Fake_News_Classification_Project/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install kaggle

import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
import joblib



In [8]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
from google.colab import files
files.upload()   # Upload kaggle.json

Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"manishasiddi","key":"908dc90efa88c22800d1fe68200a7a83"}'}

In [13]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [11]:
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

Dataset URL: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset
License(s): CC-BY-NC-SA-4.0
Downloading fake-and-real-news-dataset.zip to /content
  0% 0.00/41.0M [00:00<?, ?B/s]
100% 41.0M/41.0M [00:00<00:00, 759MB/s]


In [12]:
import zipfile

with zipfile.ZipFile("fake-and-real-news-dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("fake_news_data")

In [14]:
!ls -l fake_news_data

total 113648
-rw-r--r-- 1 root root 62789876 Sep 15 07:50 Fake.csv
-rw-r--r-- 1 root root 53582940 Sep 15 07:50 True.csv


In [15]:
fake_df = pd.read_csv("fake_news_data/Fake.csv")
real_df = pd.read_csv("fake_news_data/True.csv")
fake_df["label"] = 1
real_df["label"] = 0
df = pd.concat([fake_df, real_df], axis=0).reset_index(drop=True)
print("Dataset shape:", df.shape)
df.head()

Dataset shape: (44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [16]:
from nltk.corpus import stopwords
import string
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)

df['clean_text'] = df['text'].apply(clean_text)
df[['text','clean_text']].head()

Unnamed: 0,text,clean_text
0,Donald Trump just couldn t wish all Americans ...,donald trump wish americans happy new year lea...
1,House Intelligence Committee Chairman Devin Nu...,house intelligence committee chairman devin nu...
2,"On Friday, it was revealed that former Milwauk...",friday revealed former milwaukee sheriff david...
3,"On Christmas day, Donald Trump announced that ...",christmas day donald trump announced would bac...
4,Pope Francis used his annual Christmas Day mes...,pope francis used annual christmas day message...


In [17]:
X = df['clean_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Train size: (35918,)
Test size: (8980,)


In [18]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print(X_train_tfidf.shape, X_test_tfidf.shape)

(35918, 5000) (8980, 5000)


In [19]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred = lr_model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

Accuracy: 0.9899777282850779
F1 Score: 0.9904782056707575


In [20]:
joblib.dump(lr_model, "fake_news_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [21]:
!ls -l

total 42212
-rw-r--r-- 1 root root 42975911 Apr 19  2024  fake-and-real-news-dataset.zip
drwxr-xr-x 2 root root     4096 Sep 15 07:50  fake_news_data
-rw-r--r-- 1 root root    40863 Sep 15 07:56  fake_news_model.pkl
-rw-r--r-- 1 root root       68 Sep 15 07:49 'kaggle (1).json'
-rw-r--r-- 1 root root       68 Sep 15 07:42  kaggle.json
drwxr-xr-x 1 root root     4096 Sep  9 13:46  sample_data
-rw-r--r-- 1 root root   184633 Sep 15 07:56  tfidf_vectorizer.pkl


In [23]:
def predict_news(text):
    # Clean the text same way we did during training
    text_clean = clean_text(text)
    # Transform into TF-IDF vector
    text_tfidf = vectorizer.transform([text_clean])
    # Predict using logistic regression model
    pred = lr_model.predict(text_tfidf)[0]
    # Return label
    return "FAKE" if pred == 1 else "REAL"

In [24]:
print(predict_news("Aliens landed in New York today!"))
print(predict_news("Government announces new economic policy."))

FAKE
FAKE
