<a href="https://colab.research.google.com/github/nitinlodhi019/FakeNews_Detection/blob/main/FakeNews_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Basic libraries
import pandas as pd
import numpy as np

# NLP and preprocessing
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Model & Evaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Saving model
import joblib


In [None]:
# Fetch the NLTK corpora this notebook relies on: the stopword lists and the
# WordNet database that WordNetLemmatizer looks words up in.
download_status = [nltk.download(resource) for resource in ('stopwords', 'wordnet')]
# Show the status of the final download, as the cell's output.
download_status[-1]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Location of the raw CSV (Colab's working directory).
DATASET_PATH = '/content/Dataset.csv'

# Read the CSV; rows the parser cannot handle are skipped rather than raising.
df = pd.read_csv(DATASET_PATH, on_bad_lines='skip')

# Preview the first five rows.
df.head()


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(72134, 4)

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  72134 non-null  int64 
 1   title       71576 non-null  object
 2   text        72095 non-null  object
 3   label       72134 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.2+ MB


In [None]:
# Class balance: how many rows carry each label value.
label_counts = df['label'].value_counts()
label_counts

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,37106
0,35028


In [None]:
# Count the missing entries in every column.
missing_per_column = df.isna().sum()
missing_per_column

Unnamed: 0,0
Unnamed: 0,0
title,558
text,39
label,0


In [None]:
# Drop only the rows that cannot be used for training.
# The model input is built from 'text' and the target is 'label'; 'title' is
# never used later in this notebook, so an article with a missing title is
# still a valid sample and is kept.
# Reassigning (instead of inplace=True) keeps this cell safe to re-run.
df = df.dropna(subset=['text', 'label'])
df.shape

(71537, 4)

In [None]:
# Count duplicated articles.
# 'Unnamed: 0' appears to be a saved row index (0, 1, 2, ... in the preview),
# so comparing whole rows can never flag a duplicate. Compare the actual
# content columns instead.
# NOTE(review): if this is non-zero, remove the repeats with
# df.drop_duplicates(subset=[...]) before the train/test split; otherwise the
# same article can land in both splits and inflate the test score.
df.duplicated(subset=['title', 'text', 'label']).sum()

np.int64(0)

In [None]:
# English stopwords as a set, so the membership test in clean_text is O(1).
stop_words = set(stopwords.words('english'))
# Single shared lemmatizer instance, reused for every token in clean_text.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one raw article body into a space-separated token string.

    Steps, in order:
      1. lowercase;
      2. delete URL-like tokens (anything starting with ``http`` or ``www``
         up to the next whitespace);
      3. replace every run of non-word characters with a single space;
      4. drop English stopwords and lemmatize the remaining tokens.

    Note that step 3 removes punctuation only: ``\\W`` does not match digits
    or underscores, so numbers are kept as tokens. This is the behaviour
    the fitted vectorizer and model were trained with.

    Args:
        text: Raw text. Anything that is not a ``str`` (e.g. ``None`` or a
            NaN left over from a missing cell) is treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces; ``''`` when no token
        survives.
    """
    # Missing values reach here as None/NaN (a float); there is nothing to clean.
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    # 'https...' is already covered by 'http\S+', and the pattern has no
    # anchors, so no regex flags are needed.
    without_urls = re.sub(r'http\S+|www\S+', '', lowered)
    # Punctuation and other non-word characters become token separators.
    normalized = re.sub(r'\W+', ' ', without_urls)

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in normalized.split()
        if token not in stop_words
    )

# Run every article body through clean_text and keep the result in a new
# column, leaving the raw 'text' column untouched.
cleaned_series = df['text'].map(clean_text)
df['cleaned_text'] = cleaned_series


In [None]:
# Features are the cleaned article bodies; the target is the 0/1 label.
X, y = df['cleaned_text'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features limited to 5000 terms. The vocabulary and IDF weights are
# learned from the training split only and then reused on the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
# Inspect the cleaned training texts (the index is the original row position).
X_train

Unnamed: 0,cleaned_text
51405,suggesting person killed left disagrees desire...
42812,conservative effusive praise tuesday evening p...
36324,read 358 people article written brandon smith ...
31669,lyon france reuters two woman injured friday a...
60173,washington reuters death justice antonin scali...
...,...
37473,wikileaks destroys hillary mouthpiece donna br...
6314,time story headlined obama privately tell dono...
55327,italian week cheered electoral defeat mayor si...
869,authority florida said friday omar mateen whos...


In [None]:
# Inspect the training labels; the index lines up with X_train.
y_train

Unnamed: 0,label
51405,1
42812,0
36324,1
31669,0
60173,0
...,...
37473,1
6314,0
55327,0
869,0


In [None]:
# Fit a default-configured logistic regression on the TF-IDF training matrix.
# `fit` returns the estimator itself, so the final line still shows its repr.
model = LogisticRegression().fit(X_train_tfidf, y_train)
model

In [None]:
# Predicted labels for the held-out test split, used by the evaluation cell.
y_pred = model.predict(X_test_tfidf)

In [None]:
from tabulate import tabulate

# Class ids in the order they are reported, and their display names.
# In this dataset label 1 marks fake news and label 0 real news: in the
# previews shown earlier, Reuters wire stories carry label 0 while the
# sensational headlines carry label 1.
class_ids = [0, 1]
labels = ['Real (0)', 'Fake (1)']

# 1. Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"\033[1m✅ Model Accuracy:\033[0m {acc:.4f} ({acc * 100:.2f}%)\n")

# 2. Confusion matrix: rows are the actual class, columns the predicted class.
#    Passing `labels` pins the row/column order to `class_ids`.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# 3. Pretty-print Confusion Matrix as Table
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("\n\033[1m🟦 Confusion Matrix Table:\033[0m")
print(tabulate(cm_df, headers='keys', tablefmt='pretty'))

# 4. Classification Report (per-class precision / recall / F1 / support)
report = classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=labels,
    output_dict=True,
)
report_df = pd.DataFrame(report).transpose()
report_df = report_df.round(2)

print("\n\033[1m📊 Classification Report:\033[0m")
print(tabulate(report_df, headers='keys', tablefmt='pretty'))


[1m✅ Model Accuracy:[0m 0.9420 (94.20%)


[1m🟦 Confusion Matrix Table:[0m
+----------+----------+----------+
|          | Fake (0) | Real (1) |
+----------+----------+----------+
| Fake (0) |   6607   |   474    |
| Real (1) |   356    |   6871   |
+----------+----------+----------+

[1m📊 Classification Report:[0m
+--------------+-----------+--------+----------+---------+
|              | precision | recall | f1-score | support |
+--------------+-----------+--------+----------+---------+
|   Fake (0)   |   0.95    |  0.93  |   0.94   | 7081.0  |
|   Real (1)   |   0.94    |  0.95  |   0.94   | 7227.0  |
|   accuracy   |   0.94    |  0.94  |   0.94   |  0.94   |
|  macro avg   |   0.94    |  0.94  |   0.94   | 14308.0 |
| weighted avg |   0.94    |  0.94  |   0.94   | 14308.0 |
+--------------+-----------+--------+----------+---------+


In [None]:
# Persist the fitted artifacts first, so they are written even when this
# notebook runs outside Google Colab.
# Save model
joblib.dump(model, 'fake_news_model.pkl')

# Save vectorizer
# The vectorizer was fitted on text produced by clean_text, so inference code
# must apply the same cleaning before calling vectorizer.transform.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

# Download to local machine (optional): `google.colab` only exists inside
# Colab, so skip this step elsewhere instead of failing the whole cell.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('fake_news_model.pkl')
    files.download('tfidf_vectorizer.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>