# Fake News Detection using NLP and Supervised Machine Learning

Algorithms used:
- Naive Bayes
- K-Nearest Neighbours (KNN)
- Support Vector Machine (SVM)

Dataset: ISOT Fake News Dataset (the notebook expects `Fake.csv` and `True.csv` in the working directory)

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Load the two CSV files; every row of Fake.csv is labelled 0 and every row
# of True.csv is labelled 1.
fake = pd.read_csv("Fake.csv")
real = pd.read_csv("True.csv")

fake["label"] = 0   # Fake News
real["label"] = 1   # Real News

# Stack both classes and shuffle them together. The shuffle is seeded so that
# Restart Kernel -> Run All yields the same row order (and the same preview
# below) on every run.
data = pd.concat([fake, real])
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

data.head()

Unnamed: 0,title,text,subject,date,label
0,House panel approves plan to privatize air tra...,WASHINGTON (Reuters) - A Republican-controlled...,politicsNews,"February 11, 2016",1
1,Trump populism comes to Canada as Conservative...,OTTAWA (Reuters) - Canada’s answer to Donald T...,politicsNews,"December 29, 2016",1
2,"Maradona backs Venezuela's Maduro, signs for W...",CARACAS (Reuters) - Former Argentine soccer gr...,worldnews,"November 8, 2017",1
3,PBS HOST ASKS HILLARY:“Do You Believe The Pres...,PBS host Judy Woodroof asked Hillary if she be...,politics,"Sep 16, 2017",0
4,Republican Trump releases healthcare proposals,WASHINGTON (Reuters) - U.S. Republican preside...,politicsNews,"March 3, 2016",1


In [4]:
def clean_text(text):
    """Normalise a piece of raw text for vectorisation.

    Steps, in order: lower-case, drop anything enclosed in square brackets,
    replace every non-word character with a space, collapse whitespace runs
    into a single space.

    Parameters
    ----------
    text : str or None or float
        Raw text. ``None`` and float NaN (what pandas yields for an empty CSV
        field) are treated as missing; any other non-string is converted
        with ``str()``.

    Returns
    -------
    str
        The cleaned text, or ``""`` for a missing value. Leading/trailing
        single spaces are not stripped.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)   # non-greedy: one [...] group at a time
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [5]:
# Run clean_text over every article body and store the result back in the
# "text" column, then preview the first rows of the cleaned column.
data["text"] = data["text"].map(clean_text)

data["text"].head()

0    washington reuters a republican controlled u s...
1    ottawa reuters canada s answer to donald trump...
2    caracas reuters former argentine soccer great ...
3    pbs host judy woodroof asked hillary if she be...
4    washington reuters u s republican presidential...
Name: text, dtype: object

In [6]:
X = data["text"]
y = data["label"]

vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
X_tfidf = vectorizer.fit_transform(X)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

In [8]:
# Multinomial Naive Bayes: fit on the training features, predict the held-out
# split, then report accuracy and the per-class precision/recall/F1 table.
nb_model = MultinomialNB().fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

print("Naive Bayes Accuracy:", accuracy_score(y_true=y_test, y_pred=nb_pred))
print(classification_report(y_true=y_test, y_pred=nb_pred))

Naive Bayes Accuracy: 0.9351893095768374
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      4654
           1       0.94      0.93      0.93      4326

    accuracy                           0.94      8980
   macro avg       0.94      0.93      0.94      8980
weighted avg       0.94      0.94      0.94      8980



In [9]:
# K-Nearest Neighbours with k=5: fit on the training features, predict the
# held-out split, then report accuracy and the per-class metrics.
# NOTE(review): in the recorded output this model scores far below the other
# two classifiers - worth investigating before relying on it.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

print("KNN Accuracy:", accuracy_score(y_true=y_test, y_pred=knn_pred))
print(classification_report(y_true=y_test, y_pred=knn_pred))

KNN Accuracy: 0.6426503340757238
              precision    recall  f1-score   support

           0       0.59      0.99      0.74      4654
           1       0.96      0.27      0.42      4326

    accuracy                           0.64      8980
   macro avg       0.78      0.63      0.58      8980
weighted avg       0.77      0.64      0.59      8980



In [10]:
# Linear SVM. The solver can shuffle the data internally, so the seed is
# pinned to keep the fitted model (which predict_news relies on) and the
# reported scores reproducible across Restart Kernel -> Run All.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train, y_train)

svm_pred = svm_model.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))

SVM Accuracy: 0.994543429844098
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4654
           1       1.00      0.99      0.99      4326

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [12]:
def predict_news(news_text):
    """Classify a single raw news string as "Real News" or "Fake News".

    The text is normalised with ``clean_text``, encoded with the module-level
    ``vectorizer`` and scored with the module-level ``svm_model``.

    Parameters
    ----------
    news_text : str
        Raw news text to classify.

    Returns
    -------
    str
        "Real News" when the model predicts label 1, otherwise "Fake News".
    """
    features = vectorizer.transform([clean_text(news_text)])
    predicted_label = svm_model.predict(features)[0]
    if predicted_label == 1:
        return "Real News"
    return "Fake News"

In [13]:
# Smoke-test the prediction helper on one hand-written sentence.
sample_news = "The government announced new economic policies today."
print("Prediction:", predict_news(news_text=sample_news))

Prediction: Fake News
