# Fake News Detection Using Naive Bayes Classification

In [17]:
#import library 
import numpy as np
import pandas as pd

In [18]:
# Load the labelled news articles from news.csv (expected in the working
# directory) into a DataFrame; later cells read its `text` and `label` columns.
df = pd.read_csv('news.csv')

In [3]:
# Display the loaded frame; pandas abbreviates long frames to the first and
# last few rows, so this is only a quick visual sanity check.
df

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [19]:
# (rows, columns) of the loaded dataset.
df.shape

(6335, 4)

In [20]:
# Count missing values per column to confirm no imputation or row dropping
# is needed before vectorising the text.
df.isnull().sum()

id       0
title    0
text     0
label    0
dtype: int64

In [21]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      6335 non-null   int64 
 1   title   6335 non-null   object
 2   text    6335 non-null   object
 3   label   6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [22]:
# Class distribution of the target column, to check for class imbalance
# before choosing an evaluation metric.
df.label.value_counts()

label
REAL    3171
FAKE    3164
Name: count, dtype: int64

The two classes are almost evenly split (3,171 REAL vs. 3,164 FAKE), so the dataset is balanced.

### Train Test Split

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the articles for evaluation. The fixed random_state keeps
# the partition identical from one run to the next.
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
)

In [5]:
# Number of training articles (80% of the data).
x_train.shape

(5068,)

In [6]:
# Number of held-out test articles (20% of the data).
x_test.shape

(1267,)

In [12]:
# Peek at the training labels: a Series of 'REAL' / 'FAKE' strings that keeps
# the original (shuffled) row index from df.
y_train

1142    REAL
2654    FAKE
5395    FAKE
1170    FAKE
4371    FAKE
        ... 
3772    REAL
5191    FAKE
5226    FAKE
5390    REAL
860     FAKE
Name: label, Length: 5068, dtype: object

## Convert the text into bag-of-words features
- using `CountVectorizer`

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words with default settings: each column is one vocabulary token and
# each value is that token's count in the article.
countV = CountVectorizer()
# Learn the vocabulary from the training articles only ...
x_train_cv = countV.fit_transform(x_train)
# ... and reuse that same vocabulary for the test articles, so both matrices
# share the same columns and nothing is fitted on the test set.
x_test_cv = countV.transform(x_test)

In [8]:
# Sparse document-term matrix for the training set; the repr reports its
# shape (articles x vocabulary size) and the number of stored non-zero counts.
x_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1713334 stored elements and shape (5068, 61667)>

In [9]:
# Sparse document-term matrix for the test set; it has the same number of
# columns as the training matrix because it uses the training vocabulary.
x_test_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 438518 stored elements and shape (1267, 61667)>

In [10]:
# Dense view of the training matrix, for inspection only.
# NOTE(review): this materialises the full 5068 x 61667 int64 matrix
# (about 2.5 GB) just to show a mostly-zero preview; consider densifying a
# small slice instead, e.g. x_train_cv[:5].toarray().
x_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(5068, 61667))

In [11]:
# Dense view of the test matrix, for inspection only.
# NOTE(review): this materialises the full 1267 x 61667 int64 matrix
# (about 0.6 GB); consider densifying a small slice instead,
# e.g. x_test_cv[:5].toarray().
x_test_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(1267, 61667))

## Train a Naive Bayes classifier: import `MultinomialNB` and fit it on the training data

In [13]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default hyperparameters; it models the
# per-class token counts produced by CountVectorizer.
bayes_model = MultinomialNB()
# Train on the bag-of-words counts of the training articles and their labels.
bayes_model.fit(x_train_cv, y_train)

In [14]:
# Mean accuracy of the fitted model on the held-out test set.
bayes_model.score(x_test_cv,y_test)

0.898973954222573

#### Import metrics to evaluate the model

In [15]:
from sklearn.metrics import accuracy_score, classification_report

In [16]:
# Predict once and reuse the predictions for every metric, so inference over
# the test set runs a single time.
y_pred = bayes_model.predict(x_test_cv)

# Fraction of test articles whose predicted label matches the true label.
# This is the same quantity bayes_model.score(x_test_cv, y_test) reports.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1, plus macro and weighted averages.
print(classification_report(y_test, y_pred))

Accuracy: 0.898973954222573
              precision    recall  f1-score   support

        FAKE       0.92      0.87      0.89       628
        REAL       0.88      0.93      0.90       639

    accuracy                           0.90      1267
   macro avg       0.90      0.90      0.90      1267
weighted avg       0.90      0.90      0.90      1267

