# "Build a model to detect fake news using ML
Fake news spreads like wildfire, and this is a major problem in the current era.
This notebook shows how to distinguish fake news from real news by training a supervised learning model on labelled articles.
Dataset: Detecting Fake News Dataset https://drive.google.com/file/d/1er9NJTLUA3qnRuyhfzuN0XUsoIC4a-_q/view
"

# Importing required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import nltk
import re

# Read the dataset

In [2]:
# Load the news dataset from a CSV file in the working directory and display it.
df=pd.read_csv('news.csv')
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


# Explore Data

In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [4]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6335 entries, 0 to 6334
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  6335 non-null   int64 
 1   title       6335 non-null   object
 2   text        6335 non-null   object
 3   label       6335 non-null   object
dtypes: int64(1), object(3)
memory usage: 198.1+ KB


In [6]:
# Descriptive statistics; only the numeric 'Unnamed: 0' column is summarised.
df.describe()

Unnamed: 0.1,Unnamed: 0
count,6335.0
mean,5280.415627
std,3038.503953
min,2.0
25%,2674.5
50%,5271.0
75%,7901.0
max,10557.0


In [7]:
# Count missing values per column before any preprocessing.
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [8]:
# (rows, columns) of the loaded frame.
df.shape

(6335, 4)

In [9]:
# Count the distinct values in each column.
df.nunique()

Unnamed: 0    6335
title         6256
text          6060
label            2
dtype: int64

# Use LabelEncoder to encode the text labels (FAKE/REAL) as integers

In [10]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels with integer codes; the preview below shows
# FAKE encoded as 0 and REAL encoded as 1.
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1


# Preparing the text data using NLP (Natural Language Processing)

In [40]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance; cleaning_data calls wl.lemmatize on every kept token.
wl=WordNetLemmatizer()

In [41]:
nltk.download('stopwords')
# Load the corpus reader under its own alias so that the name `stopwords` can
# be rebound to the word collection without breaking a re-run of this cell
# (rebinding the reader itself would leave nothing to call .words() on).
from nltk.corpus import stopwords as stopwords_corpus
# cleaning_data reads the global name `stopwords`, so that name is kept. A set
# makes its per-token membership test a hash lookup instead of a list scan.
stopwords=set(stopwords_corpus.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naksh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# Fetch the WordNet corpus (presumably the data WordNetLemmatizer relies on — confirm against the NLTK docs).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\naksh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Cleaning data for its efficient use

In [15]:
def cleaning_data(row):
    """Normalise one piece of text into a space-separated string of lemmas.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, and the result is split on whitespace. Tokens found
    in the module-level ``stopwords`` collection are dropped; each remaining
    token is passed through ``wl.lemmatize``.

    Parameters
    ----------
    row : str
        Raw text; must support ``.lower()``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty if none survive).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', row.lower())
    kept = []
    for tok in letters_only.split():
        if tok in stopwords:
            continue
        kept.append(wl.lemmatize(tok))
    return ' '.join(kept)

In [16]:
# Clean both free-text columns with the same routine; passing the function
# itself to apply() calls it once per cell value.
for text_column in ('title', 'text'):
    df[text_column] = df[text_column].apply(cleaning_data)

In [17]:
# Re-check for missing values after the text columns have been cleaned.
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

# Feature Extraction

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# TF-IDF over word 1- to 4-grams, with the vocabulary limited by max_features.
# lowercase=False because cleaning_data has already lower-cased the text.
vector_df = TfidfVectorizer(
    ngram_range=(1, 4),
    max_features=7000,
    lowercase=False,
)

In [20]:
# Confirm that cleaning left the frame's dimensions unchanged.
df.shape

(6335, 4)

In [21]:
# Display the frame after cleaning and label encoding.
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,smell hillary fear,daniel greenfield shillman journalism fellow f...,0
1,10294,watch exact moment paul ryan committed politic...,google pinterest digg linkedin reddit stumbleu...,0
2,3608,kerry go paris gesture sympathy,u secretary state john f kerry said monday sto...,1
3,10142,bernie supporter twitter erupt anger dnc tried...,kaydee king kaydeeking november lesson tonight...,0
4,875,battle new york primary matter,primary day new york front runner hillary clin...,1
...,...,...,...,...
6330,4490,state department say find email clinton specia...,state department told republican national comm...,1
6331,8062,p pb stand plutocratic pentagon,p pb stand plutocratic pentagon posted oct wik...,0
6332,8622,anti trump protester tool oligarchy information,anti trump protester tool oligarchy reform alw...,0
6333,4021,ethiopia obama seek progress peace security ea...,addis ababa ethiopia president obama convened ...,1


In [22]:
# Only the first 3500 rows are used. The cleaned article body is the feature
# and the encoded label is the target; the columns are selected by name
# ('text' is positional column 2 and 'label' is positional column 3).
X, Y = df['text'].iloc[:3500], df['label'].iloc[:3500]
for preview in (X, Y):
    print(preview.head())

0    daniel greenfield shillman journalism fellow f...
1    google pinterest digg linkedin reddit stumbleu...
2    u secretary state john f kerry said monday sto...
3    kaydee king kaydeeking november lesson tonight...
4    primary day new york front runner hillary clin...
Name: text, dtype: object
0    0
1    0
2    1
3    0
4    1
Name: label, dtype: int32


# Splitting using train_test_split

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0
)

# Fit the vectorizer on the training rows only, then reuse the fitted
# vectorizer to transform the held-out rows. toarray() converts each result
# to a dense array.
vec_x_train = vector_df.fit_transform(x_train).toarray()
vec_x_test = vector_df.transform(x_test).toarray()

print(x_train.shape, x_test.shape)
print(vec_x_train.shape, vec_x_test.shape)

(2625,) (875,)
(2625, 7000) (875, 7000)


In [24]:
# Class balance of the training labels.
y_train.value_counts()

1    1314
0    1311
Name: label, dtype: int64

In [25]:
# Class balance of the held-out test labels.
y_test.value_counts()

0    450
1    425
Name: label, dtype: int64

In [27]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise, so the cell runs on both.
if hasattr(vector_df, "get_feature_names_out"):
    feature_names = vector_df.get_feature_names_out()
else:
    feature_names = vector_df.get_feature_names()

# Wrap the dense TF-IDF matrices in frames whose columns are the vocabulary
# terms; both frames share one column list so train and test stay aligned.
x_train_data=pd.DataFrame(vec_x_train,columns=feature_names)
x_test_data=pd.DataFrame(vec_x_test,columns=feature_names)

# Using Naive Bayes Algorithm for prediction

In [28]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,classification_report

In [29]:
# Gaussian Naive Bayes classifier constructed with its default settings.
gnb=GaussianNB()

In [30]:
# fit() returns the estimator itself, so training and prediction on the
# held-out rows can be chained into one expression.
y_pred_test = gnb.fit(x_train_data, y_train).predict(x_test_data)
y_pred_test

array([1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,

# Prediction

In [31]:
# Classify one ad-hoc snippet, using the same cleaning and the already-fitted
# vectorizer as the training data.
news=cleaning_data("ADDIS ABABA, Ethiopia —President Obama convene")
# The model was fitted on a DataFrame with named columns. Giving the sample
# the same column labels avoids scikit-learn's "X does not have valid feature
# names" warning, which a bare ndarray would trigger.
news_features=pd.DataFrame(vector_df.transform([news]).toarray(),columns=x_train_data.columns)
prediction=gnb.predict(news_features)
prediction

array([1])

# Classification Report

In [32]:
# Per-class precision, recall and F1 on the held-out test rows.
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       0.92      0.81      0.86       450
           1       0.82      0.92      0.87       425

    accuracy                           0.87       875
   macro avg       0.87      0.87      0.87       875
weighted avg       0.87      0.87      0.87       875



In [33]:
# Predict on the rows the model was trained on, for comparison with the test scores.
y_pred_train=gnb.predict(x_train_data)
y_pred_train

array([0, 0, 1, ..., 0, 0, 1])

In [34]:
# Per-class precision, recall and F1 on the training rows.
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      1311
           1       0.96      0.98      0.97      1314

    accuracy                           0.97      2625
   macro avg       0.97      0.97      0.97      2625
weighted avg       0.97      0.97      0.97      2625



In [35]:
# Accuracy on the training rows (not on the held-out test set).
print(accuracy_score(y_train,y_pred_train))

0.9714285714285714
