In [1]:
#Necessary Imports 

import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Basic EDA

In [2]:
# Load the news dataset from the CSV file in the working directory
df = pd.read_csv('news.csv')

# Report the total cell count and the (rows, columns) shape, then preview the first rows
total_elements = df.size
rows_and_columns = df.shape
print("The total number of elements in the dataframe :", total_elements)
print("The number of rows and columns in the dataframe are : " + str(rows_and_columns))
df.head()

The total number of elements in the dataframe : 25340
The number of rows and columns in the dataframe are : (6335, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
# Pull the target column (the 'label' field of each article) out of the dataframe
labels = df['label']
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

# Dataset Preparation - Train-Test Split

In [4]:
# Hold out 30% of the articles for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    labels,
    test_size=0.3,
    random_state=42,
)

In [5]:
# Sanity check: display the training texts produced by the split above
# (the index values are the original row positions in df)
x_train

2771    After months of uncertainty, Donald Trump has ...
6049    25 Views November 10, 2016 GOLD , KWN King Wor...
731     Trump proposals that seem startling now – such...
5835    Holed up in Saudi Arabia, the inner circle of ...
291     Pro-Palestinian Propaganda Lowering Standards ...
4593    These are some of the dead and missing in Tues...
184     On Monday, I made the case that Al Gore should...
4983    Black Female Attorney Demolishes anti-Trump Wh...
4584    The chairman of the House Foreign Relations Co...
3362    A top GOP leader is calling the meeting betwee...
533     WASHINGTON —Two Capitol Hill panels that polic...
1671    Tuesday, 1 November 2016 Kim Kardashian: The Q...
2611    Washington (CNN) Mitch McConnell is the Senate...
3613    Man uses Trump victory as excuse to call ex-gi...
2233    Israel Prime Minister Benjamin Netanyahu took ...
2375    The Democratic party moved a lot closer to cho...
5070    Donald Trump took to Twitter Saturday morning ...
657     Store 

In [6]:
# Sanity check: display the held-out test texts produced by the split above
# (the index values are the original row positions in df)
x_test

1357    Will Trump pull a Brexit times ten? What would...
2080    Clintons Are Under Multiple FBI Investigations...
2718    Dispatches from Eric Zuesse This piece is cros...
812     Print \n[Ed. – Every now and then the facade c...
4886    Nanny In Jail After Force Feeding Baby To Deat...
4890    By Belén Fernández | FAIR PHOTO ABOVE: Hillary...
4714    The words, when they came, had lost no power o...
1782    By Amanda Froelich It should be evident if you...
2445    A white police officer in North Charleston, S....
3574    Tony Blair helpfully describes Remain voters a...
4827    0 comments GOP VP candidate Mike Pence’s Trump...
3995    Clinton lost her temper at an event on Thursda...
318     House Speaker Paul D. Ryan attempted to lift t...
2728    The Washington Post \nExcerpts: Once prohibite...
2170    Project Veritas 4: Robert Creamer's Illegal $2...
4431    Miss Russia AFP/East News \nMiss Russia Alisa ...
217     On September 5, 2006, Eli Chomsky was an edito...
5718    Let us

# Removal of Stop Words & Vectorization

In [7]:
# Initialize a TfidfVectorizer.
# stop_words='english' drops common English stop words (e.g. 'a', 'an', 'the', 'is', 'or');
# max_df=0.7 ignores terms that appear in more than 70% of the documents.
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)

# Fit and transform the train set, then transform the test set.
# fit_transform learns the vocabulary and the IDF weights from the training texts only and
# returns their TF-IDF matrix; transform re-uses that fitted vocabulary/IDF on the test texts,
# so nothing from the test set influences the features.
# (TF-IDF does not centre or standardize the data - no mean/std is involved.)
tfidf_train=tfidf_vectorizer.fit_transform(x_train) 
tfidf_test=tfidf_vectorizer.transform(x_test)

In [8]:
# Train a passive-aggressive classifier for at most 50 passes over the training data.
# The estimator shuffles the training samples after each epoch by default, so random_state
# is pinned (same seed as the train/test split) to make the fitted model - and therefore the
# accuracy and confusion matrix below - reproducible on a fresh Restart & Run All.
# NOTE: after pinning the seed, re-run this cell; the accuracy may differ slightly from
# earlier unseeded runs.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(tfidf_train, y_train)

# Predict on the held-out test set and report the accuracy as a percentage
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 93.79%


# Final Results

In [9]:
# Build the confusion matrix: rows are the actual class, columns the predicted class,
# both ordered as ['FAKE', 'REAL']
a = confusion_matrix(y_test, y_pred, labels=['FAKE', 'REAL'])

print(a)

# Unpack the four cells row by row so each count has a readable name
(fake_kept_fake, fake_called_real), (real_called_fake, real_kept_real) = a

print("----------------------------------------------------------------")
print("The number of confirmed fake news is :" , fake_kept_fake)
print("The number of confirmed real news is :" , real_kept_real)
print("The number of real news wrongly classified as fake news is :" , real_called_fake)
print("The number of fake news wrongly classified as real news is :" , fake_called_real)


[[910  58]
 [ 60 873]]
----------------------------------------------------------------
The number of confirmed fake news is : 910
The number of confirmed real news is : 873
The number of real news wrongly classified as fake news is : 60
The number of fake news wrongly classified as real news is : 58
