#Importing the required Modules

In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # To vectorize the text 
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

#Mounting the drive


In [2]:
# Colab-only step: mount Google Drive at /content/drive so the dataset archive
# stored under "My Drive" can be read by the following cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!unzip "/content/drive/My Drive/news.zip" -d "/content/"

Archive:  /content/drive/My Drive/news.zip
replace /content/news.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


#Read the dataset

In [7]:
# Load the extracted news CSV into a DataFrame
df = pd.read_csv(
    '/content/news.csv'
)

#Data Analysis

In [8]:
# Show the (rows, columns) shape, then preview the first ten rows
frame_shape = df.shape
print(frame_shape)
df.head(n=10)

(6335, 4)


Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [9]:
# Encode the string labels as integers: FAKE -> 0, REAL -> 1.
d={"FAKE":0,"REAL":1}
# Values not present in `d` (e.g. labels already encoded by a previous run of this
# cell) pass through unchanged, so re-running the cell no longer turns every
# label into NaN the way a plain `.map(d)` would.
df['label']=df['label'].map(lambda value: d.get(value, value))

df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,1
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,0
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,0
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",1


#Splitting the data set


In [10]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=7,
)

#Tfidf Vectorizer


In [11]:
# Build the TF-IDF vectorizer: English stop words are dropped, and terms that
# occur in more than 70% of the documents are ignored (max_df=0.7)
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)

# Fit on the training text only, then apply the fitted vectorizer to the test text
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

#Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

# Create the model with default settings and fit it on the TF-IDF training
# features; `fit` returns the fitted estimator, which is bound to `lr`.
lr = LogisticRegression().fit(tfidf_train, y_train)

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix,precision_score,recall_score,f1_score

# Predict once and reuse the result for every metric instead of re-running
# `lr.predict` on the whole test set four times.
lr_pred = lr.predict(tfidf_test)

# Each score is reported as a percentage.
act = accuracy_score(y_test,lr_pred)
print('Test Accuracy is: ',(act*100))
p = precision_score(y_test,lr_pred)
print('Test Precision is: ',(p*100))
r = recall_score(y_test,lr_pred)
print('Test Recall is: ',(r*100))
f = f1_score(y_test,lr_pred)
print('Test F1 Score is: ',(f*100))


Test Accuracy is:  91.71270718232044
Test Precision is:  93.66666666666667
Test Recall is:  89.34817170111288
Test F1 Score is:  91.45646867371848


#Random Forest Classifier

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted model, and the metrics reported below, are
# reproducible across kernel restarts; 7 matches the seed used for the split.
rf = RandomForestClassifier(random_state=7)
rf = rf.fit(tfidf_train,y_train)

In [None]:
# Predict once and reuse the result for every metric instead of re-running
# `rf.predict` on the whole test set four times.
rf_pred = rf.predict(tfidf_test)

# Each score is reported as a percentage.
act = accuracy_score(y_test,rf_pred)
print('Test Accuracy is: ',(act*100))
p = precision_score(y_test,rf_pred)
print('Test Precision is: ',(p*100))
r = recall_score(y_test,rf_pred)
print('Test Recall is: ',(r*100))
f = f1_score(y_test,rf_pred)
print('Test F1 Score is: ',(f*100))

Test Accuracy is:  90.76558800315706
Test Precision is:  91.55844155844156
Test Recall is:  89.66613672496025
Test F1 Score is:  90.60240963855422


#Passive Aggressive Classifier

In [15]:
#Initialize a PassiveAggressiveClassifier
pac=PassiveAggressiveClassifier(max_iter=50)
pac=pac.fit(tfidf_train,y_train)



In [16]:
# Score the passive-aggressive model on the held-out test features
y_pred = pac.predict(tfidf_test)

# Compute all four metrics from the same predictions first ...
act = accuracy_score(y_test, y_pred)
p = precision_score(y_test, y_pred)
r = recall_score(y_test, y_pred)
f = f1_score(y_test, y_pred)

# ... then report each one as a percentage
print('Test Accuracy is: ', (act * 100))
print('Test Precision is: ', (p * 100))
print('Test Recall is: ', (r * 100))
print('Test F1 Score is: ', (f * 100))

Test Accuracy is:  92.5808997632202
Test Precision is:  91.99372056514915
Test Recall is:  93.1637519872814
Test F1 Score is:  92.57503949447077
