In [1]:
#Importing libraries.
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
#Reading the fake news csv.
fake_df = pd.read_csv("dataset/Fake.csv")

#Adding 'FAKE' label.
fake_df['label'] = 'FAKE'
fake_df

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",FAKE
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",FAKE
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",FAKE
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",FAKE
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",FAKE
...,...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",FAKE
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",FAKE
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",FAKE
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",FAKE


In [3]:
#Reading the true news csv.
true_df = pd.read_csv("dataset/True.csv")

#Adding 'TRUE' label.
true_df['label'] = 'TRUE'
true_df

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",TRUE
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",TRUE
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",TRUE
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",TRUE
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",TRUE
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",TRUE
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",TRUE
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",TRUE
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",TRUE


In [4]:
#Merging these dataframes and shuffling rows with 'sample' function.
merged_df = pd.concat([fake_df, true_df])
merged_df = merged_df.sample(frac=1, random_state=26)
merged_df

Unnamed: 0,title,text,subject,date,label
223,Trump to give speech on U.S. tax overhaul on W...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"December 11, 2017",TRUE
3446,China says it will stick to commitment to figh...,BEIJING (Reuters) - China said on Friday it wo...,politicsNews,"June 2, 2017",TRUE
9385,Eight protesters arrested at Trump rally in Ca...,"ANAHEIM, Calif. (Reuters) - Some 100 people st...",politicsNews,"May 25, 2016",TRUE
10062,Republican Collins' call for Garland hearings ...,WASHINGTON (Reuters) - A moderate Republican s...,politicsNews,"April 5, 2016",TRUE
15112,"German parties back NATO, want good ties with ...",BERLIN (Reuters) - German parties exploring a ...,worldnews,"November 10, 2017",TRUE
...,...,...,...,...,...
148,Factbox: Republicans to keep an eye on as Sena...,WASHINGTON (Reuters) - Republicans in the U.S....,politicsNews,"December 14, 2017",TRUE
794,Republicans push ahead with tax bill as Democr...,WASHINGTON (Reuters) - Republican lawmakers on...,politicsNews,"November 6, 2017",TRUE
10177,YOUTUBE Gives Disgusting Reason For Pulling 95...,Lynnette Hardway and Rochelle Richardson of No...,politics,"Aug 10, 2017",FAKE
18935,"China says Taiwan not a country, Taiwan says C...",BEIJING/TAIPEI (Reuters) - China warned self-r...,worldnews,"September 27, 2017",TRUE


In [5]:
#Splitting dataset.
x_train, x_test, y_train, y_test = train_test_split(merged_df['text'], merged_df['label'], test_size=0.2, random_state=26)

In [6]:
#Initialize tfidf vectorizer.
tfidf_vec = TfidfVectorizer(stop_words='english', max_df=0.7)

#Fit-transform the train data and just transform the test data.
tfidf_train = tfidf_vec.fit_transform(x_train)
tfidf_test = tfidf_vec.transform(x_test)

In [7]:
#Initialize passive aggressive classifier.
pac = PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_train, y_train)

#Make prediction on the test set and calculate accuracy.
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 99.43%


In [8]:
#Build confusion matrix.
confusion_matrix(y_test, y_pred, labels=['FAKE', 'TRUE'])

array([[4686,   29],
       [  22, 4243]], dtype=int64)

Removing every '(Reuters)' substrings in True.csv's texts.

In [9]:
#Erasing '(Reuters)' substrings in the text because it can cause overfit.
for i in range(len(true_df['text'])):
    true_df['text'][i] = true_df['text'][i].replace('(Reuters)', '')

true_df['text']

0        WASHINGTON  - The head of a conservative Repub...
1        WASHINGTON  - Transgender people will be allow...
2        WASHINGTON  - The special counsel investigatio...
3        WASHINGTON  - Trump campaign adviser George Pa...
4        SEATTLE/WASHINGTON  - President Donald Trump c...
                               ...                        
21412    BRUSSELS  - NATO allies on Tuesday welcomed Pr...
21413    LONDON  - LexisNexis, a provider of legal, reg...
21414    MINSK  - In the shadow of disused Soviet-era f...
21415    MOSCOW  - Vatican Secretary of State Cardinal ...
21416    JAKARTA  - Indonesia will buy 11 Sukhoi fighte...
Name: text, Length: 21417, dtype: object

In [10]:
#Merging dataframes and shuffling rows with 'sample' function.
merged_df = pd.concat([fake_df, true_df])
merged_df = merged_df.sample(frac=1, random_state=26)
merged_df

Unnamed: 0,title,text,subject,date,label
223,Trump to give speech on U.S. tax overhaul on W...,WASHINGTON - U.S. President Donald Trump will...,politicsNews,"December 11, 2017",TRUE
3446,China says it will stick to commitment to figh...,BEIJING - China said on Friday it would stick...,politicsNews,"June 2, 2017",TRUE
9385,Eight protesters arrested at Trump rally in Ca...,"ANAHEIM, Calif. - Some 100 people staged a bo...",politicsNews,"May 25, 2016",TRUE
10062,Republican Collins' call for Garland hearings ...,WASHINGTON - A moderate Republican senator he...,politicsNews,"April 5, 2016",TRUE
15112,"German parties back NATO, want good ties with ...",BERLIN - German parties exploring a coalition...,worldnews,"November 10, 2017",TRUE
...,...,...,...,...,...
148,Factbox: Republicans to keep an eye on as Sena...,WASHINGTON - Republicans in the U.S. Congress...,politicsNews,"December 14, 2017",TRUE
794,Republicans push ahead with tax bill as Democr...,WASHINGTON - Republican lawmakers on Monday b...,politicsNews,"November 6, 2017",TRUE
10177,YOUTUBE Gives Disgusting Reason For Pulling 95...,Lynnette Hardway and Rochelle Richardson of No...,politics,"Aug 10, 2017",FAKE
18935,"China says Taiwan not a country, Taiwan says C...",BEIJING/TAIPEI - China warned self-ruled Taiw...,worldnews,"September 27, 2017",TRUE


In [11]:
#Splitting dataset.
x_train, x_test, y_train, y_test = train_test_split(merged_df['text'], merged_df['label'], test_size=0.2, random_state=26)

In [12]:
#Initialize tfidf vectorizer.
tfidf_vec2 = TfidfVectorizer(stop_words='english', max_df=0.7)

#Fit-transform the train data and just transform the test data.
tfidf_train = tfidf_vec2.fit_transform(x_train)
tfidf_test = tfidf_vec2.transform(x_test)

In [13]:
#Initialize passive aggressive classifier.
pac2 = PassiveAggressiveClassifier(max_iter=50)
pac2.fit(tfidf_train, y_train)

#Make prediction on the test set and calculate accuracy.
y_pred = pac2.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 98.86%


In [14]:
#Build confusion matrix again.
confusion_matrix(y_test, y_pred, labels=['FAKE', 'TRUE'])

array([[4660,   55],
       [  47, 4218]], dtype=int64)

Remove '*Location_name (Reuters) -' from every rows in True.csv's texts.

In [15]:
#Reading the fake news csv.
true_df = pd.read_csv("dataset/True.csv")
#Adding 'TRUE' label.
true_df['label'] = 'TRUE'

#Erasing '(Reuters)' substrings in the text because it can cause overfit.
seperator = '(Reuters) -'
for i in range(len(true_df['text'])):
    if true_df['text'][i].find(seperator) == -1:
        continue
    true_df['text'][i] = true_df['text'][i].split(seperator, 1)[1]

true_df['text']

0         The head of a conservative Republican faction...
1         Transgender people will be allowed for the fi...
2         The special counsel investigation of links be...
3         Trump campaign adviser George Papadopoulos to...
4         President Donald Trump called on the U.S. Pos...
                               ...                        
21412     NATO allies on Tuesday welcomed President Don...
21413     LexisNexis, a provider of legal, regulatory a...
21414     In the shadow of disused Soviet-era factories...
21415     Vatican Secretary of State Cardinal Pietro Pa...
21416     Indonesia will buy 11 Sukhoi fighter jets wor...
Name: text, Length: 21417, dtype: object

In [16]:
#Merging dataframes and shuffling rows with 'sample' function.
merged_df = pd.concat([fake_df, true_df])
merged_df = merged_df.sample(frac=1, random_state=26)
merged_df

Unnamed: 0,title,text,subject,date,label
223,Trump to give speech on U.S. tax overhaul on W...,U.S. President Donald Trump will deliver a sp...,politicsNews,"December 11, 2017",TRUE
3446,China says it will stick to commitment to figh...,China said on Friday it would stick to its co...,politicsNews,"June 2, 2017",TRUE
9385,Eight protesters arrested at Trump rally in Ca...,Some 100 people staged a boisterous but large...,politicsNews,"May 25, 2016",TRUE
10062,Republican Collins' call for Garland hearings ...,A moderate Republican senator heaped praise o...,politicsNews,"April 5, 2016",TRUE
15112,"German parties back NATO, want good ties with ...",German parties exploring a coalition governme...,worldnews,"November 10, 2017",TRUE
...,...,...,...,...,...
148,Factbox: Republicans to keep an eye on as Sena...,Republicans in the U.S. Congress reached a de...,politicsNews,"December 14, 2017",TRUE
794,Republicans push ahead with tax bill as Democr...,Republican lawmakers on Monday began revising...,politicsNews,"November 6, 2017",TRUE
10177,YOUTUBE Gives Disgusting Reason For Pulling 95...,Lynnette Hardway and Rochelle Richardson of No...,politics,"Aug 10, 2017",FAKE
18935,"China says Taiwan not a country, Taiwan says C...",China warned self-ruled Taiwan on Wednesday t...,worldnews,"September 27, 2017",TRUE


In [17]:
#Splitting dataset.
x_train, x_test, y_train, y_test = train_test_split(merged_df['text'], merged_df['label'], test_size=0.2, random_state=26)

In [18]:
#Initialize tfidf vectorizer.
tfidf_vec3 = TfidfVectorizer(stop_words='english', max_df=0.7)

#Fit-transform the train data and just transform the test data.
tfidf_train = tfidf_vec3.fit_transform(x_train)
tfidf_test = tfidf_vec3.transform(x_test)

In [19]:
#Initialize passive aggressive classifier.
pac3 = PassiveAggressiveClassifier(max_iter=50)
pac3.fit(tfidf_train, y_train)

#Make prediction on the test set and calculate accuracy.
y_pred = pac3.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 98.81%


In [20]:
#Build confusion matrix again.
confusion_matrix(y_test, y_pred, labels=['FAKE', 'TRUE'])

array([[4653,   62],
       [  45, 4220]], dtype=int64)