## Importing Libraries

In [2]:
#Importing basic libraries and operations
import pandas as pd
import numpy as np
import re
import string

In [3]:
#Importing ntlk libraries for language processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [4]:
#Import machine learning libraries and features
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
#Importing Different Models
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

## Importing Of Datasets

In [6]:
#Reading of csv files
try:
    realnews_df = pd.read_csv('real.csv')
    fakenews_df = pd.read_csv('fake.csv')

except:
    print("csv files cannot be read")

In [7]:
#First 10 readings of real news
realnews_df.head(10)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017"
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017"
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017"


In [8]:
#First 10 readings of fake news
fakenews_df.head(10)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017"
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017"
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017"
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017"
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017"


In [9]:
#Creating a new column called "type" to represent real or fake news 
realnews_df["type"] = 1
fakenews_df["type"] = 0

In [10]:
#Example of the updated dataframe for real news 
realnews_df.head(1)

Unnamed: 0,title,text,subject,date,type
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1


In [11]:
#Example of the updated dataframe for fake news
fakenews_df.head(1)

Unnamed: 0,title,text,subject,date,type
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0


## Merging Real News and Fake News Dataframes

In [12]:
merged_df = pd.concat([realnews_df,fakenews_df], axis = 0)

In [13]:
#Size of Merged Dataframe
merged_df.shape

(44898, 5)

In [14]:
#List out the existing columns of the newly merged dataframe
merged_df.columns

Index(['title', 'text', 'subject', 'date', 'type'], dtype='object')

In [15]:
#First 10 readings of the new merged dataframe
merged_df.sample(10)

Unnamed: 0,title,text,subject,date,type
10273,WATCH REAGAN WARN US AND DRAW BATTLE LINES…Tru...,President Reagan Warned Of The Dangers Of Libe...,politics,"Jul 30, 2017",0
7914,"President Obama Announces The Major, Unfulfil...",President Obama delivered an address from the ...,News,"February 23, 2016",0
14561,Botswana's Khama tells Mugabe to go,JOHANNESBURG (Reuters) - Zimbabwe President Ro...,worldnews,"November 17, 2017",1
11809,PRESIDENT TRUMP Describes Receiving Nuclear Co...,Pres. Trump on receiving nuclear codes: It is...,politics,"Jan 26, 2017",0
11531,WOW! 83-YEAR OLD SENATOR Challenged By Teen To...,,politics,"Mar 1, 2017",0
20613,WATCH 8TH Grader DESTROY Disgraced Detroit Cit...,Watching this video will help anyone who is cu...,left-news,"May 6, 2016",0
22874,Boiler Room EP #113 – ‘CNN is ISIS’,Tune in to the Alternate Current Radio Network...,Middle-east,"June 16, 2017",0
16109,LOU DOBBS: SEAN SPICER On Trump Firing Comey [...,White House Press Secretary Sean Spicer appear...,Government News,"May 10, 2017",0
6633,Trump nominates trading firm founder Viola as ...,WASHINGTON/NEW YORK (Reuters) - President-elec...,politicsNews,"December 19, 2016",1
16760,Putin-Trump meeting not yet planned for Asia s...,MOSCOW (Reuters) - Russian President Vladimir ...,worldnews,"October 23, 2017",1


## Data PreProcessing

In [16]:
#Checking for null values in the dataframe
merged_df.isnull().sum()

title      0
text       0
subject    0
date       0
type       0
dtype: int64

In [17]:
#Dropping unecessary columns
updated_df = merged_df.drop(["title", "subject", "date"], axis = 1)

In [18]:
#Verifying the new size of the updated dataframe
updated_df.shape

(44898, 2)

In [19]:
#Removing duplicate entries from the dataframe
updated_df.drop_duplicates(inplace = True)

In [20]:
#Removing Stopwords
stop_words = stopwords.words('english')

In [21]:
#Text Processing Steps
def text_process(text):
    
    #Changes the text to all lowercase
    text = text.lower()
    
    #Removes punctuation
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    
    #Remove metacharacter patterns
    text = re.sub("\\W"," ",text) 
    
    #Removes the urls
    text = re.sub('https?://\S+|www\.\S+', '', text)
    
    #Removes the new line spacings
    text = re.sub('\n', '', text)
    
    #Removes digits
    text = re.sub('\w*\d\w*', '', text) 
    
    return text

In [22]:
#Applying the compiled text processing function to clean the dataframe text
updated_df["text"] = updated_df["text"].apply(text_process)

In [23]:
#Setting the x and y for model training
x = updated_df["text"]
y = updated_df["type"]

In [24]:
#Split data into random train and test data in a ratio of 70% and 30% respectively
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [25]:
#Implement the feature extractor
feature_extractor = TfidfVectorizer()
training_data = feature_extractor.fit_transform(x_train)
test_data = feature_extractor.transform(x_test)

## Baseline Performance: PassiveAgressive Classifier

In [26]:
PassiveAggressiveClassifier = PassiveAggressiveClassifier() 
PassiveAggressiveClassifier.fit(training_data, y_train) 

In [27]:
#PassiveAggressive Prediction
PA_model_prediction = PassiveAggressiveClassifier.predict(test_data)

In [28]:
#PassiveAggressive Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, PA_model_prediction) * 100} %\n\n") 

Test Set Accuracy : 99.2496765847348 %




In [29]:
#Baseline Model Classification Report
print(f"PassiveAgressive Classification Report : \n\n{classification_report(y_test, PA_model_prediction)}")

PassiveAgressive Classification Report : 

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5151
           1       0.99      0.99      0.99      6444

    accuracy                           0.99     11595
   macro avg       0.99      0.99      0.99     11595
weighted avg       0.99      0.99      0.99     11595



## Comparison with other Classification/Regression Models

## Logistic Regression Model

In [30]:
LogisticRegression = LogisticRegression()
LogisticRegression.fit(training_data,y_train)

In [31]:
#LogisticRegression Prediction
LR_model_prediction = LogisticRegression.predict(test_data)

In [32]:
#LogisticRegression Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, LR_model_prediction) * 100} %\n\n") 

Test Set Accuracy : 98.4389823199655 %




In [33]:
#Logistic Regression Model Classification Report
print(f" Logistic Regression Classification Report : \n\n{classification_report(y_test, LR_model_prediction)}")

 Logistic Regression Classification Report : 

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      5151
           1       0.98      0.99      0.99      6444

    accuracy                           0.98     11595
   macro avg       0.98      0.98      0.98     11595
weighted avg       0.98      0.98      0.98     11595



## Multinomial Naive Bayes Classification Model

In [34]:
MultinomialNaiveBayes = MultinomialNB()
MultinomialNaiveBayes.fit(training_data,y_train)

In [35]:
#Multinomial Naive Bayes Prediction
MultiNB_model_prediction = MultinomialNaiveBayes.predict(test_data)

In [36]:
#Multinomial Naive Bayes Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, MultiNB_model_prediction) * 100} %\n\n") 

Test Set Accuracy : 92.66925398878827 %




In [37]:
#Multinomial Naive Bayes Model Classification Report
print(f" Multinomial Naive Bayes Classification Report : \n\n{classification_report(y_test, MultiNB_model_prediction)}")

 Multinomial Naive Bayes Classification Report : 

              precision    recall  f1-score   support

           0       0.97      0.86      0.91      5151
           1       0.90      0.98      0.94      6444

    accuracy                           0.93     11595
   macro avg       0.93      0.92      0.92     11595
weighted avg       0.93      0.93      0.93     11595



## Extreme Gradient Boosting Classification Model

In [38]:
XGBClassifier = XGBClassifier()
XGBClassifier.fit(training_data,y_train)

In [39]:
# Extreme Gradient Boosting Prediction
XGB_model_prediction = XGBClassifier.predict(test_data)

In [40]:
# Extreme Gradient Boosting Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, XGB_model_prediction) * 100} %\n\n") 

Test Set Accuracy : 99.60327727468736 %




In [41]:
# Extreme Gradient Boosting Classification Report
print(f" Extreme Gradient Boosting Classification Report : \n\n{classification_report(y_test, XGB_model_prediction)}")

 Extreme Gradient Boosting Classification Report : 

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5151
           1       0.99      1.00      1.00      6444

    accuracy                           1.00     11595
   macro avg       1.00      1.00      1.00     11595
weighted avg       1.00      1.00      1.00     11595



## Fake News Detection Using Extreme Gradient Boosting Classification Model

In [42]:
#Verification Alert Function
def news_prediction(n):
    if n == 0:
        return "Warning! This is fake news!"
    elif n == 1:
        return "This news is real!"

In [43]:
#Compiled main demo function
def news_testing(news):
    TestNews = {"text":[news]}
    new_test = pd.DataFrame(TestNews)
    new_test["text"] = new_test["text"].apply(text_process) 
    new_test2 = new_test["text"]
    test_Vector = feature_extractor.transform(new_test2)
    XGB_model_prediction = XGBClassifier.predict(test_Vector)

    return print("\nXGB Prediction: {}".format(news_prediction(XGB_model_prediction[0])))

## Demo with Real News Data

In [None]:
news = str(input())
news_testing(news)

## Demo with Fake News Data

In [44]:
news = str(input())
news_testing(news)

Drunk Bragging Trump Staffer Started Russian Collusion Investigation,"House Intelligence Committee Chairman Devin Nunes is going to have a bad day. He s been under the assumption, like many of us, that the Christopher Steele-dossier was what prompted the Russia investigation so he s been lashing out at the Department of Justice and the FBI in order to protect Trump. As it happens, the dossier is not what started the investigation, according to documents obtained by the New York Times.Former Trump campaign adviser George Papadopoulos was drunk in a wine bar when he revealed knowledge of Russian opposition research on Hillary Clinton.On top of that, Papadopoulos wasn t just a covfefe boy for Trump, as his administration has alleged. He had a much larger role, but none so damning as being a drunken fool in a wine bar. Coffee boys  don t help to arrange a New York meeting between Trump and President Abdel Fattah el-Sisi of Egypt two months before the election. It was known before that the 