## Importing Libraries

In [87]:
#Importing basic libraries and operations
import pandas as pd
import numpy as np
import re
import string

In [88]:
#Importing ntlk libraries for language processing
from nltk.corpus import stopwords

In [89]:
#Import machine learning libraries and features
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from textstat import gunning_fog
import spacy

In [90]:
#Importing Different Models
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

## Importing Of Datasets

In [91]:
#Reading of csv files
try:
    realnews_df = pd.read_csv('real.csv')
    fakenews_df = pd.read_csv('fake.csv')

except:
    print("csv files cannot be read")

In [92]:
#First 10 readings of real news
realnews_df.head(10)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017"
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017"
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017"


In [93]:
#First 10 readings of fake news
fakenews_df.head(10)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017"
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017"
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017"
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017"
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017"


In [94]:
#Creating a new column called "type" to represent real or fake news 
realnews_df["type"] = 1
fakenews_df["type"] = 0

In [95]:
#Example of the updated dataframe for real news 
realnews_df.head(1)

Unnamed: 0,title,text,subject,date,type
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1


In [96]:
#Example of the updated dataframe for fake news
fakenews_df.head(1)

Unnamed: 0,title,text,subject,date,type
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0


## Merging Real News and Fake News Dataframes

In [97]:
merged_df = pd.concat([realnews_df,fakenews_df], axis = 0)

In [98]:
#Size of Merged Dataframe
merged_df.shape

(44898, 5)

In [99]:
#List out the existing columns of the newly merged dataframe
merged_df.columns

Index(['title', 'text', 'subject', 'date', 'type'], dtype='object')

In [100]:
#First 10 readings of the new merged dataframe
merged_df.sample(10)

Unnamed: 0,title,text,subject,date,type
16381,Islamic State claims responsibility for attack...,CAIRO (Reuters) - Islamic State has claimed re...,worldnews,"October 26, 2017",1
18723,Migrant smuggling crackdown triggered clashes ...,TRIPOLI (Reuters) - An armed group in the Liby...,worldnews,"September 29, 2017",1
9012,Watch Heartbreaking Cries From An Abused Pupp...,A video of a terrified and previously abused p...,News,"January 2, 2016",0
8059,U.S. lawmakers accuse Russia of seeking to inf...,WASHINGTON (Reuters) - The top Democrats on th...,politicsNews,"September 22, 2016",1
14043,"Aid agencies say Yemen blockade remains, Egela...",GENEVA (Reuters) - The Saudi-led coalition s b...,worldnews,"November 23, 2017",1
2064,"As tax debate heats up, Republicans tweak busi...",WASHINGTON (Reuters) - Congressional Republica...,politicsNews,"August 22, 2017",1
7519,Legendary Investigative Journalist Carl Berns...,Veteran investigative reporter Carl Bernstein ...,News,"March 13, 2016",0
11329,ONE HILARIOUS TWEET Perfectly Sums Up How Irre...,Nunes dropped a bombshell that Obama admin spi...,politics,"Mar 23, 2017",0
342,Trump to nominate former NASA chief Griffin fo...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"December 5, 2017",1
926,WH Starts Condensing Intel Briefings Into Twe...,Donald Trump s obsession with Twitter and inab...,News,"July 5, 2017",0


## Data PreProcessing

In [101]:
#Checking for null values in the dataframe
merged_df.isnull().sum()

title      0
text       0
subject    0
date       0
type       0
dtype: int64

In [102]:
#Dropping unecessary columns
updated_df = merged_df.drop(["title", "subject", "date"], axis = 1)

In [103]:
#Verifying the new size of the updated dataframe
updated_df.shape

(44898, 2)

In [104]:
#Removing duplicate entries from the dataframe
updated_df.drop_duplicates(inplace = True)

In [105]:
#Initialise Stopwords
stop_words = stopwords.words('english')

In [106]:
#Text Cleaning
def text_cleaning(text):
    
    #Changes the text to all lowercase
    text = text.lower()
    
    #Removes punctuation
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    
    #Remove metacharacter patterns
    text = re.sub("\\W"," ",text) 
    
    #Removes the urls
    text = re.sub('https?://\S+|www\.\S+', '', text)
    
    #Removes the new line spacings
    text = re.sub('\n', '', text)
    
    #Removes digits
    text = re.sub('\w*\d\w*', '', text)
    
    #Remove stopwords
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])
    
    return text

In [107]:
#Applying the compiled text processing function to clean the dataframe text
updated_df["text"] = updated_df["text"].apply(text_cleaning)

In [112]:
def calculate_gfi(text):
    return gunning_fog(text)

# Calculate GFI for each text
updated_df['gfi'] = updated_df['text'].apply(calculate_gfi)

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit TF-IDF vectorizer and transform text data
tfidf_matrix = tfidf_vectorizer.fit_transform(updated_df['text'])

# Concatenate TF-IDF matrix with GFI column
from scipy.sparse import hstack
features = hstack((tfidf_matrix, np.array(updated_df['gfi'])[:, None]))

In [113]:
#Split data into random train and test data in a ratio of 70% and 30% respectively
X_train, X_test, y_train, y_test = train_test_split(features, updated_df['type'], test_size=0.3, random_state=42)

## Baseline Performance: PassiveAgressive Classifier

In [None]:
PassiveAggressiveClassifier = PassiveAggressiveClassifier() 
PassiveAggressiveClassifier.fit(X_train, y_train) 

In [None]:
#PassiveAggressive Prediction
PA_model_prediction = PassiveAggressiveClassifier.predict(test_data)

In [None]:
#PassiveAggressive Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, PA_model_prediction) * 100} %\n\n") 

In [None]:
#Baseline Model Classification Report
print(f"PassiveAgressive Classification Report : \n\n{classification_report(y_test, PA_model_prediction)}")

## Comparison with other Classification/Regression Models

## Logistic Regression Model

In [None]:
LogisticRegression = LogisticRegression()
LogisticRegression.fit(training_data,y_train)

In [None]:
#LogisticRegression Prediction
LR_model_prediction = LogisticRegression.predict(test_data)

In [None]:
#LogisticRegression Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, LR_model_prediction) * 100} %\n\n") 

In [None]:
#Logistic Regression Model Classification Report
print(f" Logistic Regression Classification Report : \n\n{classification_report(y_test, LR_model_prediction)}")

## Multinomial Naive Bayes Classification Model

In [None]:
MultinomialNaiveBayes = MultinomialNB()
MultinomialNaiveBayes.fit(training_data,y_train)

In [None]:
#Multinomial Naive Bayes Prediction
MultiNB_model_prediction = MultinomialNaiveBayes.predict(test_data)

In [None]:
#Multinomial Naive Bayes Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, MultiNB_model_prediction) * 100} %\n\n") 

In [None]:
#Multinomial Naive Bayes Model Classification Report
print(f" Multinomial Naive Bayes Classification Report : \n\n{classification_report(y_test, MultiNB_model_prediction)}")

## Extreme Gradient Boosting Classification Model

In [None]:
XGBClassifier = XGBClassifier()
XGBClassifier.fit(training_data,y_train)

In [None]:
# Extreme Gradient Boosting Prediction
XGB_model_prediction = XGBClassifier.predict(test_data)

In [None]:
# Extreme Gradient Boosting Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, XGB_model_prediction) * 100} %\n\n") 

In [None]:
# Extreme Gradient Boosting Classification Report
print(f" Extreme Gradient Boosting Classification Report : \n\n{classification_report(y_test, XGB_model_prediction)}")

## Fake News Detection Using Extreme Gradient Boosting Classification Model

In [None]:
#Verification Alert Function
def news_prediction(n):
    if n == 0:
        return "Warning! This is fake news!"
    elif n == 1:
        return "This news is real!"

In [None]:
#Compiled main demo function
def news_testing(news):
    TestNews = {"text":[news]}
    new_test = pd.DataFrame(TestNews)
    new_test["text"] = new_test["text"].apply(text_cleaning) 
    new_test2 = new_test["text"]
    test_Vector = feature_extractor.transform(new_test2)
    XGB_model_prediction = XGBClassifier.predict(test_Vector)

    return print("\nXGB Prediction: {}".format(news_prediction(XGB_model_prediction[0])))

## Demo with News Data

In [None]:
news = str(input())
news_testing(news)