## Importing Libraries

In [1]:
#Importing basic libraries and operations
import pandas as pd
import numpy as np
import re
import string

In [2]:
#Importing nltk libraries for language processing
from nltk.corpus import stopwords

In [3]:
#Import machine learning libraries and features
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from textstat import gunning_fog
from scipy.sparse import hstack

In [4]:
#Importing Different Models
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

## Importing Of Datasets

In [5]:
#Reading of csv files
try:
    realnews_df = pd.read_csv('real.csv')
    fakenews_df = pd.read_csv('fake.csv')

except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # Report the problem, then re-raise: swallowing the error here would only
    # postpone the failure to the next cell as a confusing NameError on
    # realnews_df / fakenews_df, and would hide which file or parse step failed.
    print("csv files cannot be read")
    raise

In [6]:
#First 10 readings of real news
realnews_df.head(10)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017"
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017"
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017"
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017"


In [7]:
#First 10 readings of fake news
fakenews_df.head(10)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017"
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017"
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017"
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017"
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017"


In [8]:
#Creating a new column called "type" to represent real or fake news 
realnews_df["type"] = 1
fakenews_df["type"] = 0

In [9]:
#Example of the updated dataframe for real news 
realnews_df.head(1)

Unnamed: 0,title,text,subject,date,type
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1


In [10]:
#Example of the updated dataframe for fake news
fakenews_df.head(1)

Unnamed: 0,title,text,subject,date,type
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0


## Merging Real News and Fake News Dataframes

In [11]:
merged_df = pd.concat([realnews_df,fakenews_df], axis = 0)

In [12]:
#Size of Merged Dataframe
merged_df.shape

(44898, 5)

In [13]:
#List out the existing columns of the newly merged dataframe
merged_df.columns

Index(['title', 'text', 'subject', 'date', 'type'], dtype='object')

In [14]:
#Random sample of 10 rows from the new merged dataframe
merged_df.sample(10)

Unnamed: 0,title,text,subject,date,type
15133,WOW! New Email (Pictured) Uncovered Shows Hill...,Who s Ready For Hillary?Among the thousands mo...,politics,"Oct 2, 2015",0
6500,Democrats demand probe of Trump health nominee,WASHINGTON (Reuters) - Senate Democrats on Thu...,politicsNews,"January 5, 2017",1
17364,LYING WHITE HOUSE PRESS SECRETARY: “OBAMA HAS ...,It s interesting that Josh Earnest still carri...,Government News,"May 7, 2015",0
12711,"Pope singles out Rome's decay, corruption on t...",ROME (Reuters) - Pope Francis lamented the dec...,worldnews,"December 8, 2017",1
18549,"Bangladesh, Myanmar agree on 'working group' f...","DHAKA, (Reuters) - Bangladesh and Myanmar agr...",worldnews,"October 2, 2017",1
17872,LIBERAL TEACHER’S Social Media Message Goes VI...,Anyone who is not yet convinced that liberalis...,left-news,"Oct 2, 2017",0
22707,"Dopey Santas, McAfee Hacked, Silicon Valley vs...",Tune in to the Alternate Current Radio Network...,Middle-east,"December 29, 2017",0
9072,Trump seeks to clarify comments on guns at Orl...,(Reuters) - Republican U.S. presidential candi...,politicsNews,"June 20, 2016",1
13029,BREAKING: OBAMA WILL VETO BILL Unanimously Pas...,This should come as a surprise to no one Yeste...,politics,"Sep 12, 2016",0
20807,FIDEL CASTRO MOCKS President Obama…Blasts Him ...,B..b b but what about the wave the baseball ga...,left-news,"Mar 28, 2016",0


## Data PreProcessing

In [15]:
#Checking for null values in the dataframe
merged_df.isnull().sum()

title      0
text       0
subject    0
date       0
type       0
dtype: int64

In [16]:
#Dropping unnecessary columns
updated_df = merged_df.drop(["title", "subject", "date"], axis = 1)

In [17]:
#Verifying the new size of the updated dataframe
updated_df.shape

(44898, 2)

In [18]:
#Removing duplicate entries from the dataframe
updated_df.drop_duplicates(inplace = True)

In [19]:
#Initialise Stopwords
stop_words = stopwords.words('english')

In [20]:
#Text Cleaning
def text_cleaning(text, stopword_list=None):
    """Normalise one raw news text into a space-separated string of tokens.

    Steps, in order: lowercase, strip URLs, strip ``[...]`` and ``<...>``
    spans, strip ASCII punctuation, turn remaining non-word characters
    (including newlines) into spaces, drop tokens containing a digit, and
    drop stop words.

    Parameters
    ----------
    text : str
        The raw text to clean.
    stopword_list : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    str
        The cleaned text with tokens joined by single spaces.
    """
    if stopword_list is None:
        stopword_list = stop_words
    # Set membership is O(1); the NLTK stop-word list would be scanned per word.
    stopword_set = frozenset(stopword_list)

    #Changes the text to all lowercase
    text = text.lower()

    #Removes the urls. This has to happen before punctuation is stripped:
    #once "://" and "." are gone the pattern can never match and the URL
    #would survive as one long glued token.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    #Removes bracketed / tag-like spans and punctuation
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    #Replaces every remaining non-word character (new lines included) with a space
    text = re.sub(r'\W', ' ', text)

    #Removes any token that contains a digit
    text = re.sub(r'\w*\d\w*', '', text)

    #Remove stopwords (text is already lowercase at this point)
    return ' '.join(word for word in text.split() if word not in stopword_set)

In [21]:
#Applying the compiled text processing function to clean the dataframe text
updated_df["text"] = updated_df["text"].apply(text_cleaning)

In [22]:
#feature extraction function

def feature_extractor(text, refit=False):
    """Build the TF-IDF + Gunning Fog feature matrix for the given documents.

    The first call (or any call with ``refit=True``) fits a new
    ``TfidfVectorizer`` on ``text`` and stores it as
    ``feature_extractor.tfidf_vectorizer``. Later calls only *transform* with
    that fitted vocabulary, so new documents get the same columns as the
    matrix the models were trained on.

    Unlike the previous version, this function uses the ``text`` argument
    instead of the global ``updated_df`` and no longer writes a ``gfi``
    column into that dataframe.

    Parameters
    ----------
    text : str or iterable of str
        Cleaned document(s). A bare string is treated as one document.
    refit : bool, default False
        Force fitting a fresh vectorizer on ``text`` (use this after the
        training corpus has changed).

    Returns
    -------
    scipy sparse matrix
        One row per document: the TF-IDF columns followed by a final column
        holding the Gunning Fog index.
    """
    # A bare string would otherwise be iterated character by character.
    documents = [text] if isinstance(text, str) else list(text)

    # Calculate GFI for each text
    gfi_scores = np.array([gunning_fog(document) for document in documents], dtype=float)

    tfidf_vectorizer = getattr(feature_extractor, "tfidf_vectorizer", None)
    if refit or tfidf_vectorizer is None:
        # Fit TF-IDF vectorizer and transform text data
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
        # Remember the fitted vocabulary only after fitting succeeded.
        feature_extractor.tfidf_vectorizer = tfidf_vectorizer
    else:
        # Reuse the fitted vocabulary so the column layout matches training.
        tfidf_matrix = tfidf_vectorizer.transform(documents)

    # Concatenate TF-IDF matrix with GFI column
    return hstack((tfidf_matrix, gfi_scores[:, None]))

In [23]:
featured_text = feature_extractor(updated_df["text"])

In [24]:
#Split data into random train and test data in a ratio of 70% and 30% respectively
X_train, X_test, y_train, y_test = train_test_split(featured_text, updated_df['type'], test_size=0.3, random_state=42)

## Baseline Performance: PassiveAggressive Classifier

In [25]:
PassiveAggressiveClassifier_base = PassiveAggressiveClassifier() 
PassiveAggressiveClassifier_base.fit(X_train, y_train) 

In [26]:
#PassiveAggressive Prediction
PA_base_model_prediction = PassiveAggressiveClassifier_base.predict(X_test)

In [27]:
#PassiveAggressive Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, PA_base_model_prediction) * 100} %\n\n") 

Test Set Accuracy : 85.74385510996119 %




In [28]:
#Baseline Model Classification Report
print(f"PassiveAgressive Classification Report : \n\n{classification_report(y_test, PA_base_model_prediction)}")

PassiveAgressive Classification Report : 

              precision    recall  f1-score   support

           0       1.00      0.69      0.81      5223
           1       0.79      1.00      0.88      6372

    accuracy                           0.86     11595
   macro avg       0.90      0.84      0.85     11595
weighted avg       0.89      0.86      0.85     11595



## Comparison with other Classification/Regression Models

## Logistic Regression Model

In [29]:
# The fitted model stays bound to the name `LogisticRegression` because the
# following cell calls `LogisticRegression.predict(...)`. That rebinding shadows
# the imported class, so the class is taken from an aliased import here;
# otherwise re-running this cell raises
# "TypeError: 'LogisticRegression' object is not callable".
from sklearn.linear_model import LogisticRegression as LogisticRegressionClassifier

# max_iter equals the row count reported by merged_df.shape (44898) —
# presumably chosen as a generous convergence cap; TODO confirm.
LogisticRegression = LogisticRegressionClassifier(max_iter=44898)
LogisticRegression.fit(X_train, y_train)

In [30]:
#LogisticRegression Prediction
LR_model_prediction = LogisticRegression.predict(X_test)

In [31]:
#LogisticRegression Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, LR_model_prediction) * 100} %\n\n") 

Test Set Accuracy : 98.40448469167745 %




In [32]:
#Logistic Regression Model Classification Report
print(f" Logistic Regression Classification Report : \n\n{classification_report(y_test, LR_model_prediction)}")

 Logistic Regression Classification Report : 

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      5223
           1       0.98      0.99      0.99      6372

    accuracy                           0.98     11595
   macro avg       0.98      0.98      0.98     11595
weighted avg       0.98      0.98      0.98     11595



## Multinomial Naive Bayes Classification Model

In [33]:
MultinomialNaiveBayes = MultinomialNB()
MultinomialNaiveBayes.fit(X_train,y_train)

In [34]:
#Multinomial Naive Bayes Prediction
MultiNB_model_prediction = MultinomialNaiveBayes.predict(X_test)

In [35]:
#Multinomial Naive Bayes Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, MultiNB_model_prediction) * 100} %\n\n") 

Test Set Accuracy : 85.73523070288917 %




In [36]:
#Multinomial Naive Bayes Model Classification Report
print(f" Multinomial Naive Bayes Classification Report : \n\n{classification_report(y_test, MultiNB_model_prediction)}")

 Multinomial Naive Bayes Classification Report : 

              precision    recall  f1-score   support

           0       0.99      0.69      0.81      5223
           1       0.80      0.99      0.88      6372

    accuracy                           0.86     11595
   macro avg       0.89      0.84      0.85     11595
weighted avg       0.88      0.86      0.85     11595



## Extreme Gradient Boosting Classification Model

In [37]:
XGBClassifier_base = XGBClassifier()
XGBClassifier_base.fit(X_train,y_train)

In [38]:
#Extreme Gradient Boosting Prediction
XGB_base_model_prediction = XGBClassifier_base.predict(X_test)

In [39]:
#Extreme Gradient Boosting Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, XGB_base_model_prediction) * 100} %\n\n") 

Test Set Accuracy : 99.63777490297542 %




In [40]:
#Extreme Gradient Boosting Classification Report
print(f" Extreme Gradient Boosting Classification Report : \n\n{classification_report(y_test, XGB_base_model_prediction)}")

 Extreme Gradient Boosting Classification Report : 

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5223
           1       1.00      1.00      1.00      6372

    accuracy                           1.00     11595
   macro avg       1.00      1.00      1.00     11595
weighted avg       1.00      1.00      1.00     11595



## Improving the Feature Data with GloVe Embedding

In [41]:
#Load GloVe embeddings into memory
def load_glove_embeddings(embedding_file):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-empty line is split on whitespace: the first field is the word
    and the remaining fields are parsed as a ``float32`` numpy array.

    Parameters
    ----------
    embedding_file : str or path-like
        Path to the UTF-8 encoded embedding file.

    Returns
    -------
    dict
        Mapping from word to its ``float32`` vector.
    """
    embeddings = {}
    with open(embedding_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank line (e.g. a trailing newline at end of file) has no
            # fields; indexing values[0] on it would raise IndexError.
            if not values:
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

#Load GloVe embeddings
glove_embeddings = load_glove_embeddings("glove.6B.100d.txt")

#Function to extract features using GloVe embeddings and PCA
def feature_extractor_with_glove(text, embeddings, n_components=100):
    """Embed each document as the mean of its word vectors, then apply PCA.

    Parameters
    ----------
    text : iterable of str
        Cleaned documents; each is split on whitespace into words.
    embeddings : dict
        Mapping from word to numpy vector (as returned by
        ``load_glove_embeddings``). Words missing from it are ignored.
    n_components : int, default 100
        Number of PCA components requested. It is clamped to
        ``min(n_documents, embedding_dim)``, the largest value PCA accepts,
        so small corpora or lower-dimensional embedding files do not raise.

    Returns
    -------
    numpy.ndarray
        PCA-transformed document vectors, one row per document.

    Raises
    ------
    ValueError
        If ``embeddings`` is empty.
    """
    if not embeddings:
        raise ValueError("embeddings must contain at least one word vector")

    # If no embeddings are found for any word in a document, use zeros with the
    # same shape/dtype as the stored vectors. Built once instead of per document.
    zero_vector = np.zeros_like(next(iter(embeddings.values())))

    embedded_text = []
    for sentence in text:
        word_vectors = [embeddings[word] for word in sentence.split() if word in embeddings]
        if word_vectors:
            embedded_text.append(np.mean(word_vectors, axis=0))
        else:
            embedded_text.append(zero_vector)
    embedded_text = np.asarray(embedded_text)

    #Apply PCA
    # NOTE: PCA is fitted on whatever is passed in; for an unbiased evaluation
    # it should be fitted on the training portion only.
    n_components = min(n_components, *embedded_text.shape)
    pca = PCA(n_components=n_components)
    pca.fit(embedded_text)
    pca_transformed = pca.transform(embedded_text)

    return pca_transformed

In [42]:
#Update your feature extraction function to use GloVe embeddings
featured_text = feature_extractor_with_glove(updated_df["text"], glove_embeddings)

In [43]:
#Split data into random train and test data in a ratio of 70% and 30% respectively
X_train, X_test, y_train, y_test = train_test_split(featured_text, updated_df['type'], test_size=0.3, random_state=42)

In [44]:
PassiveAggressiveClassifier_with_glove = PassiveAggressiveClassifier() 
PassiveAggressiveClassifier_with_glove.fit(X_train, y_train) 

In [45]:
#PassiveAggressive with GloVe Embedding Prediction
PA_with_glove_model_prediction = PassiveAggressiveClassifier_with_glove.predict(X_test)

In [46]:
#PassiveAggressive with GloVe Embedding Accuracy Level Evaluation
print(f"Test Set Accuracy : {accuracy_score(y_test, PA_with_glove_model_prediction) * 100} %\n\n") 

Test Set Accuracy : 94.376886589047 %




In [47]:
#Passive Aggressive Model with GloVe Embedding Classification Report
print(f"PassiveAgressive Classification With Glove Embedding Report : \n\n{classification_report(y_test, PA_with_glove_model_prediction)}")

PassiveAgressive Classification With Glove Embedding Report : 

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      5223
           1       0.95      0.95      0.95      6372

    accuracy                           0.94     11595
   macro avg       0.94      0.94      0.94     11595
weighted avg       0.94      0.94      0.94     11595



## Fake News Detection Using The Best Model: Extreme Gradient Boosting Classification Model

In [48]:
#Verification Alert Function
def news_prediction(n):
    """Translate a predicted class label into a user-facing message.

    Label 0 yields the fake-news warning and label 1 the real-news message;
    any other value yields None.
    """
    if n == 0:
        return "Warning! This is fake news!"
    if n == 1:
        return "This news is real!"
    return None

In [49]:
#Compiled main demo function
def news_testing(news):
    """Clean one news text, classify it with the XGBoost model and print the verdict.

    The text is wrapped in a one-element Series named "text", cleaned with
    text_cleaning(), turned into features with feature_extractor() and passed
    to XGBClassifier_base.predict(). The message for the first predicted label
    is printed; the function returns None.

    NOTE(review): the verdict is only meaningful if feature_extractor() encodes
    the text passed to it using the vocabulary the model was trained on —
    verify.
    """
    cleaned_news = pd.Series([news], name="text").apply(text_cleaning)
    news_features = feature_extractor(cleaned_news)
    predicted_label = XGBClassifier_base.predict(news_features)[0]

    print("\nExtreme Gradient Boosting Prediction: {}".format(news_prediction(predicted_label)))
    return None

## Demo with News Data

In [50]:
news = str(input())
news_testing(news)

"As U.S. budget fight looms, Republicans flip their fiscal script","WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for n