In [6]:
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, DistilBertConfig, create_optimizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight

import spacy
from spacy.lang.en import English
from nltk.corpus import stopwords
import nltk

In [None]:
# import model for testing
model_path = '../models/distilbert_model_best.h5'
model = TFDistilBertForSequenceClassification.from_pretrained(model_path)

# import tokenizer for testing
tokenizer_path = '../tokenizers/distilbert_tokenizer_best'
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_path)


In [None]:
# Download the dictionary for stopwords
nltk.download('stopwords')
# Get the set of stopwords
stop_words_set = set(stopwords.words('english'))
# Load English tokenizer from spacy
nlp = English()
spacy_tokenizer = nlp.tokenizer ## make instance
# Create function to clean text -- lowercase, remove non alphanumeric, remove stop words
def optimized_preprocess(text): ## Takes in a list of texts, i.e. the entire corpus
    # Tokenize using spaCy’s tokenizer
    tokens = [token.text.lower() for token in spacy_tokenizer(text) if token.text.isalpha() and token.text.lower() not in stop_words_set]
    cleaned_query= ' '.join(word for word in tokens)
    return cleaned_query

In [9]:
# Load unseen dataset for testing
test_df = pd.read_csv('../data/train.csv')

# Seperate fake and true articles with random sample size variable
sample_size = 100
fake_df = test_df[test_df['label'] == 1].sample(sample_size)
true_df = test_df[test_df['label'] == 0].sample(sample_size)

In [10]:
# Display fake df
fake_df

Unnamed: 0,id,title,author,text,label
16655,16655,Trump’s Camp Manager DESTROYS Hillary By Point...,Amanda Shea,Trump’s Camp Manager DESTROYS Hillary By Point...,1
19631,19631,October Surprise: ABC Uncovers “Millions” of P...,Colin Taylor,Comments \nRepublican nominee Donald Trump has...,1
3360,3360,The Pathologization of Dissent,CJ Hopkins,Photo by Jamelle Bouie | CC BY 2.0 \n\nAccordi...,1
6160,6160,8 Ways To Forge Strength Through Challenges,Corey Savage,8 Ways To Forge Strength Through Challenges ...,1
12017,12017,,patriots.bonfire,All eyes on Electoral delegates. The People kn...,1
...,...,...,...,...,...
1820,1820,These Blast Points on Hillary’s Campaign… Only...,Charles Hugh Smith,\nThe Deep State’s most prescient elements mus...,1
1646,1646,“Nothing Good Can Come of This Election”–and T...,Charles Hugh Smith,"Posted on November 4, 2016 by Charles Hugh Smi...",1
11591,11591,"Obama Faults F.B.I. on Emails, Citing ‘Incompl...",Kaitlyn Stegall,"November 3, 2016 Obama Faults F.B.I. on Emails...",1
19574,19574,U.S. Takes A Stab At A No Fly Zone In Two Plac...,Brandon Turbeville,By Brandon Turbeville As the United States mar...,1


In [11]:
# Display true df
true_df

Unnamed: 0,id,title,author,text,label
17824,17824,Snap Is Said to Have Worked on a Drone - The N...,Katie Benner,Snap has long been known as the maker of Snapc...,0
14738,14738,"Nintendo Switch: A Blast at Home, So-So on the...",Brian X. Chen,There’s a new gadget that you can count on to ...,0
1639,1639,Poll: 96% of Trump Supporters Would Vote for H...,Joel B. Pollak,An ABC Post poll released on Sunday found th...,0
274,274,How the Fight for a National African-American ...,Graham Bowley,"Eleven years ago, Lonnie G. Bunch III was a mu...",0
13721,13721,Swedish Journalist Attacked in ’No Go Zone’,Chris Tomlinson,"A photographer for Dagens Nyheter, one of Swed...",0
...,...,...,...,...,...
12027,12027,Review: Emmy Awards Showcase TV’s Cultural Dom...,James Poniewozik,Our full report on the 2016 Emmy Awards | red...,0
9834,9834,CNN Op-Ed: Saudis Loved Melania Trump Because ...,Breitbart News,Bangladeshi pundit Anushay Hossain writes for ...,0
14864,14864,"Jane Fawcett, British Decoder Who Helped Doom ...",Bruce Weber,"Jane Fawcett, who was a reluctant London debut...",0
5844,5844,Armstrong and Green: What Does the March for S...,J. Scott Armstrong and Kesten C. Green,What is the “Scientific method”? [Saturday’s M...,0


In [None]:
# Run test on fake_df 
# -----WITH PREPROCESS FUNCTION-----

# Create empty list to store results
fake_check_list = []

# Iterate through texts in the fake_df
for text in fake_df.text:
    # store preprocessed text in random_input variable
    random_input = optimized_preprocess(text)
    # tokenize the random input and return as a tensor
    random_input = tokenizer.encode_plus(
        random_input,
        add_special_tokens=True,
        max_length=300,
        truncation = True,
        padding='max_length', 
        return_attention_mask=True,
        return_tensors="tf"
    )
    
    # Extract 'input_ids' from the 'random_input' dictionary.
    # (numerical representations of textual input for the model)
    input_ids = random_input['input_ids']
    
    # Extract the 'attention_mask' from the 'random_input' dictionary. 
    # This mask helps the model focus on relevant parts of the input.
    attention_mask = random_input['attention_mask']

    # Pass the 'input_ids' and 'attention_mask' to the model to get predictions. 
    # The model uses these inputs to make predictions about the class of the input (Real or Fake).
    predictions = model(input_ids, attention_mask=attention_mask)
    
    # Apply the softmax function to the logits (raw outputs) of the model's predictions.
    # Softmax converts logits to probabilities, making them easier to interpret.
    probabilities = tf.nn.softmax(predictions.logits, axis=-1)

    # Find the class with the highest probability as the model's final prediction.
    # 'tf.argmax' returns the index of the highest value along the specified axis 
    # (-1 refers to the last axis).
    predicted_class = tf.argmax(probabilities, axis=-1).numpy()
    
    # Define the class names corresponding to the output of the model. 
    # In this case, the classes are 'Real' and 'Fake'.
    class_names = ['Real', 'Fake']
    
    # Append a summary message to the 'fake_check_list'.
    # This message includes the predicted class name and the associated probabilities for each class.
    # The predicted class is identified by indexing into 'class_names' using 'predicted_class[0]'.
    # Probabilities are converted to a numpy array for easier readability.
    fake_check_list.append([f"The article is predicted as: {class_names[predicted_class[0]]}", f"Probabilities per class: {probabilities.numpy()[0]}"])
    
# Print test results
fake_check_list

In [None]:
# Run test on true_df 
# -----WITH PREPROCESS FUNCTION-----

# refer to code block above for comments

real_check_list = []
for text in true_df.text:
    random_input = optimized_preprocess(text)
    random_input = tokenizer.encode_plus(
        random_input,
        add_special_tokens=True,
        max_length=300,
        truncation = True,
        padding='max_length', 
        return_attention_mask=True,
        return_tensors="tf"
    )
    input_ids = random_input['input_ids']
    attention_mask = random_input['attention_mask']


    predictions = model(input_ids, attention_mask=attention_mask)
    probabilities = tf.nn.softmax(predictions.logits, axis=-1)


    predicted_class = tf.argmax(probabilities, axis=-1).numpy()
    class_names = ['Real', 'Fake']
    real_check_list.append([f"The article is predicted as: {class_names[predicted_class[0]]}", f"Probabilities per class: {probabilities.numpy()[0]}"])
real_check_list

In [14]:
# RUN THIS CELL ONLY ONCE

# Create an empty dict to store results
result_dict = {}

# Start at test #1
test_num = 1

In [15]:
# Load test results in dict
result_dict[f'test_{test_num}'] = {
    # Sub dict for fake article test
    'fake_examples': {
        # Save indices of tested articles in list form
        'indices': fake_df.index.tolist(),
        # Save scores for test
        'scores': fake_check_list
    },
    # Sub dict for true article test
    'true_examples': {
        # Save indices of tested articles in list form
        'indices': true_df.index.tolist(),
        # Save scores for test
        'scores': real_check_list
    }
}
# Increase test_num by one to prepare for next test
test_num += 1

In [16]:
# Check keys of Dict to see how many tests have been stored
result_dict.keys()


dict_keys(['test_1'])

In [17]:
# Calculate fake_df test accuracy
# ------WITH PREPROCESSING------
# Iterate through tests in the result_dict
for test in result_dict.keys():
    # Create a count variable to track results
    count = 0
    # Index to list of scores
    for result in result_dict[test]['fake_examples']['scores']:
        # For score in list of scores, if wrongly predicted:
        for row in result:
            if "Real" in row:
                # Add one to count
                count += 1
                # Print row that was incorrect
                print(result)

# Print the accuracy for the test in question in percentage format
print(f'Accuracy (Fake W/ Preprocessing) = {100 - count}%')
    

['The article is predicted as: Real', 'Probabilities per class: [0.8440879  0.15591207]']
['The article is predicted as: Real', 'Probabilities per class: [0.75991464 0.24008535]']
['The article is predicted as: Real', 'Probabilities per class: [0.57415354 0.42584652]']
['The article is predicted as: Real', 'Probabilities per class: [0.64537156 0.35462847]']
['The article is predicted as: Real', 'Probabilities per class: [0.8273343  0.17266572]']
['The article is predicted as: Real', 'Probabilities per class: [0.5161735  0.48382646]']
['The article is predicted as: Real', 'Probabilities per class: [0.5240606  0.47593936]']
Accuracy (Fake W/ Preprocessing) = 93%


In [18]:
# Calculate true_df test accuracy
# ------WITH PREPROCESSING------
# refer to above cell block for comments
for test in result_dict.keys():
    count = 0
    for result in result_dict[test]['true_examples']['scores']:
        for row in result:
            if "Fake" in row:
                count += 1
                print(result)
print(f'Accuracy (True W/ Preprocessing) = {100 - count}%')

['The article is predicted as: Fake', 'Probabilities per class: [0.23449449 0.7655055 ]']
['The article is predicted as: Fake', 'Probabilities per class: [0.11451195 0.88548803]']
['The article is predicted as: Fake', 'Probabilities per class: [0.48959827 0.51040167]']
['The article is predicted as: Fake', 'Probabilities per class: [0.25674927 0.74325067]']
['The article is predicted as: Fake', 'Probabilities per class: [0.09440463 0.9055954 ]']
['The article is predicted as: Fake', 'Probabilities per class: [0.39327615 0.6067239 ]']
['The article is predicted as: Fake', 'Probabilities per class: [0.44517723 0.5548228 ]']
Accuracy (True W/ Preprocessing) = 93%


In [None]:
# Run test on fake_df 
# -----WITHOUT PREPROCESS FUNCTION-----
# Refer to test cell block above for comments

no_preproc_fake_check_list = []
for text in fake_df.text:
    random_input = text
    random_input = tokenizer.encode_plus(
        random_input,
        add_special_tokens=True,
        max_length=300,
        truncation = True,
        padding='max_length', 
        return_attention_mask=True,
        return_tensors="tf"
    )
    input_ids = random_input['input_ids']
    attention_mask = random_input['attention_mask']


    predictions = model(input_ids, attention_mask=attention_mask)
    probabilities = tf.nn.softmax(predictions.logits, axis=-1)


    predicted_class = tf.argmax(probabilities, axis=-1).numpy()
    class_names = ['Real', 'Fake']
    no_preproc_fake_check_list.append([f"The article is predicted as: {class_names[predicted_class[0]]}", f"Probabilities per class: {probabilities.numpy()[0]}"])
no_preproc_fake_check_list

In [None]:
# Run test on true_df 
# -----WITHOUT PREPROCESS FUNCTION-----
# Refer to test cell block above for comments

no_preproc_real_check_list = []
for text in true_df.text:
    random_input = text
    random_input = tokenizer.encode_plus(
        random_input,
        add_special_tokens=True,
        max_length=300,
        truncation = True,
        padding='max_length', 
        return_attention_mask=True,
        return_tensors="tf"
    )
    input_ids = random_input['input_ids']
    attention_mask = random_input['attention_mask']


    predictions = model(input_ids, attention_mask=attention_mask)
    probabilities = tf.nn.softmax(predictions.logits, axis=-1)


    predicted_class = tf.argmax(probabilities, axis=-1).numpy()
    class_names = ['Real', 'Fake']
    no_preproc_real_check_list.append([f"The article is predicted as: {class_names[predicted_class[0]]}", f"Probabilities per class: {probabilities.numpy()[0]}"])
no_preproc_real_check_list

In [21]:
# RUN THIS CELL ONLY ONCE

# Create an empty dict to store results
no_preproc_result_dict = {}

# Start at test #1
test_num_no_preproc = 1

In [22]:
# Store test results
# refer to result dict cell block above for comments
no_preproc_result_dict[f'test_{test_num}'] = {
    'fake_examples': {
        'indices': fake_df.index.tolist(),
        'scores': no_preproc_fake_check_list
    },
    'true_examples': {
        'indices': true_df.index.tolist(),
        'scores': no_preproc_real_check_list
    }
}
test_num_no_preproc += 1

In [23]:
# Calculate fake_df test accuracy
# ------WITHOUT PREPROCESSING------
# refer to test accuracy cell above for comments

for test in no_preproc_result_dict.keys():
    count = 0
    for result in no_preproc_result_dict[test]['fake_examples']['scores']:
        for row in result:
            if "Real" in row:
                count += 1
                print(result)
print(f'Accuracy (Fake NO Preprocessing) = {100 - count}%')

['The article is predicted as: Real', 'Probabilities per class: [0.56449354 0.43550646]']
['The article is predicted as: Real', 'Probabilities per class: [0.5453537  0.45464623]']
['The article is predicted as: Real', 'Probabilities per class: [0.94858193 0.05141811]']
['The article is predicted as: Real', 'Probabilities per class: [0.7990511  0.20094892]']
['The article is predicted as: Real', 'Probabilities per class: [0.6760489 0.3239511]']
['The article is predicted as: Real', 'Probabilities per class: [0.5590448  0.44095516]']
['The article is predicted as: Real', 'Probabilities per class: [0.59428835 0.4057117 ]']
['The article is predicted as: Real', 'Probabilities per class: [0.7882812  0.21171874]']
['The article is predicted as: Real', 'Probabilities per class: [0.7903143  0.20968564]']
['The article is predicted as: Real', 'Probabilities per class: [0.85688585 0.14311416]']
Accuracy (Fake NO Preprocessing) = 90%


In [24]:
# Calculate true_df test accuracy
# ------WITHOUT PREPROCESSING------
# refer to test accuracy cell above for comments

for test in no_preproc_result_dict.keys():
    count = 0
    for result in no_preproc_result_dict[test]['true_examples']['scores']:
        for row in result:
            if "Fake" in row:
                count += 1
                print(result)
print(f'Accuracy (True NO Preprocessing)= {100 - count}%')

['The article is predicted as: Fake', 'Probabilities per class: [0.24672669 0.7532733 ]']
['The article is predicted as: Fake', 'Probabilities per class: [0.14297141 0.8570286 ]']
Accuracy (True NO Preprocessing)= 98%
