# Using Logistic Regression to Predict Fake News Headlines

*   The model uses logistic regression to predict whether a news headline is REAL or FAKE



In [28]:
import json

# Download the model.json
!gdown 1eiPXzk4omal2bv4nfy8B7SwC5eskY9Da

with open('model.json') as f:
    model = json.load(f)

Downloading...
From: https://drive.google.com/uc?id=1eiPXzk4omal2bv4nfy8B7SwC5eskY9Da
To: /content/model.json
  0% 0.00/309k [00:00<?, ?B/s]100% 309k/309k [00:00<00:00, 98.6MB/s]


In [29]:
# Pull out the three model fields that the prediction helpers read as globals:
#   vocabulary - words; a word's position is its feature index
#   weights    - per-word weights, indexed like the vocabulary
#   biases     - intercept added to the weighted sum
vocabulary, weights, biases = model['vocabulary'], model['weights'], model['biases']

In [127]:
import numpy as np
import pandas as pd
import re
from typing import List


def cleaned_word_list(headline: str) -> List[str]:
    """
    Split a news headline into normalized word tokens

    Processing steps:
        - split on runs of whitespace, en dashes (–) and em dashes (—)
        - strip every character that is neither a word character
          (letter, digit, underscore) nor whitespace; an ASCII hyphen is
          removed here, so 'well-known' becomes 'wellknown'
        - convert to lowercase
        - discard tokens that end up empty

    Parameters:
        headline (str): The news headline to clean

    Returns:
        list: The cleaned words, in their original order
    """
    normalized_tokens = (
        re.sub(r'[^\w\s]', '', token).lower()
        for token in re.split(r'[\s–—]+', headline)
    )
    # Empty strings come from leading/trailing separators or from tokens
    # that consisted of punctuation only.
    return [token for token in normalized_tokens if token]


def one_hot_encode(headline: str, vocabulary: list) -> np.ndarray:
    """
    Encode a news headline as a binary word-presence vector

    Position i of the result is 1 when vocabulary[i] occurs in the cleaned
    headline and 0 otherwise. A repeated word still yields 1 (presence, not
    a count), and words that are not in the vocabulary are ignored.

    Parameters:
        headline (str): The news headline to encode
        vocabulary (list): The vocabulary of the model; a word's position
            in this list is its position in the returned vector

    Returns:
        numpy.ndarray: Float vector of length len(vocabulary)
    """
    one_hot_vector = np.zeros(len(vocabulary))

    for word in cleaned_word_list(headline):
        try:
            # One index() scan both checks membership and locates the word,
            # rather than scanning once for `in` and again for index().
            word_index = vocabulary.index(word)
        except ValueError:
            # Out-of-vocabulary word: contributes nothing to the vector
            continue
        one_hot_vector[word_index] = 1

    return one_hot_vector


def linear_regression(feature_vector: np.ndarray, weights: np.ndarray, biases: float) -> float:
    """
    Calculate the linear score of an encoded news headline

    Computes dot(feature_vector, weights) + biases. This is the raw score
    that the logistic function later turns into a probability.

    Parameters:
        feature_vector (numpy.ndarray): The encoded headline
        weights (numpy.ndarray): The weights of the model; any array-like
            accepted by numpy.dot works (for example a plain list)
        biases (float): The intercept added to the weighted sum

    Returns:
        float: The linear score of the headline
    """
    weighted_sum = np.dot(feature_vector, weights)
    return weighted_sum + biases


def logistic_function(linear_regression_value: float) -> float:
    """
    Map a linear score onto the open interval (0, 1) with the logistic
    (sigmoid) function: 1 / (1 + e^(-x))

    Parameters:
        linear_regression_value (float): The linear score

    Returns:
        float: The predicted probability of the headline being REAL
    """
    negated_score = -1 * linear_regression_value
    denominator = 1 + np.exp(negated_score)
    return 1 / denominator


def predict(headline: str) -> float:
    """
    Predict the probability of a news headline being REAL

    Encodes the headline against the module-level `vocabulary`, scores it
    with the module-level `weights` and `biases`, and passes that score
    through the logistic function.

    Parameters:
        headline (str): The news headline to predict

    Returns:
        float: The predicted probability of the headline being REAL
    """
    feature_vector = one_hot_encode(headline, vocabulary)
    linear_score = linear_regression(feature_vector, weights, biases)
    return logistic_function(linear_score)


def print_word_weights(headline: str) -> None:
    """
    Display a table of the model weight of each word in a news headline

    Each distinct cleaned word appears once. Words that are not in the
    module-level `vocabulary` are listed with a weight of 0. The table is
    sorted from the highest weight to the lowest.

    Parameters:
        headline (str): The news headline to print the word weights of
    """
    # A dict keeps one entry per distinct word, in first-seen order
    weight_by_word = {
        token: weights[vocabulary.index(token)] if token in vocabulary else 0
        for token in cleaned_word_list(headline)
    }

    weight_table = pd.DataFrame(weight_by_word.items(), columns=['word', 'weight'])
    display(weight_table.sort_values(by='weight', ascending=False))


def print_result(probability: float) -> None:
    """
    Print the verdict for a news headline, followed by a separator line

    A probability strictly above 0.5 is reported as REAL with that
    probability as a percentage; anything else is reported as FAKE with
    the complementary percentage.

    Parameters:
        probability (float): The predicted probability of the headline being REAL
    """
    is_real = probability > 0.5

    if is_real:
        verdict = f'This headline is {( probability * 100 ):.1f}% REAL'
    else:
        fake_probability = 1 - probability
        verdict = f'This headline is {( fake_probability * 100 ):.1f}% FAKE'

    print(verdict)
    print('-' * 27)

# News Headline Input

In [132]:
# Headline to classify. The trailing `#@param` marker looks like a Colab
# form-field annotation and must stay on the same line as the assignment.
headline = 'Course Synthesis 2: Submission No Longer Required' #@param {type: 'string'}

# Score the headline, print the verdict, then show the per-word weights.
probability = predict(headline)
print_result(probability)
print_word_weights(headline)

This headline is 71.7% REAL
---------------------------


Unnamed: 0,word,weight
2,2,0.503068
4,no,0.419108
0,course,0.329493
5,longer,0.025705
1,synthesis,0.0
3,submission,0.0
6,required,-0.053783
