# Using Logistic Regression to Predict Fake News Headlines

## Global Parameters

In [None]:
# Training hyperparameters. The trailing `#@param` markers are notebook
# form-field annotations and are left untouched.
# n_epoch: number of passes over the training set handed to logistic_regression.
# learning_rate: step size applied to the gradient in each update.
n_epoch = 200 #@param {type:"integer"}
learning_rate = 0.001 #@param {type:"number"}

## Libraries Import

In [None]:
# Import libraries
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Data Preparation

### Read datasets from csv files

In [4]:
# Local file names of the cleaned train / test splits
train_csv_file = 'train_clean.csv'
test_csv_file = 'test_clean.csv'

# Download both CSVs by Google Drive file id (needs the `gdown` CLI);
# the files land in the working directory under the names above.
!gdown 1UHxw14fjz_ARraW4dc2im9e0WUwnQL5m
!gdown 1ZsI_uh5mEIZCuT-W11CdS2C3dAWzwz8A

# Read datasets from .csv files
train_df = pd.read_csv(train_csv_file)
test_df = pd.read_csv(test_csv_file)

Downloading...
From: https://drive.google.com/uc?id=1UHxw14fjz_ARraW4dc2im9e0WUwnQL5m
To: /content/train_clean.csv
100% 308k/308k [00:00<00:00, 123MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ZsI_uh5mEIZCuT-W11CdS2C3dAWzwz8A
To: /content/test_clean.csv
100% 133k/133k [00:00<00:00, 97.7MB/s]


In [5]:
# Force the 'title' column of both splits to str so that .split() is
# always available when the vocabulary is built.
for frame in (train_df, test_df):
    frame['title'] = frame['title'].astype(str)

### One-Hot Encoding

In [6]:
# Combine all 'titles' from train and test
all_titles = pd.concat([train_df['title'], test_df['title']])

# Extract all unique words in first-seen order. dict.fromkeys de-duplicates
# with O(1) lookups, avoiding a full list scan for every word.
all_words = list(dict.fromkeys(
    word for title in all_titles for word in title.split()
))

# Column position of every vocabulary word
word_to_column = {word: column for column, word in enumerate(all_words)}


def _one_hot_encode(titles, index, word_to_column):
    """
    Build a 0/1 word-presence frame with one row per title.

    Parameters:
        titles: iterable of whitespace-separated title strings
        index: index to give the resulting DataFrame
        word_to_column: dict mapping each vocabulary word to its column position

    Returns:
        An integer DataFrame of shape (len(titles), len(word_to_column))
    """
    encoded = np.zeros((len(titles), len(word_to_column)), dtype=np.int64)
    for row, title in enumerate(titles):
        # Repeated words simply set the same cell to 1 again
        columns = [word_to_column[word] for word in title.split()]
        encoded[row, columns] = 1
    return pd.DataFrame(encoded, index=index, columns=list(word_to_column))


# One-Hot Encoding
train_one_hot = _one_hot_encode(train_df['title'], train_df.index, word_to_column)
test_one_hot = _one_hot_encode(test_df['title'], test_df.index, word_to_column)

# Concatenate One-Hot encoded columns.
# NOTE: this rebinds train_df / test_df to frames without a 'title' column,
# so this cell cannot be re-run without reloading the CSVs first.
train_df = pd.concat([train_one_hot, train_df['label']], axis=1)
test_df = pd.concat([test_one_hot, test_df['label']], axis=1)

# Convert labels 'REAL' to 1, and 'FAKE' to 0
train_df = train_df.replace({'label': {'REAL': 1, 'FAKE': 0}})
test_df = test_df.replace({'label': {'REAL': 1, 'FAKE': 0}})

In [7]:
# Show (rows, columns) of each encoded split, one per line
print(train_df.shape, test_df.shape, sep="\n")

(4435, 10414)
(1900, 10414)


In [8]:
# Inspect the encoded training frame: one column per vocabulary word,
# followed by the numeric 'label' column.
train_df

Unnamed: 0,going,back,to,the,future,in,2016,dem,insiders,sanders,...,moby,investment,gasp,congratulate,adequate,evacuations,churkin,unpredictability,symphysiotomy,label
0,1,1,1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4430,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4431,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4432,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4433,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Inspect the encoded test frame; it has the same columns as train_df
# because both were built from the shared vocabulary.
test_df

Unnamed: 0,going,back,to,the,future,in,2016,dem,insiders,sanders,...,moby,investment,gasp,congratulate,adequate,evacuations,churkin,unpredictability,symphysiotomy,label
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1896,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1897,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1898,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Storing Data as Variables

In [10]:
# Features are every column except the last; labels are the last column.
# Each slice is converted straight to a NumPy array.

# Training split
X_train = train_df.iloc[:, :-1].to_numpy()
y_train = train_df.iloc[:, -1].to_numpy().flatten()

# Test split
X_test = test_df.iloc[:, :-1].to_numpy()
y_test = test_df.iloc[:, -1].to_numpy().flatten()

In [11]:
# Rows of X_train are samples, columns are vocabulary features
n_samples = X_train.shape[0]
n_features = X_train.shape[1]

print("Samples: ", n_samples)
print("Features: ", n_features)

Samples:  4435
Features:  10413


## Train the Model

### Logistic Function

In [12]:
def logistic(z):
    """
    Function to compute the value of the logistic function 1 / (1 + e^-z)

    The exponential is only ever taken of a non-positive number, so large
    |z| cannot overflow (the naive form overflows in exp(-z) for very
    negative z).

    Parameters:
        z: a scalar value or array of values

    Returns:
        The value of the logistic function: a NumPy scalar for scalar input,
        otherwise an array with the same shape as z
    """
    z = np.asarray(z, dtype=float)

    # exp(-|z|) lies in (0, 1], so it can underflow to 0 but never overflow
    decay = np.exp(-np.abs(z))

    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    # Indexing with () unwraps a 0-d array to a scalar and leaves
    # higher-dimensional arrays unchanged
    return result[()]

### Stochastic Gradient Descent

In [13]:
def predict(X, w, b):
    """
    Function to predict the probability of class label 1

    Parameters:
        X: features, one of
            - a single sample of shape (n_features,)
            - a column-stacked batch of shape (n_features, k)
            - a row batch of shape (n_samples, n_features)
           A square (n_features, n_features) array is ambiguous and is
           treated as column-stacked.
        w: weight vector, a numpy array of shape (n_features)
        b: bias, a scalar value

    Returns:
        The probability of the class label 1: a scalar for a single sample,
        otherwise one probability per sample
    """
    X = np.asarray(X)
    w = np.asarray(w)

    # Row batches only match w along their last axis; np.dot(w, X) would
    # raise on them, so they get their own branch.
    is_row_batch = (
        w.ndim == 1
        and X.ndim == 2
        and X.shape[0] != w.shape[0]
        and X.shape[1] == w.shape[0]
    )

    # Compute the value of the logistic function
    if is_row_batch:
        z = X @ w + b
    else:
        z = np.dot(w, X) + b
    p = logistic(z)
    return p

def stochastic_gradient_descent(X, y, w, b, learning_rate):
    """
    Function to perform one gradient-descent update of w and b

    Despite the name, the gradient is summed over ALL samples before a single
    update is applied (a full-batch step). The sum is not divided by the
    number of samples.

    Parameters:
        X: feature matrix, a numpy array of shape (n_samples, n_features)
        y: class label, a numpy array of shape (n_samples)
        w: weight vector, a numpy array of shape (n_features)
        b: bias, a scalar value
        learning_rate: a scalar value

    Returns:
        w_new: new weight vector, a numpy array of shape (n_features)
        b_new: new bias, a scalar value
    """
    # Work in float so the matrix products are purely numeric
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # Prediction error (p_i - y_i) for every sample at once
    residual = logistic(X @ w + b) - y

    # Gradient of the loss with respect to w: sum_i (p_i - y_i) * x_i
    grad_w = X.T @ residual

    # Gradient of the loss with respect to b: sum_i (p_i - y_i)
    grad_b = residual.sum()

    # Update w and b
    w_new = w - learning_rate * grad_w
    b_new = b - learning_rate * grad_b

    return w_new, b_new

### Loss Function

In [14]:
def loss(X, y, w, b):
    """
    Function to compute the mean binary cross-entropy loss

    Parameters:
        X: feature matrix, a numpy array of shape (n_samples, n_features)
        y: class label, a numpy array of shape (n_samples)
        w: weight vector, a numpy array of shape (n_features)
        b: bias, a scalar value

    Returns:
        The value of the loss, averaged over the samples

    Raises:
        ZeroDivisionError: if X contains no samples
    """
    # Number of samples
    n_samples = X.shape[0]
    if n_samples == 0:
        raise ZeroDivisionError("loss is undefined for an empty sample set")

    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # Keeps the argument of np.log strictly positive when p reaches 0 or 1
    epsilon = 1e-10

    # Probability of label 1 for every sample at once
    p = logistic(X @ w + b)

    # Per-sample cross-entropy: -y*log(p) - (1-y)*log(1-p)
    per_sample = -y * np.log(p + epsilon) - (1 - y) * np.log(1 - p + epsilon)

    return per_sample.sum() / n_samples

### Accuracy Function

In [40]:
def accuracy(X, y_actual, w, b):
    """
    Function to compute the accuracy

    A sample is predicted as label 1 when its probability is >= 0.5,
    otherwise as label 0.

    Parameters:
        X: feature matrix, a numpy array of shape (n_samples, n_features)
        y_actual: actual class label, a numpy array of shape (n_samples)
        w: weight vector, a numpy array of shape (n_features)
        b: bias, a scalar value

    Returns:
        The fraction of samples whose predicted label equals y_actual

    Raises:
        ZeroDivisionError: if y_actual contains no samples
    """
    # Number of samples (taken from the labels; only that many rows of X are used)
    n_samples = y_actual.shape[0]
    if n_samples == 0:
        raise ZeroDivisionError("accuracy is undefined for an empty sample set")

    features = np.asarray(X[:n_samples], dtype=float)

    # Probability of label 1 for every sample, thresholded at 0.5
    probabilities = logistic(features @ w + b)
    predicted_labels = (probabilities >= 0.5).astype(int)

    # Mean of the element-wise matches is the accuracy
    return np.mean(predicted_labels == np.asarray(y_actual))

### Use Logistic Regression Algorithm with Stochastic Gradient Descent

In [43]:
def logistic_regression(X, y, w, b, learning_rate, n_epochs):
    """
    Function to perform logistic regression

    Runs n_epochs gradient updates, printing the loss and accuracy roughly
    ten times along the way and once more after the last update.

    Parameters:
        X: feature matrix, a numpy array of shape (n_samples, n_features)
        y: class label, a numpy array of shape (n_samples)
        w: initial weight vector, a numpy array of shape (n_features)
        b: initial bias, a scalar value
        learning_rate: a scalar value
        n_epochs: number of epochs, an integer

    Returns:
        w: trained weight vector, a numpy array of shape (n_features)
        b: trained bias, a scalar value
    """
    # Print every (n_epochs / 10) epochs, or every epoch for short runs.
    # Derived from the n_epochs argument, not from any notebook-level global.
    print_interval = max(n_epochs // 10, 1)

    # For all epochs
    for epoch in range(n_epochs):
        # Metrics are reported BEFORE the update of this epoch is applied
        if epoch % print_interval == 0:
            loss_value = loss(X, y, w, b)
            accuracy_value = accuracy(X, y, w, b)
            print(f"Epoch: {epoch}/{n_epochs}, Loss: {loss_value}, Accuracy: {accuracy_value}")

        # Perform one gradient update
        w, b = stochastic_gradient_descent(X, y, w, b, learning_rate)

    # Print the metrics of the final model (skipped when nothing was trained)
    if n_epochs > 0:
        loss_value = loss(X, y, w, b)
        accuracy_value = accuracy(X, y, w, b)
        print(f"Final Loss: {loss_value}", f"Final Accuracy: {accuracy_value}")

    return w, b

In [44]:
# Start from an all-zero model: one weight per feature column and a zero bias
w, b = np.zeros(X_train.shape[1]), 0.0

# Train on the training split with the notebook-level hyperparameters
w, b = logistic_regression(
    X_train,
    y_train,
    w,
    b,
    learning_rate=learning_rate,
    n_epochs=n_epoch,
)

Epoch: 0/200, Loss: 0.6931471803599981, Accuracy: 0.5005636978579482
Epoch: 20/200, Loss: 0.5301951799578559, Accuracy: 0.8229988726042841
Epoch: 40/200, Loss: 0.4676718789010899, Accuracy: 0.8507328072153326
Epoch: 60/200, Loss: 0.42893366142624206, Accuracy: 0.8649379932356257
Epoch: 80/200, Loss: 0.4006246316441805, Accuracy: 0.8744081172491545
Epoch: 100/200, Loss: 0.3781757272122597, Accuracy: 0.882750845546787
Epoch: 120/200, Loss: 0.3595097264180512, Accuracy: 0.8895152198421646
Epoch: 140/200, Loss: 0.34350839579927156, Accuracy: 0.893348365276212
Epoch: 160/200, Loss: 0.3294981848953051, Accuracy: 0.8989853438556934
Epoch: 180/200, Loss: 0.31703977763597924, Accuracy: 0.9034949267192784
Final Loss: 0.30582925380842996 Final Accuracy: 0.9075535512965051


## Evaluate the Model

### Predict using Test Set

In [36]:
# Evaluate each row
def predicted_label(X, w, b):
    """
    Function to turn the model's probability for one row into a hard label

    Parameters:
        X: feature vector of a single row, a numpy array of shape (n_features)
        w: weight vector, a numpy array of shape (n_features)
        b: bias, a scalar value

    Returns:
        1 if the predicted probability of label 1 is at least 0.5, else 0
    """
    # Threshold the probability of label 1 at 0.5
    return 1 if predict(X, w, b) >= 0.5 else 0

# Hard 0/1 prediction for every row of each split. One row of X is
# consumed per available label, matching the length of y.
y_train_pred = [predicted_label(row, w, b) for row in X_train[:y_train.shape[0]]]
y_test_pred = [predicted_label(row, w, b) for row in X_test[:y_test.shape[0]]]

### Calculate Loss

In [37]:
# Calculate loss on both splits with the current w and b.
# NOTE(review): this must run after the training cell so that w / b hold the
# trained values; a loss of about 0.6931 (ln 2) on both splits is what
# zero-initialised weights produce.
train_loss = loss(X_train, y_train, w, b)
test_loss = loss(X_test, y_test, w, b)

# Report to four decimal places
print(f"Train Loss: {train_loss:.4f}")
print(f"Test Loss: {test_loss:.4f}")

Train Loss: 0.6931
Test Loss: 0.6931


### Calculate Metrics

In [38]:
# Calculate metrics
def calculate_metrics(y_true, y_pred):
    '''
    Calculate accuracy, precision, recall, and f1

    Label 1 is the positive class and label 0 the negative class. A ratio
    whose denominator is zero (e.g. precision when nothing was predicted
    positive, or any metric on empty input) is reported as 0.0 instead of
    raising ZeroDivisionError.

    Parameters:
        y_true: true labels, a numpy array of shape (n_samples)
        y_pred: predicted labels, a numpy array of shape (n_samples)

    Returns:
        accuracy: accuracy, a scalar value
        precision: precision, a scalar value
        recall: recall, a scalar value
        f1_score: f1 score, a scalar value
        true_positive: true positive, a scalar value
        false_positive: false positive, a scalar value
        true_negative: true negative, a scalar value
        false_negative: false negative, a scalar value
    '''
    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0
        return numerator / denominator if denominator else 0.0

    tp = 0 # true positive
    fp = 0 # false positive
    tn = 0 # true negative
    fn = 0 # false negative

    # Pairs whose values are not 0/1 are not counted in any cell
    for i in range(len(y_true)):
        actual, predicted = y_true[i], y_pred[i]
        if predicted == 1:
            if actual == 1:
                tp += 1
            elif actual == 0:
                fp += 1
        elif predicted == 0:
            if actual == 1:
                fn += 1
            elif actual == 0:
                tn += 1

    accuracy = _ratio(tp + tn, tp + fp + tn + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * (precision * recall), precision + recall)

    return accuracy, precision, recall, f1_score, tp, fp, tn, fn

# Scores and confusion-matrix counts for each split, computed from the hard
# 0/1 predictions in y_train_pred / y_test_pred.
train_accuracy, train_precision, train_recall, train_f1_score, train_tp, train_fp, train_tn, train_fn = calculate_metrics(y_train, y_train_pred)
test_accuracy, test_precision, test_recall, test_f1_score, test_tp, test_fp, test_tn, test_fn = calculate_metrics(y_test, y_test_pred)

In [39]:
# Scores on the training split, to four decimal places
print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Train Precision: {train_precision:.4f}")
print(f"Train Recall: {train_recall:.4f}")
print(f"Train F1 Score: {train_f1_score:.4f}")
print()
# Scores on the test split
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1 Score: {test_f1_score:.4f}")
print()
# Confusion-matrix counts on the training split (label 1 is the positive class)
print(f"Train True Positive: {train_tp}")
print(f"Train False Positive: {train_fp}")
print(f"Train True Negative: {train_tn}")
print(f"Train False Negative: {train_fn}")
print()
# Confusion-matrix counts on the test split
print(f"Test True Positive: {test_tp}")
print(f"Test False Positive: {test_fp}")
print(f"Test True Negative: {test_tn}")
print(f"Test False Negative: {test_fn}")

Train Accuracy: 0.5006
Train Precision: 0.5006
Train Recall: 1.0000
Train F1 Score: 0.6672

Test Accuracy: 0.5005
Test Precision: 0.5005
Test Recall: 1.0000
Test F1 Score: 0.6671

Train True Positive: 2220
Train False Positive: 2215
Train True Negative: 0
Train False Negative: 0

Test True Positive: 951
Test False Positive: 949
Test True Negative: 0
Test False Negative: 0


## Analyze the Model

In [None]:
# Learned bias term
print(f"bias: {b:.4f}")

# An all-zero feature vector stands for a headline with no vocabulary word,
# so only the bias contributes to the prediction
no_words_vector = np.zeros(X_train.shape[1])
prob_all_0 = predict(no_words_vector, w, b)
print(f"Probability of an empty string: {prob_all_0:.4f}")

# An all-one feature vector stands for a headline containing every
# vocabulary word, so every weight contributes
every_word_vector = np.ones(X_train.shape[1])
prob_all_1 = predict(every_word_vector, w, b)
print(f"Probability of a string containing all words: {prob_all_1:.4f}")

In [None]:
# Rank the vocabulary by learned weight to find the most telling words
n = 10

# zip stops at the shorter input, so the trailing 'label' column of train_df
# is never paired with a weight
word_dict = dict(zip(train_df.columns, w))

# Largest weight first
descending_items = sorted(word_dict.items(), key=lambda item: item[1], reverse=True)
sorted_word_dict = dict(descending_items)

# Get the top n important words (largest weights, strongest first)
top_n_word_dict = dict(descending_items[:n])

# Get the bottom n important words (smallest weights, strongest first).
# Slicing an ascending sort with [:n] also stays empty for n == 0, whereas
# [-n:] would return the whole list.
ascending_items = sorted(word_dict.items(), key=lambda item: item[1])
bottom_n_word_dict = dict(ascending_items[:n])

# Find the n words closest to 0
closest_to_0_dict = dict(sorted(word_dict.items(), key=lambda item: abs(item[1]))[:n])

In [None]:
def _print_word_weights(heading, word_weights):
    """Print a heading followed by one 'word weight' row per entry."""
    print(heading)
    for word, value in word_weights.items():
        # Word padded to 20 characters, weight to four decimal places
        print(f"{word.ljust(20)}{value:.4f}")


# Largest weights
_print_word_weights(f"Top {n} Words in Real News:", top_n_word_dict)

print()

# Smallest weights
_print_word_weights(f"Top {n} Words in Fake News:", bottom_n_word_dict)

print()

# Weights nearest to zero
_print_word_weights(f"Top {n} Indifferent Words:", closest_to_0_dict)

## Predict Whether a Sample News is Real or Fake

### Prediction Function

In [None]:
def predict_from_input(input):
    '''
    Predict whether a sample news is real or fake and print the verdict.

    The headline is split on whitespace and en/em dashes; every token is
    stripped of punctuation and lower-cased, and tokens that are not in the
    training vocabulary are ignored.

    Parameters:
        input (str): The input news. (The name shadows the builtin `input`;
            it is kept so existing keyword callers keep working.)

    Returns:
        None
    '''
    # Column position of every vocabulary word, for O(1) lookups
    vocabulary_index = {word: position for position, word in enumerate(all_words)}

    # Encode the headline directly as a numeric 0/1 vector
    sample = np.zeros(len(all_words))

    for token in re.split(r'[\s–—]+', input):
        # Remove punctuation using regex and convert to lower case
        token = re.sub(r'[^\w\s]', '', token).lower()

        # Mark the word if it is in the vocabulary
        position = vocabulary_index.get(token)
        if position is not None:
            sample[position] = 1

    # Predict the probability of label 1 (REAL) for the input
    predicted_prob = predict(sample, w, b)
    if predicted_prob >= 0.5:
        print(f"\t{( predicted_prob * 100 ):.1f}% REAL")

    else:
        print(f"\t{( ( 1 - predicted_prob ) * 100 ):.1f}% FAKE")
    print()

### News Headline Input

In [None]:
# Empty headline: no vocabulary word is present, so the prediction
# depends on the bias alone
input_1 = "" #@param {type:"string"}
predict_from_input(input_1)

In [None]:
# Headline built from words meant to be neutral
# (presumably picked from the near-zero-weight words -- confirm)
input_2 = "Uranium Forum Eluded Friday Meltdown" #@param {type:"string"}
predict_from_input(input_2)

In [None]:
# The same neutral headline prefixed with words meant to push the
# prediction towards REAL
input_3 = "Republicans Debate: Uranium Forum Eluded Friday Meltdown" #@param {type:"string"}
predict_from_input(input_3)

In [None]:
# The same neutral headline prefixed with words meant to push the
# prediction towards FAKE
input_4 = "Breaking News! Uranium Forum Eluded Friday Meltdown" #@param {type:"string"}
predict_from_input(input_4)

## Save Model to JSON

In [None]:
# Create a dictionary to save the model: the hyperparameters used, the
# vocabulary (in weight order) and the learned parameters
model_dict = {
    "model_name": "Predicting Fake News",
    "model_type": "Logistic Regression",
    "learning_rate": learning_rate,
    "epochs": n_epoch,
    "vocabulary": all_words,
    "weights": w.tolist(),
    # float() accepts both a NumPy scalar and a plain Python float (b is
    # still a Python float if no update was ever applied)
    "biases": float(b)
}

# Save the dictionary to a JSON file
with open("model.json", "w") as f:
    json.dump(model_dict, f)