In [5]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# import necessary liberaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,Input,Embedding,Bidirectional, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
import nltk
import random

# Problem 1: Fake News Detection Using Feed-forward Neural Network (No Embeddings)

In [3]:
# specify file paths
fake_news = '/content/drive/MyDrive/FakeNews.csv'
true_news = '/content/drive/MyDrive/TrueNews.csv'


In [6]:
# generate them into pd dataframes
fake_df = pd.read_csv(fake_news)
true_df = pd.read_csv(true_news)
# combine them into one and label them with 1 if true, 0 if false
fake_df['label'] = 0
true_df['label'] = 1
# combine the dataframes together
df = pd.concat([fake_df, true_df], ignore_index=True)
display(df.head())

Unnamed: 0,Title,Text,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,0


In [5]:
# combine the title and text into one column, concatenate cols
df['Text'] = df['Title'] + ' ' + df['Text']
# drop the title column
df.drop('Title', axis=1, inplace=True)

In [6]:
display(df.head())

Unnamed: 0,Text,label
0,Donald Trump Sends Out Embarrassing New Year’...,0
1,Drunk Bragging Trump Staffer Started Russian ...,0
2,Sheriff David Clarke Becomes An Internet Joke...,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,0
4,Pope Francis Just Called Out Donald Trump Dur...,0


In [7]:
# use TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000,stop_words='english', max_df=0.7)
x = tfidf_vectorizer.fit_transform(df['Text'])
y = df['label']


In [8]:
#print(x)
 # Iterate through the vocabulary and print the index, word, and count
#whatDoesVectorLookLike = []
#for word, index in tfidf_vectorizer.vocabulary_.items():
 #    count = x.sum(axis=0).A1[index]  # Count of the word in the corpus
  #   print(f"Index: {index}, Word: {word}, Count: {count}")
   #  whatDoesVectorLookLike.append( (index, word, count))
    # whatDoesVectorLookLike.sort()
     #for (i, w, c) in whatDoesVectorLookLike:
      #    print(i, w, c)

#firstRow = x[0]  # document 0
#print(firstRow.toarray())

In [8]:
# Set seeds for TensorFlow, NumPy, and Python random
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

In [9]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(X_train.shape)

(35918, 5000)


In [11]:
# use a MLP calssifer
clf = MLPClassifier(hidden_layer_sizes=(200, 100), max_iter=1000, random_state=42)
clf.fit(X_train, y_train)

In [13]:
# Evaluate the classifier
accuracy = clf.score(X_test, y_test)
print(f"Accuracy of classifying fake news vs real news using MLP: {accuracy}")

Accuracy of classifying fake news vs real news using MLP: 0.9859688195991091


# Problem 2: Spam Detection Using Feed-forward Neural Network (No Embeddings)

In [8]:
# import the spam dataset using pandas
spam_df = pd.read_csv('/content/drive/MyDrive/spam.csv', encoding='latin-1')
spam_df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
spam_df.columns = ['label', 'text']
spam_df['label'] = spam_df['label'].map({'ham': 0, 'spam': 1})

In [9]:
print(spam_df.head())

   label                                               text
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [12]:
# use TF-IDF vectorizer on spam df
tfidf_vectorizer = TfidfVectorizer(max_features=5000,stop_words='english', max_df=0.7)
x = tfidf_vectorizer.fit_transform(spam_df['text'])
y = spam_df['label']

In [24]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(X_train.shape)

(4457, 5000)


In [23]:
# start creating a custom neural network model
# from reading online sources: dont need many layers, simpler = better generalization
model = Sequential([
    # input size, number of tf-idf features(5000)
    # 1-3 hidden layers , i chose 2
    Input(shape=(5000,)),
     # larger layers if more data , powers of 2 , 128- 512 per layer
        # reulu works well for dense layers
    Dense(256, activation='relu'),
    # dropout between 0.3-0.5 to regularizee
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # since we are classifying real vs fake we use sigmoud as activation
    Dense(1, activation='sigmoid')
])
#print(model.summary())
#print("model created")

In [25]:
from re import X

# use binary crossentropy because its binary classification,
# instantiate the model
y_train = np.array(y_train)
y_test = np.array(y_test)
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])
print("model compiled")

model compiled


In [26]:
# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32)

Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 18ms/step - accuracy: 0.8557 - loss: 0.4241
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9829 - loss: 0.0815
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9925 - loss: 0.0271
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9978 - loss: 0.0101
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9993 - loss: 0.0044


<keras.src.callbacks.history.History at 0x7b396c4bdee0>

In [27]:
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9867 - loss: 0.0557
Test Loss: 0.08058644086122513, Test Accuracy: 0.9829596281051636


In [28]:
# claculate precision and accuracy
from sklearn.metrics import precision_score, recall_score, f1_score

# create a list of predictions
test_tags = list(model.predict(X_test))
test_tags = [0 if x < 0.5 else 1 for x in test_tags]


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [29]:
mtrx = nltk.ConfusionMatrix(y_test, test_tags)
print(mtrx)
# get precision and recall
precision = precision_score(y_test, test_tags)
recall = recall_score(y_test, test_tags)
f1 = f1_score(y_test, test_tags)
print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

  |   0   1   2 |
--+-------------+
0 |<957>  8   . |
1 |  11<139>  . |
2 |   .   .  <.>|
--+-------------+
(row = reference; col = test)

Precision: 0.9455782312925171, Recall: 0.9266666666666666, F1 Score: 0.936026936026936


# Problem 3: Spam Detection Using Feed-forward Neural Network (With Embeddings

In [10]:
# Parameters for tokenization and padding
max_words = 10000  # Maximum number of words this weill be the amount of nodes in one of the layers
max_sequence_length = 100  # how many words will we take from each text ? padding

# Tokenize the text data
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(spam_df['text'])
print("Tokenization done")

# Convert text to sequences, replace all words with their indices that tokenizer decides
sequences = tokenizer.texts_to_sequences(spam_df['text'])

# Pad the sequences that are too short - if email is shorter than 512
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# Prepare labels
labels = np.array(spam_df['label'])

print("Text tokenized, sequenced, and padded.")
print("Padded sequences shape:", padded_sequences.shape)
print("Labels shape:", labels.shape)

Tokenization done
Text tokenized, sequenced, and padded.
Padded sequences shape: (5572, 100)
Labels shape: (5572,)


In [11]:
# Split the data into training and testing sets
X_train_embed, X_test_embed, y_train_embed, y_test_embed = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

print("Data split into training and testing sets for embedding model.")
print("X_train_embed shape:", X_train_embed.shape)
print("X_test_embed shape:", X_test_embed.shape)
print("y_train_embed shape:", y_train_embed.shape)
print("y_test_embed shape:", y_test_embed.shape)

Data split into training and testing sets for embedding model.
X_train_embed shape: (4457, 100)
X_test_embed shape: (1115, 100)
y_train_embed shape: (4457,)
y_test_embed shape: (1115,)


In [12]:
# start creating a custom neural network model
# from reading online sources: dont need many layers, simpler = better generalization
# dimension
embedding_dim = 256
model = Sequential([
    # embedding layer: how many diff words in vocab, how many units are in the embedding dimension , input length is depreciated
    Embedding(input_dim=max_words, output_dim=embedding_dim,input_length=max_sequence_length),
    Flatten(),
    # input size, number of tf-idf features(5000)
    # 1-3 hidden layers , i chose 2
     # larger layers if more data , powers of 2 , 128- 512 per layer
        # reulu works well for dense layers
    Dense(256, activation='relu',kernel_regularizer=l2(0.001)),
    # dropout between 0.3-0.5 to regularizee , 50% of the weights will be ignored each training cycle
    Dropout(0.5),
    Dense(128, activation='relu',kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    # since we are classifying real vs fake we use sigmoud as activation
    Dense(1, activation='sigmoid')
])
#print(model.summary())
#print("model created")



In [13]:
# train the model
# use binary crossentropy because its binary classification,
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])
print("model compiled")

model compiled


In [14]:
model.fit(X_train_embed, y_train_embed, epochs=5, batch_size=32, validation_split=0.2)


Epoch 1/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 262ms/step - accuracy: 0.8745 - loss: 0.7296 - val_accuracy: 0.9854 - val_loss: 0.1881
Epoch 2/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 221ms/step - accuracy: 0.9921 - loss: 0.1464 - val_accuracy: 0.9798 - val_loss: 0.1976
Epoch 3/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 237ms/step - accuracy: 0.9882 - loss: 0.1590 - val_accuracy: 0.9865 - val_loss: 0.1288
Epoch 4/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 243ms/step - accuracy: 0.9988 - loss: 0.0707 - val_accuracy: 0.9843 - val_loss: 0.0905
Epoch 5/5
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 226ms/step - accuracy: 0.9999 - loss: 0.0461 - val_accuracy: 0.9821 - val_loss: 0.1064


<keras.src.callbacks.history.History at 0x7d030f41fb00>

In [15]:
# evaluate the model
loss, accuracy = model.evaluate(X_test_embed, y_test_embed)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.9795 - loss: 0.1191
Test Loss: 0.13801077008247375, Test Accuracy: 0.9775784611701965


In [16]:
# claculate precision and accuracy
from sklearn.metrics import precision_score, recall_score, f1_score
# Predict and flatten
test_preds = model.predict(X_test_embed)
test_preds = test_preds.flatten()  # turns (1115, 1) → (1115,)

# Convert probabilities to binary
test_tags = [1 if x >= 0.5 else 0 for x in test_preds]

# Now compute metrics
precision = precision_score(y_test_embed, test_tags)
recall = recall_score(y_test_embed, test_tags)
f1 = f1_score(y_test_embed, test_tags)

print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1 Score: {f1:.3f}")


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
Precision: 0.992, Recall: 0.840, F1 Score: 0.910


In [17]:
import numpy as np

print("Pred min:", np.min(test_preds))
print("Pred max:", np.max(test_preds))
print("Mean:", np.mean(test_preds))
print("Unique predicted tags:", np.unique(test_tags, return_counts=True))

Pred min: 4.3238597e-07
Pred max: 0.9998141
Mean: 0.11452233
Unique predicted tags: (array([0, 1]), array([988, 127]))


In [20]:
# evaluate the embedding model
from sklearn.metrics import confusion_matrix
mtrx = nltk.ConfusionMatrix(y_test_embed, test_tags)
print(mtrx)

# Use scikit-learn's confusion matrix
conf_matrix = confusion_matrix(y_test_embed, test_tags)
print("Confusion Matrix:")
print(conf_matrix)

# get precision and recall
precision = precision_score(y_test_embed, test_tags)
recall = recall_score(y_test_embed, test_tags)
f1 = f1_score(y_test_embed, test_tags)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

  |   0   1   2 |
--+-------------+
0 |<964>  1   . |
1 |  24<126>  . |
2 |   .   .  <.>|
--+-------------+
(row = reference; col = test)

Confusion Matrix:
[[964   1]
 [ 24 126]]
Precision: 0.9921259842519685
Recall: 0.84
F1 Score: 0.9097472924187726


From the metrics can conclude that if the model predictes an email as spam it has a very high chance of it being actually being spam , about 99%, however since the recall is 84%, that means the classifier only identifies 84% percent of the postitive spam cases. 24 emails were spam that were marked as not spam. There is a bit of an imbalance between precision and recall. When designing a model we have to decide what kind of trade off is ideal, mistakenly classifying ham as spam or letting more spam come through.