In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from utils import preprocess_text

# Load the e-mail dataset. A forward slash is used as the separator so the path
# resolves on every OS and no longer depends on "\P" being passed through as an
# unrecognised string escape.
data = pd.read_csv('data/Phishing_Email.csv')
# Drop the "Unnamed: 0" column (not used below) without mutating in place.
data = data.drop(columns="Unnamed: 0")

sample_size = 10000
label_encoder = LabelEncoder()

# Only the first `sample_size` rows are used (a head slice, not a random sample).
# NOTE(review): if the CSV is ordered in any way this subset may be biased - confirm.
X = data['Email Text'].iloc[:sample_size].apply(preprocess_text)
y = data['Email Type'].iloc[:sample_size]
# LabelEncoder assigns integer codes following the sorted order of the class
# names; check `label_encoder.classes_` to see which class ends up as 1.
y = label_encoder.fit_transform(y)
# 80/20 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [2]:
# Parameters
max_words = 1000            # vocabulary cap shared by the tokenizer and the embedding
max_sequence_length = 32    # every text is padded/truncated to this many tokens
embedding_dim = 32
rnn_units = 32
epochs = 10
batch_size = 2
seed = 42

# Seed NumPy and TensorFlow so weight initialisation and batch shuffling are
# repeatable from one run of this cell to the next.
np.random.seed(seed)
tf.random.set_seed(seed)

# Tokenize the training data (the vocabulary is learned from the training split only)
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Create a Sequential model: embedding -> simple RNN -> single sigmoid unit
model = keras.Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(SimpleRNN(units=rnn_units))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. The padded sequences and encoded labels are passed directly
# instead of being re-bound to `X` / `y`, so the raw texts and labels created
# when the data was loaded stay available under those names.
model.fit(sequences, y_train, epochs=epochs, batch_size=batch_size)

# Tokenize and preprocess test data with the tokenizer fitted on the training split
new_sequences = tokenizer.texts_to_sequences(X_test)
new_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length)

# Make predictions on test data
prediction_proba = model.predict(new_sequences)
# Threshold at 0.5 and flatten the (n, 1) model output to a 1-D label vector,
# the shape the scikit-learn metric functions expect.
prediction = (prediction_proba > 0.5).astype(int).ravel()

# Evaluate the model on test data
score = model.evaluate(new_sequences, y_test)
print(f"Test loss: {score[0]}")
print(f"Test accuracy: {score[1]}")

# Compute and print the confusion matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, prediction)
print(f"Confusion matrix:\n{conf_matrix}")

# Precision, recall and F1 are reported for the class encoded as 1.
# NOTE(review): confirm via `label_encoder.classes_` that this is the class of interest.
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)

# Use the library F1 implementation: the hand-written 2*p*r/(p+r) breaks down
# when precision + recall is zero (e.g. the model never predicts class 1).
f1 = f1_score(y_test, prediction)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.33151113986968994
Test accuracy: 0.9100000262260437
Confusion matrix:
[[ 694   95]
 [  85 1126]]
F1 Score: 0.925986842105263
