In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM
from tensorflow.keras.utils import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_recall_curve, precision_score, recall_score, balanced_accuracy_score, auc, matthews_corrcoef
import time


In [3]:
def get_metrics(y_true, y_pred, y_prob):
    f1 = f1_score(y_true=y_true, y_pred=y_pred)
    precision  = precision_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred)
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true=y_true, y_pred=y_pred)
    MCC = matthews_corrcoef(y_true=y_true, y_pred=y_pred)
    print(f"f1-score = {f1}\nprecision = {precision}\nrecall = {recall}\naccuracy = {accuracy}\nbalanced-accuracy = {balanced_accuracy}\nMCC = {MCC}")

In [4]:
# Function to save metrics to Excel
def save_metrics_to_excel(metrics, model_name, filename="model_metrics.xlsx"):
    try:
        existing_df = pd.read_excel(filename)
    except FileNotFoundError:
        existing_df = pd.DataFrame()

    metrics["Model"] = model_name
    updated_df = pd.concat([existing_df, pd.DataFrame([metrics])], ignore_index=True)
    updated_df.to_excel(filename, index=False)
    print(f"Metrics for {model_name} saved to {filename}")

In [5]:
df = pd.read_csv("Phishing_Email.csv")

df.head()

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [6]:
import re
def preprocess_text(text):
    
    if not isinstance(text, str):
        return ""

    text = re.sub(r'http\S+', '', text)

    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text


df["Email Text"] =df["Email Text"].apply(preprocess_text)

In [7]:
le = LabelEncoder()

df["Email Type"] = le.fit_transform(df["Email Type"])

In [8]:
max_len = 150

tk = Tokenizer()

# Process the text

tk.fit_on_texts(df['Email Text'])
sequences = tk.texts_to_sequences(df['Email Text'])
vector = pad_sequences(sequences,padding='post', maxlen=max_len)
vector[0]

array([30488,    39,    12,   105,    28,   370,    11,    15,     8,
        3085,     3,  7413,    13,     3, 10007,   239,  3493,    88,
        1522,  3983,     7,    43,   682,    27,    46,   204,  2897,
         104,  4639,     6,    13,   171,    54,    42,   103,    27,
          46,   126, 30487,     4,    15,     8,   584,     3,  2176,
           5,   316,   988, 16255,  1522,  3983,     4,    22,    10,
          39,  4891,  4639,     7,   349,    57,  1096,     3,    67,
           3,    14,  1846,    98,    30,    12,   105,    28, 25463,
           5,  2183,    11,     8,    28,    57,    15,   666,     3,
          67,    11,     1,  1522,  1947,   580,    17,   237,     6,
           5,  6201,  1287,     2,  2203,    60,   249,  2307,    59,
          23,     5,  2445,     3,    75,   675,    22,  1027,     4,
        2603,    26,     1,  1367,     2,    29,  5332,  3118,    22,
          10,   666,    42,   316,     6,  5037,    88,   768,  2604,
         999,    67,

In [34]:
x = np.array(vector)
y = np.array(df["Email Type"])

In [35]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, shuffle=False)

# Logistic Regression

In [36]:
model_logis = LogisticRegression()

start = time.time()
model_logis.fit(X_train, y_train)
end = time.time()
total_time = end - start
y_pred = model_logis.predict(X_test)
y_prob = model_logis.predict_proba(X_test)
get_metrics(y_true=y_test, y_pred=y_pred, y_prob=y_prob[:,1])
print(len(model_logis.coef_))
print(total_time)

f1-score = 0.7468955441928415
precision = 0.6404635139367366
recall = 0.8957512045554096
accuracy = 0.6284182305630027
balanced-accuracy = 0.5511928102942908
MCC = 0.14211489096614727
1
0.0988779067993164


# Random Forest

In [37]:
model_logis = RandomForestClassifier()
start = time.time()
model_logis.fit(X_train, y_train)
end = time.time()
total_time = end - start

y_pred = model_logis.predict(X_test)
y_prob = model_logis.predict_proba(X_test)
get_metrics(y_true=y_test, y_pred=y_pred, y_prob=y_prob[:,1])

print(total_time)

f1-score = 0.8338607594936709
precision = 0.7601875225387666
recall = 0.9233464739378011
accuracy = 0.774798927613941
balanced-accuracy = 0.7318874733199717
MCC = 0.5174442934379464
9.40778374671936


# Decision Tree

In [38]:
model_logis =  DecisionTreeClassifier()
start = time.time()
model_logis.fit(X_train, y_train)
end = time.time()
total_time = end - start

y_pred = model_logis.predict(X_test)
y_prob = model_logis.predict_proba(X_test)
get_metrics(y_true=y_test, y_pred=y_pred, y_prob=y_prob[:,1])

print(total_time)

f1-score = 0.7396344181899243
precision = 0.7530640036314117
recall = 0.7266754270696452
accuracy = 0.6868632707774799
balanced-accuracy = 0.6753625925949469
MCC = 0.3475580944116217
1.7852561473846436


# KNN

In [39]:
model_logis =  KNeighborsClassifier(n_neighbors=5)

start = time.time()
model_logis.fit(X_train, y_train)
end = time.time()
total_time = end - start

y_pred = model_logis.predict(X_test)
y_prob = model_logis.predict_proba(X_test)
get_metrics(y_true=y_test, y_pred=y_pred, y_prob=y_prob[:,1])

print(total_time)

f1-score = 0.7212830515821413
precision = 0.7138567138567139
recall = 0.7288655278142795
accuracy = 0.6552278820375335
balanced-accuracy = 0.6339559152547555
MCC = 0.2696491893503193
0.0011255741119384766


# SVM

In [40]:
model_logis = SVC(probability=True)

start = time.time()
model_logis.fit(X_train, y_train)
end = time.time()
total_time = end - start

y_pred = model_logis.predict(X_test)
y_prob = model_logis.predict_proba(X_test)
get_metrics(y_true=y_test, y_pred=y_pred, y_prob=y_prob[:,1])

print(total_time)

f1-score = 0.7675028079371022
precision = 0.6701536449820202
recall = 0.8979413053000438
accuracy = 0.6670241286863271
balanced-accuracy = 0.60031826840676
MCC = 0.2545342596921049
67.54396796226501


# NN

In [62]:
model_logis =  MLPClassifier(hidden_layer_sizes=(30,40,50,20), solver='adam')
start = time.time()
model_logis.fit(X_train, y_train)
end = time.time()
total_time = end - start

y_pred = model_logis.predict(X_test)
y_prob = model_logis.predict_proba(X_test)
get_metrics(y_true=y_test, y_pred=y_pred, y_prob=y_prob[:,1])

print(total_time)

f1-score = 0.6744711439720682
precision = 0.6349574632637278
recall = 0.7192290845378887
accuracy = 0.5750670241286864
balanced-accuracy = 0.5334224206379837
MCC = 0.07063628419163222
8.939071655273438


# LSTM ARCH


In [41]:
model = Sequential() # Sequential() API
model.add(Embedding(input_dim=len(tk.word_index)+1,output_dim=50,input_length=150))
model.add(LSTM(units=100))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))


2024-12-17 15:59:19.191433: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-12-17 15:59:19.353902: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [42]:
model.compile(loss='binary_crossentropy' , optimizer='adam', metrics=['accuracy'])

In [43]:
model.summary()

In [None]:
start = time.time()

historical  = model.fit(X_train,y_train, epochs=40, batch_size=16, validation_data=(X_test,y_test))
end = time.time()
print(f"total time = {end - start}")

Epoch 1/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 48ms/step - accuracy: 0.6607 - loss: 0.6155 - val_accuracy: 0.8684 - val_loss: 0.3439
Epoch 2/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 47ms/step - accuracy: 0.8057 - loss: 0.4861 - val_accuracy: 0.9145 - val_loss: 0.2638
Epoch 3/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 47ms/step - accuracy: 0.9257 - loss: 0.2168 - val_accuracy: 0.9440 - val_loss: 0.1544
Epoch 4/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 47ms/step - accuracy: 0.9371 - loss: 0.1640 - val_accuracy: 0.9504 - val_loss: 0.1293
Epoch 5/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 48ms/step - accuracy: 0.9755 - loss: 0.0718 - val_accuracy: 0.9622 - val_loss: 0.1067
Epoch 6/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 47ms/step - accuracy: 0.9837 - loss: 0.0449 - val_accuracy: 0.9622 - val_loss: 0.1114
Epoch 7/40
[1m9

In [45]:
results = model.evaluate(X_test, y_test)
loss = results[0]  # Extract the loss from the results
accuracy = results[1]  # Extract the accuracy from the results

print(f"Model Loss: {loss}")
print(f"Model Accuracy: {accuracy*100}")

[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9632 - loss: 0.2784
Model Loss: 0.23527483642101288
Model Accuracy: 96.38069868087769


In [46]:
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step


In [47]:

get_metrics(y_true=y_test, y_pred=y_pred, y_prob=y_pred_prob)

f1-score = 0.9702183984116479
precision = 0.9773333333333334
recall = 0.9632063074901446
accuracy = 0.9638069705093834
balanced-accuracy = 0.963980486156959
MCC = 0.9242613997616788


# CNN

In [48]:
model = Sequential() # Sequential() API
model.add(Embedding(input_dim=len(tk.word_index)+1,output_dim=50,input_length=150))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy' , optimizer='adam', metrics=['accuracy'])




In [49]:
model.summary()

In [50]:
start = time.time()
historical  = model.fit(X_train,y_train, epochs=40, batch_size=16, validation_data=(X_test,y_test))
end = time.time()
print(f"total time = {end - start}")

Epoch 1/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 33ms/step - accuracy: 0.8209 - loss: 0.3371 - val_accuracy: 0.9638 - val_loss: 0.0903
Epoch 2/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 32ms/step - accuracy: 0.9835 - loss: 0.0386 - val_accuracy: 0.9660 - val_loss: 0.0873
Epoch 3/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 32ms/step - accuracy: 0.9885 - loss: 0.0211 - val_accuracy: 0.9676 - val_loss: 0.0924
Epoch 4/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 32ms/step - accuracy: 0.9888 - loss: 0.0204 - val_accuracy: 0.9654 - val_loss: 0.0966
Epoch 5/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 32ms/step - accuracy: 0.9897 - loss: 0.0197 - val_accuracy: 0.9660 - val_loss: 0.1010
Epoch 6/40
[1m933/933[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 32ms/step - accuracy: 0.9905 - loss: 0.0200 - val_accuracy: 0.9649 - val_loss: 0.1053
Epoch 7/40
[1m9

In [51]:
results = model.evaluate(X_test, y_test)
loss = results[0]  # Extract the loss from the results
accuracy = results[1]  # Extract the accuracy from the results

print(f"Model Loss: {loss}")
print(f"Model Accuracy: {accuracy*100}")

[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9585 - loss: 0.2426
Model Loss: 0.18150043487548828
Model Accuracy: 96.30026817321777


In [52]:
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

[1m117/117[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [53]:
get_metrics(y_true=y_test, y_pred=y_pred, y_prob=y_pred_prob)

f1-score = 0.9694825298540469
precision = 0.9790084859312193
recall = 0.9601401664476565
accuracy = 0.9630026809651474
balanced-accuracy = 0.9638295856426258
MCC = 0.9228046948515948


In [None]:
#(V + 1) * 50 + 600400 + 101