In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from numpy import asarray
from numpy import zeros

# Load the data from a CSV file
# Load train and test datasets
train_df = pd.read_json('../dataset/input/emails_set_train.jsonl', lines=True)
test_df = pd.read_json('../dataset/input/emails_set_test.jsonl', lines=True)

X_train = train_df['text']
y_train = train_df['is_phishing']
X_test = test_df['text']
y_test = test_df['is_phishing']

2023-04-23 16:58:09.658984: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Convert the text into GloVe embeddings
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1

In [3]:
# Load the GloVe word embeddings file
embeddings_dict = dict()
with open('./glove.6B.100d.txt', encoding='utf8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_dict[word] = coefs

In [4]:
# Create a weight matrix for the embedding layer
embedding_matrix = zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [5]:
# Pad the sequences so that they are all the same length
X_train_pad = pad_sequences(X_train_tokens, maxlen=100, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=100, padding='post')

In [6]:
# Train a random forest classifier on the training set
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train_pad, y_train)

In [7]:
# Make predictions on the test set
preds = clf.predict(X_test_pad)
probas = clf.predict_proba(X_test_pad)

In [11]:
# Print the classification report
print('Random Forest Classification Report:\n', classification_report(y_test, preds, digits = 5))

Random Forest Classification Report:
               precision    recall  f1-score   support

       False    0.90976   0.99396   0.95000       497
        True    0.94737   0.52427   0.67500       103

    accuracy                        0.91333       600
   macro avg    0.92856   0.75912   0.81250       600
weighted avg    0.91622   0.91333   0.90279       600

