# Email Classififaction 

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # Required for BERT preprocessor or you can preprocess it urself
import pandas as pd
import io
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import glob
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [36]:
# Import the data
data = pd.read_csv('combined_data.csv')

  data = pd.read_csv('combined_data.csv')


## Preliminary Data Processing step 


This includes loading the combined dataset, preprocessing text data, and preparing it for model training. There are columns for sender, receiver, date, subject, body, label, and urls. The label column appears to be a binary indicator where 1 might represent a phishing email, and the urls column indicates the presence of URLs in the email body, which is also marked as 1 for the presence of URLs. We want to preprocess both the subject and body as both of these fields carry significant information that can contribute to the classification performance.

In [None]:
# Fill NaN values
data.fillna({'subject': '', 'body': '', 'label': 0}, inplace=True)

# Handle 'urls' column: Create a binary flag indicating the presence of URLs
data['urls_present'] = data['urls'].notna().astype(int)

## Raw Data Visualisation

To visualise the CSV file, we need to do catergorise the data in a few ways:

- Distribution of labels: Show the balance between phishing and non-phishing emails
- Emails by date: How the voulme of emails varies over time
- Presence of URLs in phishing vs. non-phishing emails: A comparison to see if phishing emails are more likely to contain URLs

1. Distribution of labels 

In [None]:
sns.countplot(x='label', data=data)
plt.title('Distribution of Email Labels (Raw Data)')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks([0, 1], ['Non-Phishing', 'Phishing'])
plt.show()

2. Emails by date

In [None]:
# Explicitly convert datetimes to UTC
data['date'] = pd.to_datetime(data['date'], errors='coerce', utc=True)

# Get the current date
current_date = datetime.now().date()

# First, ensure 'date' is in datetime format
data['date'] = pd.to_datetime(data['date'], errors='coerce', utc=True)

# Extract the date part to focus on daily volume
data['date_only'] = data['date'].dt.date

# Count the number of emails per day
emails_per_day = data.groupby('date_only').size()

# Sort the counts by date
emails_per_day_sorted = emails_per_day.sort_index()

# Filter the dataset to exclude future dates
emails_per_day_filtered = emails_per_day_sorted[emails_per_day_sorted.index <= current_date]

# Plotting
emails_per_day_filtered.plot(kind='line', figsize=(12, 6), marker='o', linestyle='-', logy=True)
plt.title('Email Volume Per Day (Log Scale)')
plt.xlabel('Date')
plt.ylabel('Number of Emails (Log Scale)')
plt.xticks(rotation=45)

plt.show()

# Display dates with the highest email volumes to identify outliers
print(emails_per_day_sorted.sort_values(ascending=False).head())



3. Presence of URLs in Phishing vs. Non-Phishing Emails

In [None]:
# Now plot using 'urls_present' instead of 'urls'
plt.figure(figsize=(10, 6))
sns.countplot(x='label', hue='urls_present', data=data)
plt.title('Presence of URLs in Phishing vs. Non-Phishing Emails')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks(ticks=[0, 1], labels=['Non-Phishing', 'Phishing'])
plt.legend(title='URLs Present', labels=['No', 'Yes'])
plt.show()

## Data Preprocessing

Typicall, we should always split the data into training and testing sets before passing it through and preprocessing functions like the BERT preprocessor from Tensorflow Hub. This ensures that the preprocessing is done independtly 

In [None]:
# Combine 'subject' and 'body' headers into one column using a special token like [SEP]
data['text'] = data['subject'] + " [SEP] " + data['body']


# Split the data into training and testing sets
X = data['text'].values
y = data['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

## Model Building

### Resources

- Encoder API: https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-l-12-h-768-a-12/versions/3 This SavedModel implements the encoder API for text embeddings with transformer encoders. It expects a dict with three int32 Tensors as input: input_word_ids, input_mask, and input_type_ids.

- Preprocessor API: https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3 This SavedModel implements the preprocessor API for text embeddings with Transformer encoders, which offers several ways to go from one or more batches of text segments (plain text encoded as UTF-8) to the inputs for the Transformer encoder model.

In [None]:
# Define the input for the preprocessor
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

In [None]:
print(text_input)

In [None]:
# Load the BERT preprocessor and encoder from TensorFlow Hub
preprocessor_url = "https://www.kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-preprocess/versions/3"
preprocessor = hub.KerasLayer(preprocessor_url)

encoder_inputs = preprocessor(text_input)

encoder_url = "https://kaggle.com/models/tensorflow/bert/frameworks/TensorFlow2/variations/en-uncased-l-12-h-768-a-12/versions/3"
encoder = hub.KerasLayer(encoder_url, trainable=True)

In [None]:
# Build the model
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]      # [batch_size, 768].
sequence_output = outputs["sequence_output"]  # [batch_size, seq_length, 768].

In [None]:
# Use the pooled_output for classification tasks
pooled_output = outputs['pooled_output']
dropout = Dropout(0.1)(pooled_output)
class_output = Dense(1, activation='sigmoid', name='class_output')(dropout)

model = Model(inputs=[text_input], outputs=[class_output])


In [None]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=2e-5),
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [None]:
# Print the model structure
model.summary()


In [None]:
# Train the model
model.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_test, y_test))


## Evaluate the model

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

In [None]:
# Plot training history
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label = 'val_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0, 1])
plt.legend(loc='lower right')
plt.show()