In [17]:
from datasets import load_dataset

data = load_dataset("zefang-liu/phishing-email-dataset")

df = data['train'].to_pandas()

print(df.head())
print(df.info)
print(df.shape)
print(df.columns)

   Unnamed: 0                                         Email Text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  \nHello I am your hot lil horny toy.\n    I am...   
4           4  software at incredibly low prices ( 86 % lower...   

       Email Type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  
<bound method DataFrame.info of        Unnamed: 0                                         Email Text  \
0               0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1               1  the other side of * galicismos * * galicismo *...   
2               2  re : equistar deal tickets are you still avail...   
3               3  \nHello I am your hot lil horny toy.\n    I am...   
4               4  software at incredibly low prices ( 86 % lower...   
...    

In [18]:
# Cell 1: Load and prepare your original dataset
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming your dataset is already loaded as 'df'
# with columns 'Email Text' and 'Email Type'
print("Original dataset distribution:")
print(df['Email Type'].value_counts())

# Convert labels to numerical format for deep learning
label_map = {'Safe Email': 0, 'Phishing Email': 1}
df_dl = df.copy()
df_dl['label'] = df_dl['Email Type'].map(label_map)

Original dataset distribution:
Email Type
Safe Email        11322
Phishing Email     7328
Name: count, dtype: int64


In [19]:
import os

# Use your project directory explicitly
project_dir = r'D:\phishing_repo_thesis'  # r prefix for raw string to handle backslashes
balanced_dir = os.path.join(project_dir, 'balanced_data_files')
os.makedirs(balanced_dir, exist_ok=True)

print(f"Directory created at: {balanced_dir}")


Directory created at: D:\phishing_repo_thesis\balanced_data_files


In [20]:
# Cell 2: Create dataframe for deep learning
# For deep learning, we'll use class weights instead of resampling
# But we'll create train/val/test splits with stratification

# Create stratified splits for deep learning
# 70% train, 15% validation, 15% test
X = df_dl['Email Text']
y = df_dl['label']

# First split: 85% train+val, 15% test
X_temp, X_test_dl, y_temp, y_test_dl = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Second split: 70% train, 15% validation (from the remaining 85%)
X_train_dl, X_val_dl, y_train_dl, y_val_dl = train_test_split(
    X_temp, y_temp, test_size=0.15/0.85, random_state=42, stratify=y_temp
)

# Create final dataframes
df_train_dl = pd.DataFrame({'Email Text': X_train_dl, 'label': y_train_dl}).reset_index(drop=True)
df_val_dl = pd.DataFrame({'Email Text': X_val_dl, 'label': y_val_dl}).reset_index(drop=True)
df_test_dl = pd.DataFrame({'Email Text': X_test_dl, 'label': y_test_dl}).reset_index(drop=True)

print("\nDeep Learning datasets created:")
print(f"Training set: {len(df_train_dl)} samples")
print(f"Validation set: {len(df_val_dl)} samples")
print(f"Test set: {len(df_test_dl)} samples")
print("Class distribution in training set:")
print(df_train_dl['label'].value_counts())

# Calculate class weights for deep learning
total_samples = len(df_train_dl)
n_classes = 2
class_counts = df_train_dl['label'].value_counts().sort_index()
class_weights = total_samples / (n_classes * class_counts)
print("\nClass weights for deep learning:")
print(class_weights.to_dict())

# Save deep learning datasets
# Save your files with absolute paths
df_train_dl.to_csv(os.path.join(balanced_dir, 'train_dl.csv'), index=False)
df_val_dl.to_csv(os.path.join(balanced_dir, 'val_dl.csv'), index=False)
df_test_dl.to_csv(os.path.join(balanced_dir, 'test_dl.csv'), index=False)

# Print the location so you know where to look
print(f"Files saved to: {balanced_dir}")


Deep Learning datasets created:
Training set: 13054 samples
Validation set: 2798 samples
Test set: 2798 samples
Class distribution in training set:
label
0    7924
1    5130
Name: count, dtype: int64

Class weights for deep learning:
{0: 0.8237001514386674, 1: 1.2723196881091619}
Files saved to: D:\phishing_repo_thesis\balanced_data_files


In [21]:
from sklearn.model_selection import train_test_split
import os

In [22]:
# ML datasets (Train = 70, Val = 15, Test = 15)

# Split 1
df_temp, df_test = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
    stratify=df['Email Type'],
)

# Split 2
df_train, df_val = train_test_split(
    df_temp,
    test_size=0.15/0.85,
    random_state=42,
    stratify=df_temp['Email Type'],
)


In [23]:
# Verify split proportions for ML

total = len(df)
print(f"Total samples: {total}")
print(f"Training: {len(df_train)} ({len(df_train)/total:.1%})")
print(f"Validation: {len(df_val)} ({len(df_val)/total:.1%})")
print(f"Testing: {len(df_test)} ({len(df_test)/total:.1%})")

Total samples: 18650
Training: 13054 (70.0%)
Validation: 2798 (15.0%)
Testing: 2798 (15.0%)


In [24]:
print("\nClass distribution in training set:")
print(df_train['Email Type'].value_counts())
print("\nClass distribution in validation set:")
print(df_val['Email Type'].value_counts())
print("\nClass distribution in test set:")
print(df_test['Email Type'].value_counts())


Class distribution in training set:
Email Type
Safe Email        7924
Phishing Email    5130
Name: count, dtype: int64

Class distribution in validation set:
Email Type
Safe Email        1699
Phishing Email    1099
Name: count, dtype: int64

Class distribution in test set:
Email Type
Safe Email        1699
Phishing Email    1099
Name: count, dtype: int64


In [25]:
# Save original raw data
df_train.to_csv(os.path.join(balanced_dir, 'train_raw.csv'), index=False)
df_val.to_csv(os.path.join(balanced_dir, 'val_raw.csv'), index=False)
df_test.to_csv(os.path.join(balanced_dir, 'test_raw.csv'), index=False)