# Anomaly Detection for Phishing Emails

# Importing Libraries

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
import pandas as pd

In [2]:
# Read the labelled e-mail corpus from the CSV file into a DataFrame.
DATASET_PATH = "Dataset II.csv"
df = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 16:31:02 -0700",Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 18:31:03 -0500",Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 20:28:00 -1200",CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,"Tue, 05 Aug 2008 17:31:20 -0600",Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 19:31:21 -0400",SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1


# Text Preprocessing (Subject and Body)

In [4]:
# Shared NLP resources for the text-cleaning step:
# a Porter stemmer and the set of English stop words from the NLTK corpus.
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}

In [5]:
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize raw e-mail text into a space-separated string of stemmed tokens.

    Steps: lowercase, strip ASCII punctuation, tokenize with ``word_tokenize``,
    drop tokens found in ``stop_words`` and stem the rest with ``stemmer``.

    Args:
        text: The raw text. ``None`` and NaN are treated as empty text; any
            other non-string value is converted with ``str()``.

    Returns:
        str: The processed tokens joined by single spaces ("" when nothing remains).
    """
    # Missing values (None / float NaN) would otherwise crash on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()  # Convert to lowercase
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation in a single pass
    if not text.strip():
        # Nothing left to tokenize.
        return ""
    words = word_tokenize(text)  # Tokenize text
    # Stop-word matching happens after punctuation removal, so contractions are
    # compared without their apostrophes.
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return " ".join(words)

In [6]:
# Missing subjects/bodies become empty strings so the text pipeline always receives str values.
for text_column in ('subject', 'body'):
    df[text_column] = df[text_column].fillna('')

In [7]:
# Run the text-cleaning function over both text columns, storing each result
# in its own "processed_*" column.
processed_columns = {
    'subject': 'processed_subject',
    'body': 'processed_body',
}
for source_column, target_column in processed_columns.items():
    df[target_column] = df[source_column].apply(preprocess_text)

In [8]:
# Vectorization using TF-IDF.
# Subject and body each get their own vectorizer: refitting a single instance
# discards the subject vocabulary, so subject features could neither be
# reproduced for new e-mails nor mapped back to their terms.
subject_vectorizer = TfidfVectorizer(max_features=1000)
body_vectorizer = TfidfVectorizer(max_features=1000)
subject_vectors = subject_vectorizer.fit_transform(df['processed_subject'])
body_vectors = body_vectorizer.fit_transform(df['processed_body'])
# Backward-compatible alias: `vectorizer` previously ended up fitted on the body text.
vectorizer = body_vectorizer

# URL Feature Extraction

In [9]:
import re
import requests

In [10]:
def extract_urls(text):
    """Return every http:// or https:// URL found in ``text``.

    A URL is taken to run from its scheme up to the next whitespace character.

    Args:
        text: String to scan.

    Returns:
        list[str]: Matches in order of appearance; empty when there are none.
    """
    url_pattern = re.compile(r'https?://\S+')
    return [match.group(0) for match in url_pattern.finditer(text)]

In [11]:
def check_url_safety(url, api_key=None, timeout=10):
    """Fetch the latest VirusTotal analysis statistics for a URL.

    The VirusTotal v3 endpoint does not accept the raw URL in the path; it is
    addressed by an identifier, built here as the unpadded URL-safe base64
    encoding of the URL. Requests are authenticated with the ``x-apikey`` header.

    Args:
        url: The URL to look up.
        api_key: VirusTotal API key. When omitted, the ``VIRUSTOTAL_API_KEY``
            environment variable is used, so the key never has to be written
            into the notebook.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``last_analysis_stats`` object from the API response.

    Raises:
        ValueError: If no API key is supplied or configured.
        requests.HTTPError: If the API responds with an error status.
    """
    import base64
    import os

    api_key = api_key or os.environ.get("VIRUSTOTAL_API_KEY")
    if not api_key:
        raise ValueError(
            "A VirusTotal API key is required: pass api_key or set VIRUSTOTAL_API_KEY."
        )

    # URL identifier: URL-safe base64 of the URL with the '=' padding stripped.
    url_id = base64.urlsafe_b64encode(url.encode("utf-8")).decode("ascii").rstrip("=")

    response = requests.get(
        f"https://www.virustotal.com/api/v3/urls/{url_id}",
        headers={"x-apikey": api_key},
        timeout=timeout,  # never hang the notebook on an unresponsive service
    )
    # Surface 4xx/5xx responses explicitly instead of failing later with a KeyError.
    response.raise_for_status()
    return response.json()['data']['attributes']['last_analysis_stats']

In [12]:
# Apply URL extraction.
# The result goes into its own column so the dataset's original integer `urls`
# column is not overwritten by lists of strings.
df['extracted_urls'] = df['body'].apply(extract_urls)

# Label Encoding

In [13]:
# Learn the label classes first, then write their integer codes to a new column.
label_encoder = LabelEncoder().fit(df['label'])
df['label_encoded'] = label_encoder.transform(df['label'])

# Isolation Forest Model

In [14]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

In [15]:
# Convert the TF-IDF matrices to dense DataFrames.
# Prefixing keeps the column labels unique once the two frames are placed side
# by side (both would otherwise be labelled 0..n-1).
subject_df = pd.DataFrame(subject_vectors.toarray()).add_prefix('subject_')
body_df = pd.DataFrame(body_vectors.toarray()).add_prefix('body_')

In [16]:
# Build the feature matrix by placing subject and body features side by side.
feature_frames = [subject_df, body_df]
X = pd.concat(feature_frames, axis='columns')

In [17]:
# Fit an Isolation Forest on the combined subject/body features.
# NOTE(review): contamination=0.05 assumes about 5% anomalies, yet the class
# balance printed at the end of this notebook reports 55.78% phishing —
# confirm the intended value.
isolation_forest_params = {
    'n_estimators': 200,
    'contamination': 0.05,
    'max_samples': 0.8,
    'random_state': 42,
}
model = IsolationForest(**isolation_forest_params)
model.fit(X)

In [18]:
# Score every e-mail with the fitted model and keep the raw output on the frame.
raw_predictions = model.predict(X)
df['predicted_anomaly'] = raw_predictions

In [19]:
# Convert Isolation Forest output (-1 = outlier, 1 = inlier) into the label
# convention used by the dataset (1 = phishing, 0 = legitimate).
# The flags are derived from a fresh model.predict(X) rather than by remapping
# the column in place, so re-running this cell cannot turn already-converted
# values into NaN or flip them.
df['predicted_anomaly'] = (model.predict(X) == -1).astype(int)

In [20]:
# Per-class precision / recall / F1 of the anomaly flags against the encoded labels.
y_true = df['label_encoded']
y_pred = df['predicted_anomaly']
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.45      1.00      0.62     17312
           1       1.00      0.05      0.09     21842

    accuracy                           0.47     39154
   macro avg       0.73      0.52      0.36     39154
weighted avg       0.76      0.47      0.32     39154



In [21]:
from sklearn.metrics import accuracy_score

In [22]:
# Share of e-mails whose predicted flag matches the true label.
accuracy = accuracy_score(
    y_true=df['label_encoded'],
    y_pred=df['predicted_anomaly'],
)

# Report the score as a percentage with two decimals.
print(f"Accuracy: {accuracy * 100:.2f}")

Accuracy: 46.74


In [4]:
# Column dtypes and non-null counts.
# NOTE(review): this cell's execution count (In [4]) is out of sequence, and the
# output below lists only the 7 original columns, so it was produced before the
# derived columns were added — re-run the notebook top to bottom to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39154 entries, 0 to 39153
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sender    39154 non-null  object
 1   receiver  38692 non-null  object
 2   date      39154 non-null  object
 3   subject   39126 non-null  object
 4   body      39154 non-null  object
 5   label     39154 non-null  int64 
 6   urls      39154 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 2.1+ MB


In [5]:
# Count missing values per column.
df.isna().sum()

sender        0
receiver    462
date          0
subject      28
body          0
label         0
urls          0
dtype: int64

In [6]:
# Per-label summary statistics (the output shown covers the `urls` column).
df.groupby('label').describe()

Unnamed: 0_level_0,urls,urls,urls,urls,urls,urls,urls,urls
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,17312.0,0.65521,0.475313,0.0,0.0,1.0,1.0,1.0
1,21842.0,0.681668,0.46584,0.0,0.0,1.0,1.0,1.0


In [7]:
# Class balance: how many e-mails are labelled phishing (label == 1)?
is_phishing = df['label'] == 1
phishing_emails = int(is_phishing.sum())

# Corpus size
total_emails = len(df)

# Proportion of phishing e-mails in the corpus
contamination = phishing_emails / total_emails
print(f"Contamination: {contamination:.4f}")


Contamination: 0.5578
