In [6]:
import pandas as pd

# Load each dataset
datasets = {
    'CEAS_08': pd.read_csv('ceas_08.csv'),
    'enron': pd.read_csv('enron.csv'),
    'ling': pd.read_csv('ling.csv'),
    'nazario': pd.read_csv('nazario.csv'),
    'Nigerian_Fraud': pd.read_csv('nigerian_fraud.csv'),
    # 'SpamAssassin': pd.read_csv('spamassassin.csv')
}

# Standardize each dataset
def standardize_dataset(df, source_name):
    standardized = pd.DataFrame()
    
    # Map to common schema
    standardized['subject'] = df.get('subject', '')
    standardized['body'] = df.get('body', '')
    standardized['label'] = df['label']
    standardized['sender'] = df.get('sender', None)
    standardized['receiver'] = df.get('receiver', None)
    standardized['date'] = df.get('date', None)
    standardized['urls'] = df.get('urls', None)
    standardized['source'] = source_name
    
    return standardized

# Combine all datasets
combined_df = pd.concat([
    standardize_dataset(df, name) 
    for name, df in datasets.items()
], ignore_index=True)

In [7]:
# Fill missing text fields with empty strings
combined_df['subject'] = combined_df['subject'].fillna('')
combined_df['body'] = combined_df['body'].fillna('')

# Check completeness
print(combined_df.isnull().sum())

subject         0
body            0
label           0
sender      32957
receiver    34508
date        33109
urls        32626
source          0
dtype: int64


In [8]:
import re

def clean_text(text):
    if pd.isna(text):
        return ''
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    # Remove HTML tags (common in emails)
    text = re.sub(r'<[^>]+>', '', text)
    
    # Handle special characters (optional - depends on your model)
    # text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    return text

combined_df['subject_clean'] = combined_df['subject'].apply(clean_text)
combined_df['body_clean'] = combined_df['body'].apply(clean_text)

In [9]:
# Check for exact duplicates
print(f"Duplicates: {combined_df.duplicated(subset=['subject', 'body']).sum()}")

# Remove duplicates
combined_df = combined_df.drop_duplicates(subset=['subject', 'body'], keep='first')

Duplicates: 0


In [10]:
# Standardize date format
combined_df['date'] = pd.to_datetime(combined_df['date'], errors='coerce')

In [11]:
print(combined_df['label'].value_counts())
print(combined_df.groupby(['source', 'label']).size())

label
1    41173
0    35504
Name: count, dtype: int64
source          label
CEAS_08         0        17312
                1        21842
Nigerian_Fraud  1         3332
enron           0        15791
                1        13976
ling            0         2401
                1          458
nazario         1         1565
dtype: int64


In [12]:
combined_df['subject_length'] = combined_df['subject'].str.len()
combined_df['body_length'] = combined_df['body'].str.len()

# Compare lengths by label
combined_df.groupby('label')[['subject_length', 'body_length']].describe()

Unnamed: 0_level_0,subject_length,subject_length,subject_length,subject_length,subject_length,subject_length,subject_length,subject_length,body_length,body_length,body_length,body_length,body_length,body_length,body_length,body_length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,35504.0,39.262984,22.721899,0.0,24.0,36.0,50.0,815.0,35504.0,2174.392068,5178.198201,1.0,526.0,1088.0,2186.25,230120.0
1,41173.0,33.307313,43.032401,0.0,20.0,29.0,42.0,7170.0,41173.0,1312.297817,23290.751224,1.0,204.0,442.0,1368.0,4599644.0


In [13]:
# Find records with empty subject AND body
empty_records = combined_df[
    (combined_df['subject'].str.len() == 0) & 
    (combined_df['body'].str.len() == 0)
]
print(f"Empty records: {len(empty_records)}")

# Remove them
combined_df = combined_df[
    (combined_df['subject'].str.len() > 0) | 
    (combined_df['body'].str.len() > 0)
]

Empty records: 0


In [14]:
# Save for modeling
combined_df.to_csv('cleaned_combined_emails.csv', index=False)

# Create a summary report
summary = {
    'total_records': len(combined_df),
    'phishing_count': (combined_df['label'] == 1).sum(),
    'legitimate_count': (combined_df['label'] == 0).sum(),
    'sources': combined_df['source'].value_counts().to_dict()
}
print(summary)

{'total_records': 76677, 'phishing_count': np.int64(41173), 'legitimate_count': np.int64(35504), 'sources': {'CEAS_08': 39154, 'enron': 29767, 'Nigerian_Fraud': 3332, 'ling': 2859, 'nazario': 1565}}


In [15]:
# Combine subject and body for text analysis
combined_df['full_text'] = combined_df['subject'] + ' ' + combined_df['body']

# OR keep them separate and concatenate their features later

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=5000,  # Limit to top 5000 words
    ngram_range=(1, 2),  # Unigrams and bigrams
    min_df=5,  # Ignore rare words
    max_df=0.8  # Ignore very common words
)

X_text = vectorizer.fit_transform(combined_df['full_text'])

In [19]:
import re
import pandas as pd
import numpy as np

def extract_features(df):
    features = pd.DataFrame()
    
    # === EMAIL METADATA FEATURES ===
    
    # Sender domain features (handle NaN)
    features['sender_has_domain'] = df['sender'].notna().astype(int)
    features['sender_is_freemail'] = df['sender'].fillna('').astype(str).str.contains(
        '@gmail|@yahoo|@hotmail|@outlook', 
        case=False, regex=True
    ).astype(int)
    
    # Receiver features
    features['has_receiver'] = df['receiver'].notna().astype(int)
    
    # === TEXT LENGTH FEATURES ===
    features['subject_length'] = df['subject'].fillna('').str.len()
    features['body_length'] = df['body'].fillna('').str.len()
    features['subject_word_count'] = df['subject'].fillna('').str.split().str.len().fillna(0)
    features['body_word_count'] = df['body'].fillna('').str.split().str.len().fillna(0)
    
    # === URL FEATURES ===
    # Convert urls to string first, handling NaN and lists
    urls_str = df['urls'].fillna('').astype(str)
    features['url_count'] = urls_str.str.count('http')
    features['has_urls'] = (features['url_count'] > 0).astype(int)
    features['suspicious_tld'] = urls_str.str.contains(
        '.tk|.ml|.ga|.cf|.gq', 
        case=False, regex=True
    ).astype(int)
    
    # === SUSPICIOUS PATTERNS ===
    # Capital letters (shouting)
    def caps_ratio(text):
        text = str(text)
        if len(text) == 0:
            return 0
        return sum(1 for c in text if c.isupper()) / len(text)
    
    features['subject_caps_ratio'] = df['subject'].fillna('').apply(caps_ratio)
    
    # Exclamation marks (count manually)
    features['subject_exclamation'] = df['subject'].fillna('').apply(lambda x: str(x).count('!'))
    features['body_exclamation'] = df['body'].fillna('').apply(lambda x: str(x).count('!'))
    
    # Dollar signs
    features['has_dollar_sign'] = df['body'].fillna('').apply(lambda x: 1 if '$' in str(x) else 0)
    
    # Common phishing keywords
    phishing_keywords = ['urgent', 'verify', 'account', 'suspended', 'click', 
                         'confirm', 'password', 'winner', 'prize', 'claim']
    
    def count_phishing_keywords(text):
        text = str(text).lower()
        return sum(keyword in text for keyword in phishing_keywords)
    
    # Create full_text if not exists
    full_text = (df['subject'].fillna('') + ' ' + df['body'].fillna(''))
    features['phishing_keyword_count'] = full_text.apply(count_phishing_keywords)
    
    # === TIME FEATURES (if date available) ===
    if 'date' in df.columns and df['date'].notna().sum() > 0:
        date_col = pd.to_datetime(df['date'], errors='coerce')
        features['hour_of_day'] = date_col.dt.hour.fillna(-1).astype(int)
        features['day_of_week'] = date_col.dt.dayofweek.fillna(-1).astype(int)
        features['is_weekend'] = (date_col.dt.dayofweek >= 5).fillna(False).astype(int)
    else:
        # Add dummy columns if date not available
        features['hour_of_day'] = -1
        features['day_of_week'] = -1
        features['is_weekend'] = 0
    
    # Fill any remaining NaN values
    features = features.fillna(0)
    
    return features

# Extract all engineered features
X_engineered = extract_features(combined_df)

# Check the results
print(X_engineered.head())
print(f"\nFeature shape: {X_engineered.shape}")
print(f"\nFeature columns: {X_engineered.columns.tolist()}")
print(f"\nAny NaN values: {X_engineered.isna().sum().sum()}")

   sender_has_domain  sender_is_freemail  has_receiver  subject_length  \
0                  1                   0             1              25   
1                  1                   0             1              22   
2                  1                   0             1              20   
3                  1                   0             1             150   
4                  1                   0             1              26   

   body_length  subject_word_count  body_word_count  url_count  has_urls  \
0          273                   6               46          0         0   
1           82                   3                9          0         0   
2         3918                   4              302          0         0   
3        24418                  10             2660          0         0   
4          175                   1                2          0         0   

   suspicious_tld  subject_caps_ratio  subject_exclamation  body_exclamation  \
0               0 

In [20]:
from scipy.sparse import hstack
import numpy as np

# Combine text features with engineered features
if isinstance(X_text, np.ndarray):
    # Dense array (embeddings)
    X_combined = np.hstack([X_text, X_engineered.values])
else:
    # Sparse matrix (TF-IDF)
    X_combined = hstack([X_text, X_engineered.values])

y = combined_df['label'].values

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42, stratify=y
)

# Train model
model = LogisticRegression(max_iter=1000, class_weight='balanced')
# OR
# model = RandomForestClassifier(n_estimators=100, class_weight='balanced')

model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.86      0.80      7101
           1       0.86      0.76      0.80      8235

    accuracy                           0.80     15336
   macro avg       0.81      0.81      0.80     15336
weighted avg       0.81      0.80      0.80     15336

[[6099 1002]
 [2014 6221]]


In [22]:
import pandas as pd

# Load your cleaned data
df = pd.read_csv('cleaned_combined_emails.csv')

# Inspect the data
print("Column names:")
print(df.columns.tolist())
print(f"\nShape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nData types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())

Column names:
['subject', 'body', 'label', 'sender', 'receiver', 'date', 'urls', 'source', 'subject_clean', 'body_clean', 'subject_length', 'body_length']

Shape: (76677, 12)

First few rows:
                                             subject  \
0                          Never agree to be a loser   
1                             Befriend Jenna Jameson   
2                               CNN.com Daily Top 10   
3  Re: svn commit: r619753 - in /spamassassin/tru...   
4                         SpecialPricesPharmMoreinfo   

                                                body  label  \
0  Buck up, your troubles caused by small dimensi...      1   
1  \nUpgrade your sex and pleasures with these te...      1   
2  >+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...      1   
3  Would anyone object to removing .so from this ...      0   
4  \nWelcomeFastShippingCustomerSupport\nhttp://7...      1   

                                              sender  \
0                   Young Esposito <