In [1]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from collections import Counter
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
# NOTE(review): with no category or module argument this suppresses EVERY
# warning for the rest of the kernel session, including pandas/sklearn
# deprecation and dtype warnings. Consider narrowing it to the specific
# warning that motivated the filter.
warnings.filterwarnings('ignore')

# Visible confirmation that the import cell ran to completion.
print("‚úÖ Libraries imported successfully!")


‚úÖ Libraries imported successfully!


In [2]:
# Cell 2: Utility Functions

def shannon_entropy(text):
    """Calculate the Shannon entropy (bits per character) of a string.

    Args:
        text: Value to measure. Non-string values are converted with ``str()``
            and measured on their textual form. ``None`` and float NaN
            (pandas' missing-value marker) are treated as empty.

    Returns:
        Entropy in bits per character; 0 for empty or missing input.
    """
    # Missing values: None, or NaN (the only float that is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return 0

    # Convert BEFORE measuring: len() on a non-string cell value (int, float)
    # raises TypeError, which previously aborted a whole DataFrame.apply().
    text = str(text)
    if not text:
        return 0

    total = len(text)
    counts = Counter(text)
    # Every count is >= 1, so each probability is strictly positive and
    # log2 is always defined.
    return -sum((c / total) * math.log2(c / total) for c in counts.values())


def extract_domain(url):
    """Extract the lower-cased host name from a URL or bare domain string.

    Args:
        url: A full URL (``http://host/path``) or a bare domain
            (``host.tld``). ``None`` and float NaN are treated as missing.

    Returns:
        The host name in lower case, without userinfo (``user@``) or port
        (``:8080``). Returns ``""`` for missing or unparseable input.
    """
    # Missing values must not turn into the pseudo-domains "none" / "nan".
    if url is None or (isinstance(url, float) and url != url):
        return ""

    # Surrounding whitespace and angle brackets (e.g. "gmail.com>" left over
    # from an email header) are never part of a host name.
    text = str(url).strip(" \t\r\n<>")

    try:
        parsed = urlparse(text)
        if parsed.netloc:
            # .hostname drops "user:pass@" and ":port", so
            # "http://paypal.com@evil.com/" resolves to the real host.
            host = parsed.hostname or ""
        else:
            # No scheme: urlparse puts everything in .path, so the first
            # path segment is the host. Drop any "user@" prefix as well.
            host = parsed.path.split('/')[0].rsplit('@', 1)[-1]
    except ValueError:
        # urlparse raises ValueError for malformed netlocs (e.g. an
        # unbalanced "[" in an IPv6 literal).
        return ""

    return host.lower()


def count_subdomains(url):
    """Return the number of host labels in front of the last two.

    The host is obtained via ``extract_domain`` and split on dots; everything
    beyond the final two labels is counted. An empty host, or a host with two
    or fewer labels, yields 0.
    """
    host = extract_domain(url)
    if host:
        label_total = len(host.split('.'))
        return max(0, label_total - 2)
    return 0


def has_ip_address(url):
    """Check whether a URL contains a dotted-quad IPv4 address.

    Args:
        url: URL (or any value; it is converted with ``str()``).

    Returns:
        1 if the text contains four dot-separated numbers that each fit in
        an octet (0-255) and are not part of a longer digit run, else 0.
    """
    # The lookarounds stop the pattern from matching inside longer digit
    # runs (e.g. "1.2.3.4567"), which the unanchored pattern accepted.
    ip_pattern = r'(?<!\d)(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})(?!\d)'
    for match in re.finditer(ip_pattern, str(url)):
        # \d{1,3} alone allows values up to 999; real octets stop at 255.
        if all(int(octet) <= 255 for octet in match.groups()):
            return 1
    return 0


def is_suspicious_tld(tld):
    """Check whether a TLD is on the list of TLDs commonly used for phishing.

    Args:
        tld: Top-level domain, with or without a leading dot (``".tk"`` and
            ``"tk"`` are equivalent). Case and surrounding whitespace are
            ignored.

    Returns:
        1 if the TLD is on the suspicious list, else 0.
    """
    suspicious_tlds = {'tk', 'ml', 'ga', 'cf', 'gq', 'xyz', 'top',
                       'work', 'click', 'link', 'download'}
    # Normalize to the bare label so a dataset that stores "tk" rather than
    # ".tk" is still matched.
    normalized = str(tld).strip().lower().lstrip('.')
    return 1 if normalized in suspicious_tlds else 0


def count_suspicious_chars(url):
    """Return the combined number of suspicious tokens found in a URL.

    The tokens are '@', '~', '%20', '..' and '//'. Each token is counted with
    ``str.count`` (non-overlapping occurrences) on the string form of ``url``
    and the per-token counts are added together.
    """
    url_text = str(url)
    total = 0
    for token in ('@', '~', '%20', '..', '//'):
        total += url_text.count(token)
    return total


def extract_brand_keywords(text):
    """Return the known brand names that occur in ``text``.

    Matching is a case-insensitive substring test against a fixed brand list.
    The hits are returned as one space-separated string, ordered as in the
    brand list; an empty string means nothing matched.
    """
    known_brands = (
        'paypal', 'amazon', 'google', 'microsoft', 'apple', 'facebook',
        'netflix', 'bank', 'ebay', 'linkedin', 'instagram', 'twitter',
    )
    haystack = str(text).lower()

    hits = []
    for name in known_brands:
        if name in haystack:
            hits.append(name)

    # Joining an empty list already yields "".
    return ' '.join(hits)


def calculate_domain_trust_score(domain_age, web_who_is, web_https, web_tld):
    """Calculate a heuristic domain trust score from four signals.

    Args:
        domain_age: Age of the domain; values that cannot be converted to a
            number count as 0. The thresholds (30, 180, 365, 3 * 365)
            suggest the unit is days (TODO confirm against the dataset).
        web_who_is: WHOIS status; only ``'complete'`` (any case) earns credit.
        web_https: HTTPS flag; only ``'yes'`` (any case) earns credit.
        web_tld: Top-level domain, with or without a leading dot.

    Returns:
        Score clamped to the range 0 (suspicious) to 1 (trusted).
    """
    score = 0.0

    # Age factor (0-0.4). Unparseable or missing ages simply earn no credit.
    try:
        age = float(domain_age) if domain_age else 0
    except (TypeError, ValueError):
        age = 0
    if age > 365 * 3:
        score += 0.4
    elif age > 365:
        score += 0.3
    elif age > 180:
        score += 0.2
    elif age > 30:
        score += 0.1

    # WHOIS completeness (0-0.2)
    if str(web_who_is).lower() == 'complete':
        score += 0.2

    # HTTPS (0-0.2)
    if str(web_https).lower() == 'yes':
        score += 0.2

    # TLD reputation (-0.1 to 0.2). Compare on the bare label so that both
    # "com" and ".com" are recognised.
    tld = str(web_tld).strip().lower().lstrip('.')
    trusted_tlds = {'com', 'org', 'net', 'edu', 'gov'}
    if tld in trusted_tlds:
        score += 0.2
    elif is_suspicious_tld('.' + tld):
        # The dotted form is the spelling the suspicious-TLD list is written in.
        score -= 0.1

    return max(0, min(1, score))


def calculate_semantic_coherence(email_text, url_text, content_text):
    """Calculate semantic coherence between email, URL, and content text.

    The non-empty texts are vectorized with TF-IDF (at most 50 features,
    English stop words removed) and the score is the mean cosine similarity
    over all pairs.

    Args:
        email_text: Text taken from the email side.
        url_text: URL text.
        content_text: Page content text.

    Returns:
        A float where higher means more coherent. Returns the neutral value
        0.5 when fewer than two texts are non-empty or when the texts cannot
        be vectorized.
    """
    texts = [str(t).lower() for t in (email_text, url_text, content_text)]
    non_empty = [t for t in texts if t.strip()]

    # At least two documents are needed for a pairwise comparison; decide
    # this before doing any vectorizer work.
    if len(non_empty) < 2:
        return 0.5

    try:
        vectorizer = TfidfVectorizer(max_features=50, stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(non_empty)
    except ValueError:
        # Raised by scikit-learn when no usable vocabulary remains (e.g. the
        # texts contain only stop words). Other exceptions are real bugs and
        # are allowed to surface instead of silently becoming 0.5.
        return 0.5

    # One call yields the full similarity matrix; average its upper triangle.
    pairwise = cosine_similarity(tfidf_matrix)
    doc_count = len(non_empty)
    similarities = [
        pairwise[i][j]
        for i in range(doc_count)
        for j in range(i + 1, doc_count)
    ]
    return float(np.mean(similarities))


def calculate_email_url_similarity(email_domain, url_domain):
    """Calculate similarity between the email domain and the URL domain.

    Args:
        email_domain: Sender domain.
        url_domain: Domain of the linked URL.

    Returns:
        1.0 for identical domains (ignoring case and a leading ``www.``),
        0.7 when one domain is contained in the other, otherwise the Jaccard
        similarity of the two domains' character sets. Returns 0.0 when
        either domain is empty.
    """
    if not email_domain or not url_domain:
        return 0.0

    def _normalize(domain):
        # Strip "www." only as a prefix; replacing it anywhere would rewrite
        # unrelated hosts (e.g. "mywww.shop.com" -> "myshop.com").
        domain = str(domain).lower()
        return domain[4:] if domain.startswith('www.') else domain

    email_domain = _normalize(email_domain)
    url_domain = _normalize(url_domain)

    # A bare "www." normalizes to "", and "" is a substring of everything,
    # which would otherwise score 0.7.
    if not email_domain or not url_domain:
        return 0.0

    if email_domain == url_domain:
        return 1.0

    if email_domain in url_domain or url_domain in email_domain:
        return 0.7

    email_chars = set(email_domain)
    url_chars = set(url_domain)
    union = len(email_chars | url_chars)

    return len(email_chars & url_chars) / union if union > 0 else 0.0


def brand_consistency(email_brand, content_brand):
    """Score whether the email and the page content mention a common brand.

    Both arguments are space-separated brand strings. Returns 1.0 when they
    share at least one brand, 0.0 when they share none but at least one side
    has a brand token, and the neutral 0.5 when either input is empty or
    neither side yields any token.
    """
    if not (email_brand and content_brand):
        return 0.5

    email_tokens = set(email_brand.split())
    content_tokens = set(content_brand.split())

    if email_tokens & content_tokens:
        return 1.0
    if not email_tokens and not content_tokens:
        # Both inputs were whitespace only: nothing to compare.
        return 0.5
    return 0.0

# Visible confirmation that every utility function in this cell was defined.
print("‚úÖ Utility functions defined!")

‚úÖ Utility functions defined!


In [3]:
# Cell 3: Main Feature Engineering Function

def engineer_features(df):
    """Return a copy of ``df`` extended with the engineered features.

    The input frame is left untouched. Features are appended in seven phases
    (structural, entropy, domain trust, email/URL consistency, content,
    semantic coherence, brand consistency). A progress line is printed per
    phase, followed by a summary of the column counts.

    Args:
        df: DataFrame providing the columns indexed below: ``web_url``,
            ``web_content``, ``web_tld``, ``web_js_len``, ``web_js_obf_len``
            and ``email_from_domain``. ``domain_age``, ``web_who_is`` and
            ``web_https`` are read with ``row.get`` and fall back to defaults
            when absent.

    Returns:
        A new DataFrame with the original columns plus the added features.
    """
    rule = "=" * 60

    def _char_counter(symbol):
        # Element-wise counter for one literal substring.
        return lambda value: str(value).count(symbol)

    def _tag_counter(tag):
        # Same idea, but case-insensitive (used for HTML tag prefixes).
        return lambda value: str(value).lower().count(tag)

    print(rule)
    print("STARTING FEATURE ENGINEERING")
    print(rule)

    df_new = df.copy()
    urls = df_new['web_url']
    pages = df_new['web_content']

    # ---- Phase 1: structural features of the URL / domain ----
    print("\nüìä Phase 1: Structural Features...")

    # +1 in the denominator avoids division by zero for pages without JS.
    df_new['js_obfuscation_ratio'] = df_new['web_js_obf_len'] / (df_new['web_js_len'] + 1)
    df_new['url_has_ip'] = urls.apply(has_ip_address)

    char_count_columns = (
        ('url_num_dots', '.'),
        ('url_num_hyphens', '-'),
        ('url_num_underscores', '_'),
        ('url_num_slashes', '/'),
        ('url_num_queries', '?'),
        ('url_num_ampersands', '&'),
    )
    for column, symbol in char_count_columns:
        df_new[column] = urls.apply(_char_counter(symbol))

    df_new['url_suspicious_chars'] = urls.apply(count_suspicious_chars)
    df_new['domain_num_subdomains'] = urls.apply(count_subdomains)
    df_new['domain_contains_numbers'] = urls.apply(
        lambda value: int(bool(re.search(r'\d', extract_domain(str(value)))))
    )
    df_new['domain_suspicious_tld'] = df_new['web_tld'].apply(is_suspicious_tld)

    print("   ‚úì Added 12 structural features")

    # ---- Phase 2: entropy features ----
    print("\nüî¢ Phase 2: Entropy Features...")

    df_new['url_entropy'] = urls.apply(shannon_entropy)
    df_new['domain_entropy'] = urls.apply(
        lambda value: shannon_entropy(extract_domain(str(value)))
    )
    # Only the first 1000 characters of the content are measured.
    df_new['content_entropy'] = pages.apply(
        lambda value: shannon_entropy(str(value)[:1000])
    )

    print("   ‚úì Added 3 entropy features")

    # ---- Phase 3: domain trust score ----
    print("\nüõ°Ô∏è  Phase 3: Domain Trust Score...")

    def _trust_for_row(row):
        return calculate_domain_trust_score(
            row.get('domain_age', 0),
            row.get('web_who_is', ''),
            row.get('web_https', ''),
            row.get('web_tld', ''),
        )

    df_new['domain_trust_score'] = df_new.apply(_trust_for_row, axis=1)

    print("   ‚úì Added 1 trust score feature")

    # ---- Phase 4: email / URL consistency ----
    print("\nüîó Phase 4: Email-URL Consistency...")

    # Temporary helper columns; removed again at the end of this phase.
    df_new['email_domain_extracted'] = df_new['email_from_domain'].apply(extract_domain)
    df_new['url_domain_extracted'] = urls.apply(extract_domain)

    domains_equal = df_new['email_domain_extracted'] == df_new['url_domain_extracted']
    df_new['email_domain_matches_url'] = domains_equal.astype(int)

    def _similarity_for_row(row):
        return calculate_email_url_similarity(
            row['email_domain_extracted'],
            row['url_domain_extracted'],
        )

    df_new['email_url_domain_similarity'] = df_new.apply(_similarity_for_row, axis=1)

    df_new = df_new.drop(['email_domain_extracted', 'url_domain_extracted'], axis=1)

    print("   Added 2 consistency features")

    # ---- Phase 5: content analysis ----
    print("\nüìù Phase 5: Content Analysis...")

    tag_count_columns = (
        ('content_num_forms', '<form'),
        ('content_num_inputs', '<input'),
        ('content_num_scripts', '<script'),
    )
    for column, tag in tag_count_columns:
        df_new[column] = pages.apply(_tag_counter(tag))

    suspicious_keywords = ['verify', 'urgent', 'suspended', 'account', 'confirm',
                           'password', 'update', 'click', 'login', 'security']

    def _keyword_hits(value):
        # Number of distinct keywords present (not total occurrences).
        lowered = str(value).lower()
        return sum(1 for keyword in suspicious_keywords if keyword in lowered)

    df_new['content_suspicious_keywords'] = pages.apply(_keyword_hits)

    print("    Added 4 content features")

    # ---- Phase 6: semantic coherence ----
    print("\nüß† Phase 6: Semantic Coherence...")

    def _coherence_for_row(row):
        # Only the first 500 characters of the content take part.
        return calculate_semantic_coherence(
            str(row.get('email_from_domain', '')),
            str(row.get('web_url', '')),
            str(row.get('web_content', ''))[:500],
        )

    df_new['semantic_coherence_score'] = df_new.apply(_coherence_for_row, axis=1)

    print("    Added 1 semantic feature")

    # ---- Phase 7: brand consistency ----
    print("\nüè∑Ô∏è  Phase 7: Brand Consistency...")

    # Temporary helper columns; removed again at the end of this phase.
    df_new['email_brand_extracted'] = df_new['email_from_domain'].apply(extract_brand_keywords)
    df_new['content_brand_extracted'] = pages.apply(
        lambda value: extract_brand_keywords(str(value)[:500])
    )

    def _brand_score_for_row(row):
        return brand_consistency(
            row['email_brand_extracted'],
            row['content_brand_extracted'],
        )

    df_new['brand_consistency_score'] = df_new.apply(_brand_score_for_row, axis=1)

    df_new = df_new.drop(['email_brand_extracted', 'content_brand_extracted'], axis=1)

    print("    Added 1 brand consistency feature")

    # ---- Summary ----
    original_count = len(df.columns)
    total_count = len(df_new.columns)
    new_features_count = total_count - original_count

    print("\n" + rule)
    print("‚úÖ FEATURE ENGINEERING COMPLETE!")
    print(rule)
    print(f"Original features: {original_count}")
    print(f"New features added: {new_features_count}")
    print(f"Total features: {total_count}")
    print(rule)

    return df_new

# Visible confirmation that engineer_features was defined by this cell.
print("‚úÖ Main function defined!")

‚úÖ Main function defined!


In [4]:
# Cell 4: Load Your Dataset

# Location of the raw feature table (relative to the notebook's working directory)
DATASET_PATH = 'new_dataset_classical.csv'

print("\nüìÇ Loading dataset...")
df = pd.read_csv(DATASET_PATH)

# Drop every column whose name starts with "Unnamed"
# (the accidental empty columns mentioned in the original note)
unnamed_mask = df.columns.str.contains("^Unnamed")
df = df.loc[:, ~unnamed_mask]

# Quick overview of what was loaded
print("‚úÖ Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print("\nColumns in original dataset:")
print(list(df.columns))
print("\nFirst 3 rows:")
print(df.head(3))


üìÇ Loading dataset...
‚úÖ Dataset loaded successfully!
Shape: (8000, 15)

Columns in original dataset:
['email_subject_len', 'email_has_urgent_keyword', 'email_from_domain', 'web_url', 'web_url_len', 'web_ip_add', 'web_geo_loc', 'web_tld', 'web_who_is', 'web_https', 'web_js_len', 'web_js_obf_len', 'web_content', 'domain_age', 'final_label']

First 3 rows:
   email_subject_len  email_has_urgent_keyword              email_from_domain  \
0                 32                         0  spamassassin.zones.apache.org   
1                 46                         0                     gmail.com>   
2                 21                         0                telefonica.net>   

                              web_url  web_url_len       web_ip_add  \
0  http://tools.ietf.org/html/rfc1583           34     30.180.42.35   
1         http://www.quickfixgolf.com           27     150.66.16.42   
2           http://www.lvnazarene.org           25  180.123.185.229   

     web_geo_loc web_tld web_

In [5]:
# Cell 5: Run Feature Engineering

# Banner: blank line, rule, title, rule (one item per output line)
print("\n" + "=" * 60, "RUNNING FEATURE ENGINEERING...", "=" * 60, sep="\n")

# Build the enhanced frame; the loaded `df` itself is copied, not modified
df_enhanced = engineer_features(df)


RUNNING FEATURE ENGINEERING...
STARTING FEATURE ENGINEERING

üìä Phase 1: Structural Features...
   ‚úì Added 12 structural features

üî¢ Phase 2: Entropy Features...
   ‚úì Added 3 entropy features

üõ°Ô∏è  Phase 3: Domain Trust Score...
   ‚úì Added 1 trust score feature

üîó Phase 4: Email-URL Consistency...
   Added 2 consistency features

üìù Phase 5: Content Analysis...
    Added 4 content features

üß† Phase 6: Semantic Coherence...
    Added 1 semantic feature

üè∑Ô∏è  Phase 7: Brand Consistency...
    Added 1 brand consistency feature

‚úÖ FEATURE ENGINEERING COMPLETE!
Original features: 15
New features added: 24
Total features: 39


In [6]:
# Cell 6: Save FINAL Dataset
# NOTE(review): this header previously read "Cell 8", but the cell runs as
# In [6] directly after Cell 5 and no Cells 6-7 are visible in this notebook.

output_file = 'email_phishing_dataset_FINAL.csv'

print("\nüíæ Saving FINAL dataset...")
# index=False keeps the DataFrame index out of the CSV
df_enhanced.to_csv(output_file, index=False)
print("‚úÖ Saved to: " + output_file)



üíæ Saving FINAL dataset...
‚úÖ Saved to: email_phishing_dataset_FINAL.csv
