In [1]:
import pandas as pd
import numpy as np
import re

# ============================================
# STEP 1: LOAD YOUR 4 DATASETS 
# ============================================

print("="*60)
print("STEP 1: LOADING DATASETS")
print("="*60)

ceas = pd.read_csv('CEAS_08.csv')
nazario = pd.read_csv('Nazario.csv')
nigerian = pd.read_csv('Nigerian_Fraud.csv')
spam_assassin = pd.read_csv('SpamAssasin.csv')

print(f"✓ CEAS_08:       {ceas.shape}")
print(f"✓ Nazario:       {nazario.shape}")
print(f"✓ Nigerian Fraud: {nigerian.shape}")
print(f"✓ SpamAssassin:  {spam_assassin.shape}")
print(f"\nTotal: {ceas.shape[0] + nazario.shape[0] + nigerian.shape[0] + spam_assassin.shape[0]:,} samples")

STEP 1: LOADING DATASETS
✓ CEAS_08:       (39154, 7)
✓ Nazario:       (1565, 7)
✓ Nigerian Fraud: (3332, 7)
✓ SpamAssassin:  (5809, 7)

Total: 49,860 samples


In [2]:
# ============================================
# STEP 2: STANDARDIZE AND COMBINE
# ============================================

print("\n" + "="*60)
print("STEP 2: STANDARDIZING SCHEMAS")
print("="*60)

def standardize_dataset(df, source_name):
    """Standardize dataset to common schema"""
    std_df = df.copy()
    std_df['source'] = source_name
    
    # Clean columns
    std_df['subject'] = std_df['subject'].fillna('').astype(str)
    std_df['body'] = std_df['body'].fillna('').astype(str)
    std_df['sender'] = std_df['sender'].fillna('').astype(str)
    std_df['receiver'] = std_df['receiver'].fillna('').astype(str)
    std_df['date'] = std_df['date'].fillna('').astype(str)
    
    # Standardize urls to binary
    std_df['urls'] = pd.to_numeric(std_df['urls'], errors='coerce').fillna(0).astype(int)
    std_df['urls'] = (std_df['urls'] > 0).astype(int)
    
    # Ensure label is int
    std_df['label'] = std_df['label'].astype(int)
    
    # Consistent order
    columns = ['source', 'sender', 'receiver', 'date', 'subject', 'body', 'urls', 'label']
    return std_df[columns]

# Standardize all
ceas_std = standardize_dataset(ceas, 'ceas')
nazario_std = standardize_dataset(nazario, 'nazario')
nigerian_std = standardize_dataset(nigerian, 'nigerian')
spam_assassin_std = standardize_dataset(spam_assassin, 'spam_assassin')

print("✓ All datasets standardized")

# Combine
unified_df = pd.concat([
    ceas_std,
    nazario_std,
    nigerian_std,
    spam_assassin_std
], ignore_index=True)

print(f"✓ Combined shape: {unified_df.shape}")


STEP 2: STANDARDIZING SCHEMAS
✓ All datasets standardized
✓ Combined shape: (49860, 8)


In [3]:
# ============================================
# STEP 3: EXTRACT FEATURES (WITH ROBUST ERROR HANDLING)
# ============================================

print("\n" + "="*60)
print("STEP 3: EXTRACTING FEATURES")
print("="*60)

# 1. Sender domain
def extract_domain(email):
    """Extract domain from email address"""
    if pd.isna(email) or email == '' or '@' not in str(email):
        return None
    try:
        parts = str(email).split('@')
        if len(parts) == 2:
            domain = parts[-1].strip().lower()
            # Remove any trailing characters that aren't part of domain
            domain = re.sub(r'[>\s].*$', '', domain)
            return domain if domain else None
        return None
    except Exception as e:
        return None

unified_df['sender_domain'] = unified_df['sender'].apply(extract_domain)
valid_domains = unified_df['sender_domain'].notna().sum()
print(f"  ✓ Extracted sender domains: {valid_domains:,} valid domains")

# 2. Extract URLs from body (ROBUST VERSION)
def extract_urls(text):
    """Extract URLs with better error handling"""
    if pd.isna(text) or text == '':
        return []
    
    try:
        # More permissive URL pattern
        pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
        urls = re.findall(pattern, str(text))
        
        # Clean URLs - remove trailing punctuation
        cleaned_urls = []
        for url in urls:
            # Remove common trailing punctuation
            url = re.sub(r'[.,;:!?\)\]]+$', '', url)
            if url:
                cleaned_urls.append(url)
        
        return cleaned_urls
    except Exception as e:
        return []

unified_df['url_list'] = unified_df['body'].apply(extract_urls)
unified_df['num_urls'] = unified_df['url_list'].apply(len)
unified_df['urls'] = ((unified_df['urls'] == 1) | (unified_df['num_urls'] > 0)).astype(int)

emails_with_urls = unified_df['urls'].sum()
total_urls = unified_df['num_urls'].sum()
print(f"  ✓ Extracted URLs: {emails_with_urls:,} emails contain {total_urls:,} total URLs")

# 3. Extract domains from URLs (ROBUST VERSION)
def extract_domain_from_url(url):
    """Extract domain from URL with robust error handling"""
    try:
        # Remove protocol
        url = re.sub(r'^https?://', '', url)
        # Take first part (domain and path)
        url = url.split('/')[0]
        # Remove port if present
        url = url.split(':')[0]
        # Remove username/password if present
        if '@' in url:
            url = url.split('@')[-1]
        return url.lower() if url else None
    except:
        return None

def get_url_domains(url_list):
    """Extract domains from list of URLs"""
    domains = []
    for url in url_list:
        domain = extract_domain_from_url(url)
        if domain:
            domains.append(domain)
    return list(set(domains))  # Remove duplicates

unified_df['url_domains'] = unified_df['url_list'].apply(get_url_domains)
print(f"  ✓ Extracted URL domains")


STEP 3: EXTRACTING FEATURES
  ✓ Extracted sender domains: 48,310 valid domains
  ✓ Extracted URLs: 33,843 emails contain 156,620 total URLs
  ✓ Extracted URL domains


In [4]:
# 4. Urgent keywords
urgent_keywords = [
    'urgent', 'immediate', 'action required', 'verify', 'suspend',
    'confirm', 'expire', 'click here', 'act now', 'account locked',
    'security alert', 'unusual activity', 'reset password', 'dear customer',
    'dear user', 'congratulations', 'limited time', 'respond now'
]

def has_urgent(text):
    """Check for urgent keywords"""
    try:
        text_lower = str(text).lower()
        return int(any(kw in text_lower for kw in urgent_keywords))
    except:
        return 0

def count_urgent(text):
    """Count urgent keywords"""
    try:
        text_lower = str(text).lower()
        return sum(1 for kw in urgent_keywords if kw in text_lower)
    except:
        return 0

unified_df['has_urgent_words'] = (unified_df['subject'] + ' ' + unified_df['body']).apply(has_urgent)
unified_df['num_urgent_keywords'] = (unified_df['subject'] + ' ' + unified_df['body']).apply(count_urgent)

emails_with_urgent = unified_df['has_urgent_words'].sum()
print(f"  ✓ Detected urgent keywords: {emails_with_urgent:,} emails")

  ✓ Detected urgent keywords: 9,648 emails


In [5]:
# 5. Text lengths
unified_df['subject_length'] = unified_df['subject'].apply(len)
unified_df['body_length'] = unified_df['body'].apply(len)
print(f"  ✓ Calculated text lengths")

  ✓ Calculated text lengths


In [6]:
# 6. Suspicious domain patterns
def is_suspicious_domain(domain):
    """Check for suspicious patterns in domain"""
    if pd.isna(domain) or domain == '':
        return 0
    
    try:
        domain_lower = str(domain).lower()
        suspicious_patterns = [
            '-',           # Hyphens often used in phishing
            'secure',
            'verify',
            'account',
            'update',
            'confirm',
            'login',
            'banking',
            'alert'
        ]
        return int(any(pattern in domain_lower for pattern in suspicious_patterns))
    except:
        return 0

unified_df['sender_domain_suspicious'] = unified_df['sender_domain'].apply(is_suspicious_domain)
suspicious_domains = unified_df['sender_domain_suspicious'].sum()
print(f"  ✓ Suspicious domains detected: {suspicious_domains:,}")

  ✓ Suspicious domains detected: 3,096


In [7]:
# 7. Domain-brand mismatch (CRITICAL PHISHING INDICATOR)
def check_mismatch(row):
    """
    Check if mentioned brands don't match sender domain
    This is a KEY phishing indicator!
    """
    try:
        # Need both domain and brands
        if pd.isna(row['sender_domain']) or not row['mentioned_brands']:
            return 0
        
        sender_domain = str(row['sender_domain']).lower()
        
        # Check if any mentioned brand appears in sender domain
        for brand in row['mentioned_brands']:
            brand_clean = brand.replace(' ', '').replace('-', '')
            if brand_clean in sender_domain:
                return 0  # Match found - likely legitimate
        
        # No match = MISMATCH = SUSPICIOUS!
        return 1
    except:
        return 0

unified_df['domain_brand_mismatch'] = unified_df.apply(check_mismatch, axis=1)
mismatches = unified_df['domain_brand_mismatch'].sum()
print(f"  ✓ Domain-brand mismatches: {mismatches:,} (KEY PHISHING INDICATOR!)")

  ✓ Domain-brand mismatches: 0 (KEY PHISHING INDICATOR!)


In [8]:
# 8. Receiver domain
unified_df['receiver_domain'] = unified_df['receiver'].apply(extract_domain)

In [9]:
# 9. Same domain check
def same_domain_check(row):
    """Check if sender and receiver are from same domain"""
    try:
        if pd.isna(row['sender_domain']) or pd.isna(row['receiver_domain']):
            return 0
        return int(row['sender_domain'] == row['receiver_domain'])
    except:
        return 0

unified_df['sender_receiver_same_domain'] = unified_df.apply(same_domain_check, axis=1)

print("\n✓ All features extracted successfully!")


✓ All features extracted successfully!


In [10]:
# ============================================
# STEP 4: CREATE METADATA VECTORS
# ============================================

print("\n" + "="*60)
print("STEP 4: CREATING METADATA VECTORS")
print("="*60)

def create_metadata_vector(row):
    """Create 20-dimensional metadata vector"""
    try:
        return [
            # URL features (5)
            float(row['urls']),
            min(float(row['num_urls']) / 10.0, 1.0),
            float(len(row['url_domains']) > 0),
            float(row['num_urls'] > 3),
            float(len(row['url_domains']) > 1),
            
            # Text content features (5)
            float(row['has_urgent_words']),
            min(float(row['num_urgent_keywords']) / 5.0, 1.0),
            float(row['has_brand_mention']),
            min(row['subject_length'] / 100.0, 1.0),
            min(row['body_length'] / 2000.0, 1.0),
            
            # Domain/sender features (6)
            float(row['sender_domain_suspicious']),
            float(row['domain_brand_mismatch']),
            float(pd.notna(row['sender_domain'])),
            float(pd.notna(row['receiver_domain'])),
            float(row['sender_receiver_same_domain']),
            min(float(row['num_brands_mentioned']) / 3.0, 1.0),
            
            # Source one-hot (4)
            float(row['source'] == 'ceas'),
            float(row['source'] == 'nazario'),
            float(row['source'] == 'nigerian'),
            float(row['source'] == 'spam_assassin'),
        ]
    except Exception as e:
        # Return zero vector if something goes wrong
        return [0.0] * 20

unified_df['metadata_vector'] = unified_df.apply(create_metadata_vector, axis=1)
print("  ✓ Created 20-dimensional metadata vectors")


STEP 4: CREATING METADATA VECTORS
  ✓ Created 20-dimensional metadata vectors


In [11]:
# ============================================
# STEP 5: CLEAN AND VALIDATE
# ============================================

print("\n" + "="*60)
print("STEP 5: CLEANING AND VALIDATING")
print("="*60)

# Check for any remaining issues
print(f"Total rows: {len(unified_df):,}")
print(f"Missing values in key columns:")
print(f"  subject: {unified_df['subject'].isna().sum()}")
print(f"  body: {unified_df['body'].isna().sum()}")
print(f"  label: {unified_df['label'].isna().sum()}")

# Remove any rows with missing labels (shouldn't happen but just in case)
before_clean = len(unified_df)
unified_df = unified_df.dropna(subset=['label'])
after_clean = len(unified_df)

if before_clean != after_clean:
    print(f"  ⚠️  Removed {before_clean - after_clean} rows with missing labels")

print("\n✓ Dataset cleaned and validated")


STEP 5: CLEANING AND VALIDATING
Total rows: 49,860
Missing values in key columns:
  subject: 0
  body: 0
  label: 0

✓ Dataset cleaned and validated


In [13]:
# ============================================
# STEP 6: SAVE DATASET
# ============================================

print("\n" + "="*60)
print("STEP 6: SAVING DATASET")
print("="*60)

output_file = 'dual_tower_text_features.csv'
unified_df.to_csv(output_file, index=False)

print(f"✓ Saved to: {output_file}")
print(f"  Shape: {unified_df.shape}")

# Print statistics
print("\n" + "="*60)
print("DATASET STATISTICS")
print("="*60)

print(f"\n--- Label Distribution ---")
label_counts = unified_df['label'].value_counts()
for label, count in label_counts.items():
    pct = (count / len(unified_df)) * 100
    name = "Phishing/Spam" if label == 1 else "Legitimate"
    print(f"  {name}: {count:,} ({pct:.1f}%)")

print(f"\n--- Source Distribution ---")
for source, count in unified_df['source'].value_counts().items():
    pct = (count / len(unified_df)) * 100
    print(f"  {source.upper()}: {count:,} ({pct:.1f}%)")

print(f"\n--- Feature Coverage ---")
print(f"  Valid sender domains: {unified_df['sender_domain'].notna().sum():,}")
print(f"  Emails with URLs: {unified_df['urls'].sum():,}")
print(f"  With urgent keywords: {unified_df['has_urgent_words'].sum():,}")
print(f"  Domain-brand mismatches: {unified_df['domain_brand_mismatch'].sum():,}")
print(f"  Suspicious domains: {unified_df['sender_domain_suspicious'].sum():,}")

print("\n" + "="*60)
print("DATASET CREATION COMPLETE!")
print("="*60)
print(f"✓ Ready for text model training")


STEP 6: SAVING DATASET
✓ Saved to: dual_tower_text_features.csv
  Shape: (49860, 21)

DATASET STATISTICS

--- Label Distribution ---
  Phishing/Spam: 28,457 (57.1%)
  Legitimate: 21,403 (42.9%)

--- Source Distribution ---
  CEAS: 39,154 (78.5%)
  SPAM_ASSASSIN: 5,809 (11.7%)
  NIGERIAN: 3,332 (6.7%)
  NAZARIO: 1,565 (3.1%)

--- Feature Coverage ---
  Valid sender domains: 48,310
  Emails with URLs: 33,843
  With urgent keywords: 9,648
  Domain-brand mismatches: 0
  Suspicious domains: 3,096

DATASET CREATION COMPLETE!
✓ Ready for text model training


In [14]:
import re
import json


common_phishing_brands = {
    # --- Financial Institutions & Banks ---
    'bank': [r'\bbank\b', r'\bbanking\b'],
    'chase': [r'\bchase\b'],
    'wells fargo': [r'\bwells[\s-]?fargo\b', r'\bwf\b'],
    'citibank': [r'\bcitibank\b', r'\bciti\b'],
    'boa': [r'\bbank[\s-]?of[\s-]?america\b', r'\bboa\b', r'\bbankofamerica\b'],
    'us bank': [r'\bus[\s-]?bank\b', r'\busbanks?\b'],
    'pnc': [r'\bpnc\b'],
    'capital one': [r'\bcapital[\s-]?one\b'],
    'barclays': [r'\bbarclays\b'],
    'hsbc': [r'\bhsbc\b'],
    'santander': [r'\bsantander\b'],
    'td bank': [r'\btd[\s-]?bank\b', r'\btds?\b'],
    'credit suisse': [r'\bcredit[\s-]?suisse\b'],
    'american express': [r'\bamerican[\s-]?express\b', r'\bamex\b'],
    'visa': [r'\bvisa\b'],
    'mastercard': [r'\bmastercard\b', r'\bmaster[\s-]?card\b'],
    'discover': [r'\bdiscover\b'],
    'revolut': [r'\brevolut\b'],
    'wise': [r'\bwise\b', r'\btransferwise\b'],
    'venmo': [r'\bvenmo\b'],
    'zelle': [r'\bzelle\b'],
    'cash app': [r'\bcash[\s-]?app\b', r'\bcashapp\b'],
    'sbi': [r'\bstate[\s-]?bank[\s-]?of[\s-]?india\b', r'\bsbi\b'],
    'hdfc': [r'\bhdfc\b', r'\bhdfc[\s-]?bank\b'],
    'icici': [r'\bicici\b'],
    'kotak': [r'\bkotak\b'],
    'axis bank': [r'\baxis[\s-]?bank\b'],
    'nab': [r'\bnab\b', r'\bnational[\s-]?australia[\s-]?bank\b'],
    'anz': [r'\banz\b', r'\baustralia[\s-]?and[\s-]?new[\s-]?zealand\b'],
    'standard chartered': [r'\bstandard[\s-]?chartered\b'],
    'ubs': [r'\bubs\b'],
    ' Deutsche bank': [r'\bdeutsche[\s-]?bank\b', r'\bdeutschebank\b'],

    # --- Payment & Fintech ---
    'paypal': [r'\bpaypal\b', r'\bpay[\s-]?pal\b'],
    'stripe': [r'\bstripe\b'],
    'square': [r'\bsquare\b'],
    'coinbase': [r'\bcoinbase\b'],
    'binance': [r'\bbinance\b'],
    'kraken': [r'\bkraken\b'],
    'robinhood': [r'\brobinhood\b'],
    'intuit': [r'\bintuit\b', r'\bturbotax\b'],
    'quickbooks': [r'\bquick[\s-]?books\b'],
    'authorize.net': [r'\bauthorize(?:\.|[\s-])?net\b'],
    'adyen': [r'\badyen\b'],
    'stripe connect': [r'\bstripe[\s-]?connect\b'],
    'payoneer': [r'\bpayoneer\b'],
    'squareup': [r'\bsquareup\b'],
    'apple pay': [r'\bapple[\s-]?pay\b'],
    'google pay': [r'\bgoogle[\s-]?pay\b', r'\bgpay\b'],
    'klarna': [r'\bklarna\b'],
    'afterpay': [r'\bafterpay\b'],

    # --- Tech Companies & Email Providers ---
    'microsoft': [r'\bmicrosoft\b', r'\bmicro[\s-]?soft\b', r'\boutlook\b', r'\boffice365\b', r'\bo365\b', r'\bmsft\b'],
    'google': [r'\bgoogle\b', r'\bgmail\b', r'\bgsuite\b', r'\bgoogle[\s-]?drive\b'],
    'yahoo': [r'\byahoo\b'],
    'apple': [r'\bapple\b', r'\bicloud\b', r'\bitunes\b', r'\bappleid\b'],
    'facebook': [r'\bfacebook\b', r'\bmeta\b'],
    'instagram': [r'\binstagram\b'],
    'twitter': [r'\btwitter\b', r'\bx\.com\b'],
    'linkedin': [r'\blinkedin\b'],
    'aol': [r'\baol\b'],
    'tiktok': [r'\btiktok\b'],
    'snapchat': [r'\bsnapchat\b'],
    'notion': [r'\bnotion\b'],
    'onedrive': [r'\bonedrive\b'],
    'icloud': [r'\bicloud\b'],
    'hotmail': [r'\bhotmail\b'],
    'msn': [r'\bmsn\b'],
    'protonmail': [r'\bprotonmail\b'],

    # --- Cloud & Developer Services / SaaS ---
    'aws': [r'\baws\b', r'\bamazon[\s-]?web[\s-]?services\b'],
    'azure': [r'\bazure\b', r'\bmicrosoft[\s-]?azure\b'],
    'github': [r'\bgithub\b'],
    'gitlab': [r'\bgitlab\b'],
    'dropbox': [r'\bdropbox\b'],
    'box': [r'\bbox\b'],
    'zoom': [r'\bzoom\b'],
    'slack': [r'\bslack\b'],
    'adobe': [r'\badobe\b', r'\bacrobat\b'],
    'salesforce': [r'\bsalesforce\b'],
    'workday': [r'\bworkday\b'],
    'okta': [r'\bokta\b'],
    'jira': [r'\bjira\b'],
    'confluence': [r'\bconfluence\b'],
    'zendesk': [r'\bzendesk\b'],
    'service now': [r'\bservice[\s-]?now\b', r'\bservicenow\b'],
    'dropbox paper': [r'\bdropbox[\s-]?paper\b'],

    # --- E-commerce & Marketplaces ---
    'amazon': [r'\bamazon\b', r'\bamaz[o0]n\b', r'\bamzn\b', r'\bamazon[\s-]?prime\b'],
    'ebay': [r'\bebay\b', r'\be[\s-]?bay\b'],
    'walmart': [r'\bwalmart\b'],
    'target': [r'\btarget\b'],
    'bestbuy': [r'\bbest[\s-]?buy\b'],
    'costco': [r'\bcostco\b'],
    'homedepot': [r'\bhome[\s-]?depot\b', r'\bthe[\s-]?home[\s-]?depot\b'],
    'lowes': [r"\blowes\b", r"\blowe\'?s\b"],
    'alibaba': [r'\balibaba\b'],
    'flipkart': [r'\bflipkart\b'],
    'shein': [r'\bshein\b'],
    'etsy': [r'\betsy\b'],
    'craigslist': [r'\bcraigslist\b'],

    # --- Shipping & Logistics ---
    'fedex': [r'\bfedex\b', r'\bfed[\s-]?ex\b'],
    'ups': [r'\bups\b'],
    'dhl': [r'\bdhl\b'],
    'usps': [r'\busps\b', r'\bpostal[\s-]?service\b'],
    'royal mail': [r'\broyal[\s-]?mail\b'],
    'canada post': [r'\bcanada[\s-]?post\b'],
    'australia post': [r'\baustralia[\s-]?post\b'],
    'hermes': [r'\bhermes\b'],
    'yodel': [r'\byodel\b'],
    'gls': [r'\bgls\b'],
    'dpd': [r'\bdpd\b'],
    'postnl': [r'\bpostnl\b'],

    # --- Streaming, Media & Entertainment ---
    'netflix': [r'\bnetflix\b'],
    'hulu': [r'\bhulu\b'],
    'disneyplus': [r'\bdisney[\s-]?plus\b', r'\bdisneyplus\b'],
    'spotify': [r'\bspotify\b'],
    'hbo': [r'\bhbo\b', r'\bhbo[\s-]?max\b', r'\bhbomax\b'],
    'paramount': [r'\bparamount\b'],
    'peacock': [r'\bpeacock\b'],
    'twitch': [r'\btwitch\b'],
    'prime video': [r'\bprime[\s-]?video\b', r'\bamazon[\s-]?prime\b'],
    'apple tv': [r'\bapple[\s-]?tv\b'],
    'soundcloud': [r'\bsoundcloud\b'],

    # --- Telecom & ISP ---
    'at&t': [r'\bat[\s&]?t\b', r'\batandt\b'],
    'verizon': [r'\bverizon\b'],
    't-mobile': [r'\bt[\s-]?mobile\b', r'\btmobile\b'],
    'spectrum': [r'\bspectrum\b'],
    'comcast': [r'\bcomcast\b', r'\bxfinity\b'],
    'vodafone': [r'\bvodafone\b'],
    'bt': [r'\bbt\b', r'\bbritish[\s-]?telecom\b'],
    'rogers': [r'\brogers\b'],
    'telstra': [r'\btelstra\b'],
    'bell': [r'\bbell\b'],
    'orange': [r'\borange\b'],
    'three': [r'\bthree\b', r'\b3mobile\b'],

    # --- Government, Tax & Official Bodies ---
    'irs': [r'\birs\b'],
    'ssa': [r'\bssa\b', r'\bsocial[\s-]?security\b'],
    'medicare': [r'\bmedicare\b'],
    'gov': [r'\bgov\b', r'\bgovernment\b', r'\bportal\.gov\b'],
    'hmrc': [r'\bhmrc\b', r'\bhm[\s-]?revenue\b'],
    'dwp': [r'\bdwp\b'],
    'eu': [r'\beuropean[\s-]?union\b', r'\beu\b'],
    'govt of india': [r'\bindia[\s-]?gov\b', r'\bgovin\b'],

    # --- Healthcare & Insurance ---
    'blue cross': [r'\bblue[\s-]?cross\b', r'\bbcbs\b'],
    'cigna': [r'\bcigna\b'],
    'aetna': [r'\baetna\b'],
    'kaiser': [r'\bkaiser\b'],
    'uhc': [r'\buhc\b', r'\bunited[\s-]?healthcare\b'],

    # --- Crypto & Exchanges ---
    'coinbase': [r'\bcoinbase\b'],
    'binance': [r'\bbinance\b'],
    'kraken': [r'\bkraken\b'],
    'coinmarketcap': [r'\bcoinmarketcap\b'],
    'gemini': [r'\bgemini\b'],
    'bitfinex': [r'\bbitfinex\b'],

    # --- Travel, Hospitality & Airlines ---
    'expedia': [r'\bexpedia\b'],
    'booking.com': [r'\bbooking(?:[\s-]?dot[\s-]?com|\.)?com\b', r'\bbooking\b'],
    'airbnb': [r'\bairbnb\b'],
    'delta': [r'\bdelta\b'],
    'united': [r'\bunited\b', r'\bunited[\s-]?airlines\b'],
    'american airlines': [r'\bamerican[\s-]?airlines\b', r'\bamerican[\s-]?air\b'],
    'marriott': [r'\bmarriott\b'],
    'hilton': [r'\bhilton\b'],
    'kayak': [r'\bkayak\b'],
    'skyscanner': [r'\bskyscanner\b'],

    # --- HR / Payroll / Enterprise Vendors ---
    'adp': [r'\badp\b', r'\badp[\s-]?payroll\b'],
    'workday': [r'\bworkday\b'],
    'ukg': [r'\bukg\b', r'\bultimate[\s-]?software\b'],
    'paycom': [r'\bpaycom\b'],
    'sage': [r'\bsage\b', r'\bsage[\s-]?pay\b'],

    # --- Domain Registrars / Hosting / CMS ---
    'godaddy': [r'\bgodaddy\b'],
    'namecheap': [r'\bnamecheap\b'],
    'bluehost': [r'\bbluehost\b'],
    'digitalocean': [r'\bdigitalocean\b'],
    'cloudflare': [r'\bcloudflare\b'],
    'wordpress': [r'\bwordpress\b', r'\bwp-admin\b'],
    'squarespace': [r'\bsquarespace\b'],

    # --- Antivirus, Security & Identity ---
    'mcafee': [r'\bmcafee\b'],
    'norton': [r'\bnorton\b', r'\bsymantec\b'],
    'bitdefender': [r'\bbitdefender\b'],
    'lastpass': [r'\blastpass\b'],
    '1password': [r'\b1[\s-]?password\b'],
    'dashlane': [r'\bdashlane\b'],
    'okta': [r'\bokta\b'],

    # --- Food Delivery & Services ---
    'uber': [r'\buber\b', r'\bubereats\b'],
    'doordash': [r'\bdoordash\b'],
    'grubhub': [r'\bgrubhub\b'],
    'deliveroo': [r'\bdeliveroo\b'],

    # --- Marketplaces & Social/apps ---
    'craigslist': [r'\bcraigslist\b'],
    'meetup': [r'\bmeetup\b'],
    'tinder': [r'\btinder\b'],
    'bumble': [r'\bbumble\b'],

    # --- Gaming & Entertainment Platforms ---
    'steam': [r'\bsteam\b'],
    'epic games': [r'\bepic[\s-]?games\b'],
    'playstation': [r'\bplaystation\b', r'\bpsn\b'],
    'xbox': [r'\bxbox\b'],
    'nintendo': [r'\bnintendo\b'],

    # --- Local / Regional Banks & Services (EMEA/APAC/Africa/LatAm) ---
    'santander uk': [r'\bsantander\b'],
    'bbva': [r'\bbbva\b'],
    'caixa': [r'\bcaixa\b'],
    'inecobank': [r'\binecobank\b'],
    'equifax': [r'\bequifax\b'],
    'transunion': [r'\btransunion\b'],

    # --- Common Indicators / Generic tokens to help grouping ---
    'account': [r'\baccount\b', r'\blogin\b', r'\bsign[\s-]?in\b'],
    'verify': [r'\bverify\b', r'\bverification\b', r'\bconfirm\b', r'\bupdate\b', r'\breactivate\b'],
    'security alert': [r'\bsecurity[\s-]?alert\b', r'\bunauthorized\b', r'\bsuspicious\b'],
}

with open("class_to_idx_image_resnet18.json", "r") as f:
    image_logo_to_idx = json.load(f)


def extend_brands_with_image_logos(base_brands: dict, logo_map: dict) -> dict:
    """
    Start from your curated common_phishing_brands, add entries for each logo.
    """
    extended = dict(base_brands)  # copy

    for raw_name in logo_map.keys():
        raw_lower = raw_name.lower()

        # Strip suffixes and trailing digits: _text, _fig, 1,2,...
        # FIX 1: use .strip() (no argument)
        base = re.sub(r"(?:_text|_fig|\d+)$", "", raw_lower).strip()

        if not base:
            continue

        if base in extended:
            continue

        # FIX 2: use a replacement FUNCTION so \s is not treated as an escape
        def repl(_m):
            return r"[\s\-]?"

        token = re.sub(r"[_\-]+", repl, base)
        patterns = [rf"\b{token}\b"]

        if raw_lower != base:
            token2 = re.sub(r"[_\-]+", repl, raw_lower)
            patterns.append(rf"\b{token2}\b")

        # Dedup patterns
        patterns = list(dict.fromkeys(patterns))

        extended[base] = patterns

    return extended


COMMON_PHISHING_BRANDS = extend_brands_with_image_logos(
    common_phishing_brands, image_logo_to_idx
)

print(COMMON_PHISHING_BRANDS)
print("Total brand keys:", len(COMMON_PHISHING_BRANDS))

{'bank': ['\\bbank\\b', '\\bbanking\\b'], 'chase': ['\\bchase\\b'], 'wells fargo': ['\\bwells[\\s-]?fargo\\b', '\\bwf\\b'], 'citibank': ['\\bcitibank\\b', '\\bciti\\b'], 'boa': ['\\bbank[\\s-]?of[\\s-]?america\\b', '\\bboa\\b', '\\bbankofamerica\\b'], 'us bank': ['\\bus[\\s-]?bank\\b', '\\busbanks?\\b'], 'pnc': ['\\bpnc\\b'], 'capital one': ['\\bcapital[\\s-]?one\\b'], 'barclays': ['\\bbarclays\\b'], 'hsbc': ['\\bhsbc\\b'], 'santander': ['\\bsantander\\b'], 'td bank': ['\\btd[\\s-]?bank\\b', '\\btds?\\b'], 'credit suisse': ['\\bcredit[\\s-]?suisse\\b'], 'american express': ['\\bamerican[\\s-]?express\\b', '\\bamex\\b'], 'visa': ['\\bvisa\\b'], 'mastercard': ['\\bmastercard\\b', '\\bmaster[\\s-]?card\\b'], 'discover': ['\\bdiscover\\b'], 'revolut': ['\\brevolut\\b'], 'wise': ['\\bwise\\b', '\\btransferwise\\b'], 'venmo': ['\\bvenmo\\b'], 'zelle': ['\\bzelle\\b'], 'cash app': ['\\bcash[\\s-]?app\\b', '\\bcashapp\\b'], 'sbi': ['\\bstate[\\s-]?bank[\\s-]?of[\\s-]?india\\b', '\\bsbi\\b'], '

In [None]:
import re
import pandas as pd
import json

# Load unified text dataset (from your earlier script)
unified_df = pd.read_csv("dual_tower_text_features.csv")

# Load image logo map
with open("class_to_idx_image_resnet18.json", "r") as f:
    image_logo_to_idx = json.load(f)


def extract_brands_from_text(text: str, brand_patterns: dict) -> list[str]:
    text = str(text).lower()
    found = []

    for brand, patterns in brand_patterns.items():
        if any(re.search(p, text) for p in patterns):
            found.append(brand)

    # unique
    return list(set(found))


# Cleaned mapping: logo key -> base brand name
def clean_logo_name(raw_name: str) -> str:
    base = re.sub(r"(text|_fig|\d+)$", "", raw_name.lower()).strip("")
    return base


# Build brand -> logo class IDs mapping
brand_to_logo_ids = {}

for raw_name, idx in image_logo_to_idx.items():
    base = clean_logo_name(raw_name)
    if not base:
        continue
    brand_to_logo_ids.setdefault(base, set()).add(idx)

# Set of brands that have logos
image_brand_set = set(brand_to_logo_ids.keys())


def map_brands_to_logo_ids(mentioned_brands: list[str]) -> list[int]:
    ids = []
    for b in mentioned_brands:
        if b in brand_to_logo_ids:
            ids.extend(list(brand_to_logo_ids[b]))
    # uniq & stable
    return sorted(list(set(ids)))


# ---- RUN OVER DATAFRAME ----

full_text = unified_df["subject"].fillna("") + " " + unified_df["body"].fillna("")

unified_df["mentioned_brands"] = full_text.apply(
    lambda x: extract_brands_from_text(x, COMMON_PHISHING_BRANDS)
)
unified_df["num_brands_mentioned"] = unified_df["mentioned_brands"].apply(len)
unified_df["has_brand_mention"] = unified_df["num_brands_mentioned"].gt(0).astype(int)

unified_df["image_brand_ids"] = unified_df["mentioned_brands"].apply(
    map_brands_to_logo_ids
)
unified_df["has_image_mappable_brand"] = unified_df["image_brand_ids"].apply(
    lambda x: len(x) > 0
)

print(unified_df[["mentioned_brands", "image_brand_ids"]].head())

# Save this as your multimodal-ready text dataset
unified_df.to_csv("unified_multimodal_text.csv", index=False)
print("Saved unified_multimodal_text.csv with brand mappings.")