# In [None]:
import math
import re
from collections import Counter

import pandas as pd
from sklearn.metrics import precision_recall_fscore_support

# -----------------------------
# 1. Load dataset
# -----------------------------
# Read the evaluation set and drop rows that lack either the candidate text
# or its label, so both columns line up one-to-one below.
df = pd.read_csv("test.csv").dropna(subset=["candidate_string", "label"])

# Parallel arrays: labels are passed to the scorer as the reference values,
# candidates are coerced to str before being fed to the heuristic.
labels = df["label"].values
candidates = df["candidate_string"].astype(str).values

# -----------------------------
# 2. Utility: Shannon Entropy
# -----------------------------
def shannon_entropy(s: str) -> float:
    """Return the Shannon entropy of *s* in bits per character.

    Each distinct character contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency in *s*.

    Args:
        s: The text to measure.

    Returns:
        A non-negative float; ``0.0`` for an empty string or a string made of
        a single repeated character.
    """
    if not s:
        return 0.0

    length = len(s)
    # Counter tallies every character in one pass; fsum keeps the result
    # independent of the order in which the per-character terms are added.
    weighted_logs = math.fsum(
        (count / length) * math.log2(count / length)
        for count in Counter(s).values()
    )
    # Every term is <= 0, so the entropy is the negated sum. When the sum is
    # exactly zero, return a plain 0.0 instead of negating it into -0.0.
    return -weighted_logs if weighted_logs else 0.0

# -----------------------------
# 3. Blacklist & Whitelist Patterns
# -----------------------------

# Dummy / placeholder values (blacklist).
# heuristic_filter applies every pattern with re.search to the lower-cased
# candidate, and any hit classifies the candidate as non-secret. Patterns
# without ^...$ anchors therefore match anywhere inside the string, e.g. the
# bare "test" alternative also hits "latest" or "sk_test_...".
blacklist_patterns = [
    r"(xxxx+)",  # run of four or more "x"
    r"(1111+|2222+|3333+|4444+|5555+|6666+|7777+|8888+|9999+|0000+)",  # same digit repeated 4+ times
    r"(1234+|4321+|abcd+|qwer+|asdf+|zxcv+)",  # sequential / keyboard runs; "+" repeats only the last character
    r"(dummy|placeholder|sample|example|test|fake|mock|temp|temporary)",  # dummy-value words
    r"(your[_-]?api[_-]?key|replace[_-]?me|change[_-]?me|edit[_-]?me)",  # "fill this in" placeholder text
    r"(test[_-]?key|test[_-]?token|test[_-]?secret|demo[_-]?key)",  # test / demo key names
    r"(insert[_-]?here|put[_-]?here|enter[_-]?here|add[_-]?here)",  # instruction placeholders
    r"(todo|fixme|hack|broken|disabled)",  # development markers
    r"(null|undefined|none|nil|empty|blank)",  # null-like values
    r"(lorem|ipsum|dolor|sit|amet|consectetur|adipiscing)",  # lorem ipsum words (note: "sit" is only three letters)
    r"(\*\*\*+|---+|===+|####+|\.\.\.|___+)",  # masking/filler runs of *, -, =, #, . or _
    r"(password|secret|token|key)\s*(=|:)\s*(password|secret|token|key)",  # self-referential pair, e.g. password=password
    r"^(a|b|c|x|y|z|n|m|i|j|k)$",  # the whole candidate is one of these single letters
    r"(aaa+|bbb+|ccc+|ddd+|eee+)",  # a, b, c, d or e repeated 3+ times
    r"(admin|root|user|guest|anonymous|default)",  # common usernames used as secrets
    r"(qwerty|password123|123456|admin123|letmein|welcome)",  # common weak passwords
    r"(sk_test_|pk_test_).*test",  # test-mode key prefixes; already covered by the bare "test" alternative above
    r"(development|dev|local|localhost|127\.0\.0\.1)",  # dev-environment indicators
    r"^(true|false|yes|no|on|off|enable|disable)$",  # the whole candidate is a boolean-like word
    r"(coming[_-]?soon|not[_-]?implemented|not[_-]?ready)",  # "not done yet" placeholder text
    r"(copy[_-]?paste|ctrl[_-]?c|ctrl[_-]?v)",  # copy-paste indicators
]

# Whitelist patterns: well-known non-secret shapes (ignore false positives).
# heuristic_filter applies each pattern with re.fullmatch to the LOWER-CASED
# candidate, so:
#   * letter classes must accept lower-case input to ever match;
#   * a pattern must describe the whole candidate, not just a prefix.
# NOTE(review): the URL, date and time patterns have no trailing "$" or ".*",
# so under fullmatch they only accept a candidate that is exactly that prefix
# (e.g. "https://" or "2024-01-01"). Widening them would also whitelist
# webhook URLs that service_patterns treats as secrets - confirm intent
# before changing.
whitelist_patterns = [
    r"^[0-9a-f]{7,40}$",  # Git commit hashes (also covers the MD5 / SHA1 lengths below)
    r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$",  # UUIDs
    r"^https?://",  # URL scheme prefix (see NOTE above)
    r"^[0-9]+$",  # pure numbers (also covers ports and Unix timestamps below)
    r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$",  # dotted-quad IP addresses
    r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$",  # Email addresses
    r"^\/[a-z0-9\/\-_.]*$",  # Absolute file paths
    r"^[a-z][a-z0-9\-]*\.[a-z]{2,}$",  # Two-label domain names (name.tld)
    r"^[0-9]{4}-[0-9]{2}-[0-9]{2}",  # Dates, YYYY-MM-DD (see NOTE above)
    r"^[0-9]{2}:[0-9]{2}:[0-9]{2}",  # Times, HH:MM:SS (see NOTE above)
    r"^(true|false)$",  # Boolean values
    r"^(http|https|ftp|ssh|tcp|udp)$",  # Protocol names
    r"^[0-9]{1,5}$",  # Port numbers
    r"^(get|post|put|delete|patch|head|options)$",  # HTTP methods
    r"^(application|text|image|video|audio)\/[a-z0-9\-+.]+$",  # MIME types
    r"^[a-f0-9]{32}$",  # MD5 hashes (common, not secrets)
    r"^[a-f0-9]{40}$",  # SHA1 hashes (when clearly hashes, not secrets)
    r"^[a-f0-9]{64}$",  # SHA256 hashes
    r"^[0-9]{10,13}$",  # Unix timestamps
    r"^rgb\([0-9]{1,3},[0-9]{1,3},[0-9]{1,3}\)$",  # RGB color codes
    r"^#[0-9a-f]{3,6}$",  # Hex color codes
    r"^[+-]?[0-9]*\.?[0-9]+([eE][+-]?[0-9]+)?$",  # Decimal / scientific notation
    # Currency code + amount (e.g. USD100). The letter class accepts both
    # cases because the candidate is lower-cased before matching; an
    # upper-case-only class could never match.
    r"^[A-Za-z]{2,3}[0-9]{3,6}$",
]

# Context keywords that hint a candidate is secret-related.
# heuristic_filter tests each entry as a plain substring of the lower-cased
# candidate (`kw in c_lower`), so short entries such as "ad", "s3" or "db"
# also match inside unrelated longer tokens, and an entry that contains
# another entry (e.g. "access_key" contains "key") adds no extra matches.
# A few entries appear in more than one section (e.g. "webhook", "cert",
# "bearer", "client_secret"); duplicates do not change matching but are
# included in len(context_keywords).
context_keywords = [
    # Generic secret terms
    "api", "key", "secret", "token", "auth", "bearer", "password", "pass", "pwd",
    "access", "private", "client", "login", "credential", "cred", "authorization",
    "authenticate", "signature", "sign", "oauth", "session", "cookie", "cert",
    "certificate", "private_key", "public_key", "symmetric", "asymmetric",

    # Cloud providers - AWS
    "aws", "amazon", "s3", "ec2", "lambda", "iam", "cloudformation", "dynamodb",
    "rds", "sns", "sqs", "cloudwatch", "elasticache", "redshift", "kinesis",
    "access_key", "secret_key", "aws_access_key_id", "aws_secret_access_key",
    "cloudfront", "route53", "elb", "vpc", "ecs", "eks", "fargate", "sagemaker",

    # Cloud providers - Azure
    "azure", "microsoft", "msft", "storage", "cosmos", "functionapp", "webapp",
    "servicebus", "eventhub", "keyvault", "subscription", "tenant", "client_secret",
    "resource_group", "active_directory", "ad", "graph", "outlook", "sharepoint",

    # Cloud providers - Google Cloud
    "google", "gcp", "firebase", "firestore", "bigquery", "pubsub", "gcs",
    "service_account", "project_id", "private_key_id", "compute_engine",
    "cloud_sql", "cloud_storage", "cloud_functions", "app_engine", "kubernetes_engine",

    # Database related
    "database", "db", "sql", "mysql", "postgres", "postgresql", "mongodb", "redis",
    "connection", "connect", "dsn", "jdbc", "odbc", "host", "port", "username",
    "user", "schema", "table", "collection", "cassandra", "elasticsearch", "influxdb",
    "neo4j", "couchdb", "sqlite", "oracle", "mariadb", "cockroachdb",

    # Payment & Financial
    "stripe", "paypal", "payment", "billing", "invoice", "charge", "card",
    "merchant", "publishable", "webhook", "endpoint", "square", "braintree",
    "adyen", "checkout", "authorize", "worldpay", "klarna", "razorpay",

    # Communication & Social
    "slack", "discord", "telegram", "whatsapp", "twilio", "sendgrid", "mailgun",
    "smtp", "email", "phone", "sms", "notification", "webhook", "bot",
    "teams", "zoom", "webex", "skype", "facebook", "twitter", "linkedin",
    "instagram", "snapchat", "tiktok", "youtube", "reddit",

    # Development & CI/CD
    "github", "gitlab", "bitbucket", "jenkins", "travis", "circle", "deployment",
    "build", "pipeline", "docker", "kubernetes", "helm", "terraform", "ansible",
    "vagrant", "chef", "puppet", "salt", "bamboo", "teamcity", "azure_devops",
    "codebuild", "codepipeline", "codedeploy", "actions", "workflows",

    # Monitoring & Analytics
    "datadog", "newrelic", "segment", "mixpanel", "amplitude", "sentry", "rollbar",
    "monitoring", "analytics", "tracking", "log", "metric", "splunk", "elastic",
    "grafana", "prometheus", "jaeger", "zipkin", "honeycomb", "lightstep",

    # Security & Encryption
    "encryption", "decrypt", "encrypt", "hash", "salt", "cipher", "certificate",
    "cert", "ssl", "tls", "x509", "rsa", "aes", "sha", "md5", "hmac",
    "keystore", "truststore", "passphrase", "pkcs", "pem", "der", "p12", "jks",
    "bcrypt", "scrypt", "argon2", "pbkdf2", "ecdsa", "ed25519",

    # API specific
    "bearer", "basic", "digest", "ntlm", "kerberos", "saml", "jwt", "refresh",
    "scope", "grant", "code", "redirect", "callback", "state", "nonce",
    "client_id", "client_secret", "app_id", "app_secret", "consumer_key",
    "consumer_secret", "oauth_token", "oauth_verifier",

    # Configuration
    "config", "configuration", "setting", "env", "environment", "production",
    "staging", "development", "test", "debug", "local", "remote", "global",
    "system", "application", "service", "infrastructure", "deployment",

    # Crypto & Blockchain
    "bitcoin", "ethereum", "crypto", "wallet", "mnemonic", "seed", "phrase",
    "blockchain", "web3", "metamask", "coinbase", "binance", "litecoin",
    "dogecoin", "ripple", "cardano", "polkadot", "chainlink", "solana",
    "polygon", "avalanche", "terra", "cosmos", "algorand",

    # CDN & Media
    "cloudflare", "fastly", "akamai", "cdn", "cloudinary", "imgix", "bunny",
    "jsdelivr", "unpkg", "cdnjs", "maxcdn", "keycdn",

    # Email & Marketing
    "mailchimp", "constant_contact", "campaign_monitor", "aweber", "getresponse",
    "convertkit", "klaviyo", "hubspot", "salesforce", "marketo", "pardot",

    # Search & AI
    "algolia", "elasticsearch", "solr", "opensearch", "pinecone", "weaviate",
    "anthropic", "openai", "cohere", "huggingface", "replicate", "stability",
]

# Service-specific credential patterns, keyed by a descriptive rule name.
# heuristic_filter applies each value with re.search to the candidate in its
# ORIGINAL case and classifies the candidate as a secret on the first hit.
# None of the patterns are anchored, so the generic ones (e.g. 32 lower-case
# alphanumerics, 40 hex digits) also match inside any longer token.
# Several rules share the same regex (the UUID-shaped ones) or are subsumed by
# a broader rule (e.g. "rsa_private" by "ssh_private_key"); they are kept so
# each service stays listed by name.
service_patterns = {
    # AWS patterns
    "aws_access_key": r"AKIA[0-9A-Z]{16}",
    "aws_secret_key": r"[0-9a-zA-Z/+]{40}",
    "aws_session_token": r"[A-Za-z0-9+/]{100,}={0,2}",

    # GitHub tokens
    "github_token": r"gh[pousr]_[A-Za-z0-9_]{36,255}",
    "github_classic": r"[0-9a-fA-F]{40}",
    "github_app_token": r"(ghu|ghs)_[0-9a-zA-Z]{36}",
    "github_refresh_token": r"ghr_[0-9a-zA-Z]{76}",

    # GitLab tokens
    "gitlab_token": r"glpat-[0-9a-zA-Z\-\_]{20}",
    "gitlab_runner": r"GR1348941[0-9a-zA-Z\-\_]{20}",

    # Slack tokens
    "slack_bot_token": r"xoxb-[0-9]{11,13}-[0-9]{11,13}-[a-zA-Z0-9]{24}",
    "slack_user_token": r"xoxp-[0-9]{11,13}-[0-9]{11,13}-[a-zA-Z0-9]{12}-[a-zA-Z0-9]{32}",
    "slack_app_token": r"xapp-[0-9]{1}-[A-Z0-9]+-[0-9]+-[a-z0-9]+",
    "slack_webhook": r"https://hooks\.slack\.com/services/T[0-9A-Z]{8}/B[0-9A-Z]{8}/[0-9A-Za-z]{24}",

    # Google API keys
    # The hyphen is escaped as "\-". In a raw string, "\\-_" would instead be
    # a range from backslash to underscore, which excludes "-" and admits
    # "\", "]" and "^".
    "google_api_key": r"AIza[0-9A-Za-z\-_]{35}",
    "google_oauth": r"[0-9]+-[0-9A-Za-z_]{32}\.apps\.googleusercontent\.com",
    "firebase_key": r"AAAA[A-Za-z0-9_-]{7}:[A-Za-z0-9_-]{140}",

    # Stripe keys
    # Test-mode keys contain "test", which the blacklist rejects before this
    # table is consulted, so in practice only the "live" branch is reachable.
    "stripe_publishable": r"pk_(test|live)_[0-9a-zA-Z]{24,}",
    "stripe_secret": r"sk_(test|live)_[0-9a-zA-Z]{24,}",
    "stripe_restricted": r"rk_(test|live)_[0-9a-zA-Z]{24,}",

    # Azure patterns
    "azure_storage": r"DefaultEndpointsProtocol=https;AccountName=[^;]+;AccountKey=[A-Za-z0-9+/]+=*",
    "azure_service_principal": r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",

    # JWT tokens
    "jwt": r"eyJ[A-Za-z0-9_/+-]*\.eyJ[A-Za-z0-9_/+-]*\.[A-Za-z0-9._/+-]*",

    # Database connection strings (scheme://user:password@host/db)
    "postgres_uri": r"postgres://[^:]+:[^@]+@[^/]+/[^\s]+",
    "mysql_uri": r"mysql://[^:]+:[^@]+@[^/]+/[^\s]+",
    "mongodb_uri": r"mongodb(\+srv)?://[^:]+:[^@]+@[^/]+/[^\s]+",

    # Docker registry (UUID-shaped)
    "dockerhub_token": r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",

    # Twilio
    "twilio_sid": r"AC[a-z0-9]{32}",
    "twilio_auth_token": r"[a-z0-9]{32}",

    # SendGrid
    "sendgrid_key": r"SG\.[a-zA-Z0-9_\-]{22}\.[a-zA-Z0-9_\-]{43}",

    # Mailgun
    "mailgun_key": r"key-[a-f0-9]{32}",

    # PayPal
    "paypal_client": r"A[A-Z0-9]{80,}",

    # Square
    "square_token": r"sq0[a-z]{3}-[0-9A-Za-z\-_]{22,43}",

    # Discord
    "discord_bot_token": r"[MN][A-Za-z\d]{23}\.[\w-]{6}\.[\w-]{27}",
    "discord_webhook": r"https://discord(app)?\.com/api/webhooks/[0-9]{18}/[A-Za-z0-9\-_]{68}",

    # Telegram
    "telegram_bot": r"[0-9]{8,10}:[a-zA-Z0-9_-]{35}",

    # Shopify
    "shopify_token": r"shp(ss|at|ca|pa)_[a-fA-F0-9]{32}",

    # NPM tokens
    "npm_token": r"npm_[A-Za-z0-9]{36}",

    # PyPI tokens
    "pypi_token": r"pypi-AgEIcHlwaS5vcmc[A-Za-z0-9\-_]{50,}",

    # Heroku (UUID-shaped, either case)
    "heroku_key": r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",

    # CloudFlare
    "cloudflare_key": r"[a-z0-9]{37}",
    "cloudflare_token": r"[A-Za-z0-9_-]{40}",

    # DataDog
    "datadog_key": r"[a-fA-F0-9]{32}",

    # New Relic
    "newrelic_key": r"[A-Za-z0-9]{47}",

    # Sentry
    "sentry_dsn": r"https://[a-f0-9]{32}@[a-z0-9\.-]+/[0-9]+",

    # Generic high-entropy patterns and "name = value" assignments
    "base64_key": r"[A-Za-z0-9+/]{40,}={0,2}",
    "hex_key": r"[a-fA-F0-9]{32,}",
    "api_key_pattern": r"[aA][pP][iI][_-]?[kK][eE][yY]['\"]?\s*[:=]\s*['\"]?([A-Za-z0-9_\-]{20,})",
    "secret_pattern": r"[sS][eE][cC][rR][eE][tT]['\"]?\s*[:=]\s*['\"]?([A-Za-z0-9_\-]{20,})",
    "token_pattern": r"[tT][oO][kK][eE][nN]['\"]?\s*[:=]\s*['\"]?([A-Za-z0-9_\-]{20,})",
    "password_pattern": r"[pP][aA][sS][sS][wW][oO][rR][dD]['\"]?\s*[:=]\s*['\"]?([A-Za-z0-9_\-!@#$%^&*()]{8,})",

    # SSH keys
    "ssh_private_key": r"-----BEGIN [A-Z ]+PRIVATE KEY-----",
    "ssh_public_key": r"ssh-(rsa|dss|ed25519) [A-Za-z0-9+/]+=*",

    # RSA keys
    "rsa_private": r"-----BEGIN RSA PRIVATE KEY-----",
    "rsa_public": r"-----BEGIN PUBLIC KEY-----",

    # Certificate patterns
    "x509_cert": r"-----BEGIN CERTIFICATE-----",
    "pkcs12": r"-----BEGIN PKCS12-----",

    # JSON Web Keys
    "jwk": r'\{"kty":"[A-Z]{2,3}",".*"[kd]":"[A-Za-z0-9_-]+',

    # Connection strings (key=value;... form with a password field)
    "connection_string": r"(server|host|hostname)=([^;]+);.*password=([^;]+)",

    # Environment variable patterns (UPPER_NAME=<known prefix or long token>)
    "env_var_secret": r"[A-Z_]{3,}=(sk_|pk_|xox|AIza|AKIA|[A-Za-z0-9+/]{20,})",
}

# Shape patterns for credentials of unknown services.
# heuristic_filter uses these only in its stage 8: the pattern must
# re.fullmatch the whole original-case candidate AND the candidate's Shannon
# entropy must exceed 4.0 for the stage to classify it as a secret.
entropy_patterns = [
    r"[A-Za-z0-9]{32,}",  # 32+ char alphanumeric (likely keys)
    r"[A-Za-z0-9+/]{40,}={0,2}",  # Base64-like strings, optional "=" padding
    r"[a-f0-9]{40,}",  # Long lower-case hex strings
    r"[A-Z0-9]{20,}",  # Upper-case alphanumeric, 20+ chars
    r"[a-z0-9]{30,}",  # Lower-case alphanumeric, 30+ chars
]

# -----------------------------
# 4. Enhanced Heuristic Pipeline
# -----------------------------
def heuristic_filter(candidate: str, entropy_threshold=3.5, min_length=20) -> int:
    """Classify *candidate* as a likely secret (1) or a non-secret (0).

    The stages run in order and the first decisive one wins:

    1. Whitelist: full match on the lower-cased text -> 0.
    2. Blacklist: search on the lower-cased text -> 0.
    3. Service patterns: search on the original-case text -> 1.
    4. Shorter than *min_length*, or entropy below *entropy_threshold* -> 0.
    5-9. Keyword, character-class, entropy and key=value checks -> 1.

    Anything that reaches the end is also returned as 1.

    NOTE(review): because the fallback returns 1 as well, every candidate that
    survives stage 4 is classified as a secret; stages 5-9 currently cannot
    change the result. Confirm whether the fallback was meant to be 0.

    Args:
        candidate: The string to classify; surrounding whitespace is ignored.
        entropy_threshold: Minimum Shannon entropy (bits/char) to pass stage 4.
        min_length: Minimum stripped length to pass stage 4.

    Returns:
        1 if the candidate is considered a secret, otherwise 0.
    """
    text = candidate.strip()
    lowered = text.lower()

    # Stage 1: known-benign shapes are never secrets.
    if any(re.fullmatch(rule, lowered) for rule in whitelist_patterns):
        return 0

    # Stage 2: dummy / placeholder content is never a secret.
    if any(re.search(rule, lowered, re.IGNORECASE) for rule in blacklist_patterns):
        return 0

    # Stage 3: a recognised service credential format is decisive.
    if any(re.search(rule, text) for rule in service_patterns.values()):
        return 1

    # Stage 4: too short or too regular to be a credential.
    if len(text) < min_length:
        return 0
    randomness = shannon_entropy(text)
    if randomness < entropy_threshold:
        return 0

    # Stage 5: the text mentions a secret-related keyword.
    for keyword in context_keywords:
        if keyword in lowered:
            return 1

    # Stage 6: count how many character classes (upper, lower, digit,
    # punctuation) are present; three or more on a 16+ char string counts.
    special_chars = "!@#$%^&*()_+-=[]{}|;:,.<>?/\\~`"
    char_classes = (
        any(map(str.isupper, text))
        + any(map(str.islower, text))
        + any(map(str.isdigit, text))
        + any(ch in special_chars for ch in text)
    )
    if char_classes >= 3 and len(text) >= 16:
        return 1

    # Stage 7: very high entropy on a reasonably long string.
    if len(text) >= 24 and randomness > 4.5:
        return 1

    # Stage 8: whole string has a key-like shape and high entropy.
    if randomness > 4.0 and any(re.fullmatch(rule, text) for rule in entropy_patterns):
        return 1

    # Stage 9: "name = value" where the name hints at a credential and the
    # value is long and varied enough.
    pair = re.match(r'([a-zA-Z_][a-zA-Z0-9_]*)\s*[=:]\s*(.+)', text)
    if pair is not None:
        name = pair.group(1).lower()
        value = pair.group(2)
        names_a_secret = any(
            hint in name for hint in ('key', 'secret', 'token', 'password', 'auth')
        )
        if names_a_secret and len(value) >= 16 and shannon_entropy(value) > 3.0:
            return 1

    # Fallback: long and high-entropy enough to have passed stage 4.
    return 1

# -----------------------------
# 5. Apply Heuristic Filtering
# -----------------------------
# One 0/1 prediction per candidate, in dataset order.
preds = list(map(heuristic_filter, candidates))

# -----------------------------
# 6. Evaluate
# -----------------------------
# Score the predictions against the dataset labels with binary averaging;
# zero_division=0 reports 0 instead of warning when a denominator is empty.
# The fourth returned value is not used.
prec, rec, f1, _ = precision_recall_fscore_support(
    labels,
    preds,
    average="binary",
    zero_division=0,
)

print(
    "Enhanced Heuristic Filtering (TruffleHog/Gitleaks-like):",
    f" Precision = {prec:.4f}",
    f" Recall    = {rec:.4f}",
    f" F1        = {f1:.4f}",
    sep="\n",
)

# Optional: report how many rules each table contributes.
print(
    f"\nPattern Statistics:",
    f"- Blacklist patterns: {len(blacklist_patterns)}",
    f"- Whitelist patterns: {len(whitelist_patterns)}",
    f"- Context keywords: {len(context_keywords)}",
    f"- Service patterns: {len(service_patterns)}",
    f"- Entropy patterns: {len(entropy_patterns)}",
    sep="\n",
)