In [None]:
import math
import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# -----------------------------
# 1. Load dataset
# -----------------------------
# Read the candidates file and discard rows that lack either the candidate
# text or its label.
df = pd.read_csv("test.csv").dropna(subset=["candidate_string", "label"])

# Inputs are the candidate strings coerced to str; targets are the label
# column exactly as stored in the file.
X = df.loc[:, "candidate_string"].astype(str).values
y = df.loc[:, "label"].values

# -----------------------------
# Helper: Shannon Entropy
# -----------------------------
def shannon_entropy(s: str) -> float:
    """Return the Shannon entropy of *s* in bits per character.

    Each distinct character's probability is its relative frequency in
    *s*; the result is ``-sum(p * log2(p))`` over those probabilities.

    Args:
        s: The string to measure.

    Returns:
        The entropy as a non-negative float. An empty string and a string
        made of a single repeated character both yield exactly ``0.0``.
    """
    if not s:
        return 0.0

    length = len(s)
    # Tally every character in one pass rather than rescanning the string
    # with ``str.count`` once per distinct character.
    counts = Counter(s)
    if len(counts) == 1:
        # A lone symbol has p == 1, so its only term is 0.0; negating that
        # would produce -0.0, hence the explicit return.
        return 0.0

    # fsum keeps the result independent of the order the terms are added in.
    return -math.fsum(
        (n / length) * math.log2(n / length) for n in counts.values()
    )

# -----------------------------
# 2. Entropy Filtering Baseline
# -----------------------------
# Baseline rule: predict 1 only when a string is both long and high-entropy.
# Typical secret detection thresholds: entropy > 3.5 and length > 20.
ENTROPY_THRESHOLD = 3.5  # exclusive lower bound on shannon_entropy(s)
MIN_LENGTH = 20  # exclusive lower bound on len(s)

# The cheap length test runs first so short strings skip the entropy
# computation; the predicted labels are the same in either order.
entropy_preds = [
    1 if (len(s) > MIN_LENGTH and shannon_entropy(s) > ENTROPY_THRESHOLD) else 0
    for s in X
]

# Score the rule's predictions against the labels in y. zero_division=0 makes
# an undefined metric (e.g. no positive predictions) come back as 0.
prec_e, rec_e, f1_e, _ = precision_recall_fscore_support(
    y, entropy_preds, average="binary", zero_division=0
)

print("Entropy Filtering:")
print(f" Precision={prec_e:.4f} Recall={rec_e:.4f} F1={f1_e:.4f}")

