In [None]:
!pip -q install transformers==4.36.2 torch==2.3.1 pandas==2.2.2 scikit-learn==1.4.2


In [None]:
import os, re, time, random, numpy as np, pandas as pd, warnings
warnings.filterwarnings("ignore")
from urllib.parse import urlparse, parse_qsl, unquote, urlunparse


In [None]:
# ----------------- Mount & paths -----------------
# Mount Google Drive (Colab runtime) and derive every input path from a single base folder.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

BASE = "/content/drive/My Drive/DLI Assignment"
# Labelled train / test CSVs.
TRAIN_CSV = f"{BASE}/Train.csv"
TEST_CSV = f"{BASE}/Test.csv"
# Tokenizer vocabulary and encoder checkpoint used by the urlBERT cell.
VOCAB_TXT = f"{BASE}/vocab.txt"
URLBERT_PT = f"{BASE}/urlBERT.pt"


Mounted at /content/drive


In [None]:
# ----------------- Repro / Device -----------------
SEED = 42
import torch
import torch.nn as nn

# Seed Python, NumPy, torch (CPU) and every CUDA device, in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Let cuDNN benchmark convolution algorithms.
torch.backends.cudnn.benchmark = True

# Prefer the GPU when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Disable tokenizers' internal parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [None]:
# ----------------- Default knobs (will be shrunk by time budget) -----------------
# Hold-out fraction of the training CSV and optional row caps (None = no cap).
VAL_SIZE = 0.20
CAP_TRAIN_TOTAL, CAP_VAL_TOTAL, CAP_TEST_TOTAL = None, 8000, None

# Stage-1 head train
TIME_BUDGET_STAGE1 = 15          # seconds; will drop if time is tight
BATCH_TRAIN = 96
MAX_LEN_TRAIN, MAX_LEN_INFER = 80, 96

# Uncertainty BERT (strict Top-K; dynamically reduced)
MAX_BERT_VAL_BASE, MAX_BERT_TEST_BASE = 4000, 5000
BATCH_INFER_INIT, BATCH_INFER_MIN = 4096, 256
VAL_IMPROVE_MARGIN = 2e-4        # require real gain from BERT before using it

# Lexical ensemble
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
# Lexical models: training-row cap, hashed feature space size, SGD iterations.
CAP_TRAIN_LEX_BASE = 20000
N_FEATURES = 1 << 20
MAX_ITER_SGD_BASE = 12
# n-gram ranges for the character-level and token-level vectorizers.
CHAR_CFGS = [(3, 6), (4, 7)]
TOK_CFGS = [(1, 3), (2, 4)]

# ACC overrides & buckets
# Grids searched on validation for the prior-override rule.
MIN_COUNT_LIST = [1, 2, 3, 5, 8, 10, 20]
P_HI_LIST = [0.93, 0.95, 0.97, 0.99, 0.995]
P_LO_LIST = [0.005, 0.01, 0.02, 0.03, 0.05]
# Inclusive (low, high) count ranges used to bucket host / domain / TLD frequencies.
HOST_BUCKETS = [(0, 0), (1, 2), (3, 5), (6, 20), (21, 100), (101, 1000), (1001, 9999999)]
DOM_BUCKETS = list(HOST_BUCKETS)
TLD_BUCKETS = [(0, 0), (1, 2), (3, 10), (11, 50), (51, 9999999)]


In [None]:
# ----------------- Helpers -----------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, recall_score, f1_score

def load_csv(path):
    """Read a URL dataset CSV and return a clean frame with columns ``url`` and ``label``.

    Column detection:
      * URL column: first column whose name contains the word "url" (case-insensitive),
        else the first column.
      * Label column: first column matching one of a fixed list of names
        ("label", "class", ...), else the second column.

    Labels are normalised to 0/1 from common textual spellings or from a numeric
    value (>= 0.5 -> 1). Rows whose URL or label is missing or unrecognised are dropped.

    Parameters
    ----------
    path : str
        Path of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``url`` (str) and ``label`` (int). The index is not reset.
    """
    df = pd.read_csv(path)
    url_col = next((c for c in df.columns if re.search(r"\burl\b", str(c), re.I)), df.columns[0])
    y_col = None
    for k in ["label","class","target","y","is_phish","phishing","malicious"]:
        m = [c for c in df.columns if re.search(rf"\b{k}\b", str(c), re.I)]
        if m: y_col = m[0]; break
    if y_col is None: y_col = df.columns[1]
    df = df[[url_col, y_col]].copy(); df.columns = ["url","raw"]

    positive = {"1","true","phish","phishing","malicious","bad","attack","fraud","spam","harm"}
    negative = {"0","false","benign","legit","legitimate","good","safe","normal","clean"}

    def norm(v):
        # A missing label must be dropped, not coerced: str(nan) == "nan" parses as a
        # float and `nan >= 0.5` is False, which previously turned it into label 0.
        if pd.isna(v): return None
        s = str(v).strip().lower()
        if s in positive: return 1
        if s in negative: return 0
        try:
            x = float(s)
        except ValueError:
            return None
        if x != x:  # textual "nan" that survived CSV parsing
            return None
        return 1 if x >= 0.5 else 0

    df["label"] = df["raw"].map(norm)
    df = df.dropna(subset=["url","label"])
    df["label"] = df["label"].astype(int); df["url"] = df["url"].astype(str)
    return df[["url","label"]]

def cap_total_stratified(df, cap):
    """Down-sample ``df`` to roughly ``cap`` rows while keeping the label proportions.

    Returns the frame unchanged (with a fresh 0..n-1 index) when ``cap`` is None or
    the frame already fits. Otherwise each label gets a quota of
    ``round(share * cap)`` rows, sampled with ``SEED``, and the result is shuffled.
    """
    fits_already = cap is None or len(df) <= cap
    if fits_already:
        return df.reset_index(drop=True)

    class_share = df["label"].value_counts(normalize=True)
    per_class_quota = (class_share * cap).round().astype(int).to_dict()

    def _draw(cls, quota):
        # Never request more rows than the class actually has.
        members = df[df["label"] == cls]
        return members.sample(min(quota, len(members)), random_state=SEED)

    pieces = [_draw(cls, quota) for cls, quota in per_class_quota.items()]
    shuffled = pd.concat(pieces).sample(frac=1.0, random_state=SEED)
    return shuffled.reset_index(drop=True)

# Canonicalizers for priors
def host_only(u):
    """Return the lower-cased host of ``u`` without userinfo, port or a leading ``www.``.

    Any failure while parsing yields the empty string.
    """
    try:
        netloc = urlparse(u).netloc.lower()
        if "@" in netloc:
            # Drop "user:password@" (everything up to the first "@").
            netloc = netloc.partition("@")[2]
        # Drop ":port" (everything from the first ":").
        netloc = netloc.partition(":")[0]
        return netloc[4:] if netloc.startswith("www.") else netloc
    except:
        return ""
def registrable_domain(u):
    """Return the last two dot-separated labels of the host of ``u``.

    Hosts with fewer than two non-empty labels are returned as-is.
    """
    host = host_only(u)
    labels = list(filter(None, host.split(".")))
    if len(labels) < 2:
        return host
    return ".".join(labels[-2:])
def tld(u):
    """Return the text after the last dot of the host of ``u`` ("" when the host has no dot)."""
    _head, dot, tail = host_only(u).rpartition(".")
    return tail if dot else ""
def path_first(u):
    """Return the first non-empty path segment of ``u`` ("" if none or on parse failure)."""
    try:
        pieces = urlparse(u).path.split("/")
        return next((piece for piece in pieces if piece), "")
    except:
        return ""
def path_last(u):
    """Return the last non-empty path segment of ``u`` ("" if none or on parse failure)."""
    try:
        pieces = urlparse(u).path.split("/")
        return next((piece for piece in reversed(pieces) if piece), "")
    except:
        return ""
def query_keys(u):
    """Return the distinct query-parameter names of ``u``, sorted and joined by "&".

    Blank-valued parameters are kept. Returns "" on parse failure.
    """
    try:
        pairs = parse_qsl(urlparse(u).query, keep_blank_values=True)
        distinct_names = {name for name, _value in pairs}
        return "&".join(sorted(distinct_names))
    except:
        return ""
def normalize_url(u: str) -> str:
    """Canonicalise a URL string.

    Lower-cases the network location, strips a leading ``www.`` and the ports
    ``:80`` / ``:443``, percent-decodes the path, collapses repeated slashes and
    removes one trailing slash (unless the path is just "/"). The input is
    returned unchanged if anything raises.
    """
    try:
        parts = urlparse(u)

        netloc = parts.netloc.lower()
        if netloc.startswith("www."):
            netloc = netloc[len("www."):]
        for default_port in (":80", ":443"):
            if netloc.endswith(default_port):
                netloc = netloc[:-len(default_port)]

        decoded_path = re.sub(r"/{2,}", "/", unquote(parts.path))
        if len(decoded_path) > 1 and decoded_path.endswith("/"):
            decoded_path = decoded_path[:-1]

        return urlunparse(parts._replace(netloc=netloc, path=decoded_path))
    except:
        return u

def best_acc_threshold(y_true, scores):
    """Pick the score threshold that maximises accuracy on (``y_true``, ``scores``).

    Returns ``(threshold, accuracy)``. Degenerate inputs are handled without
    calling ``roc_curve``: empty input -> (0.5, nan); no positives -> (1.0, 1.0);
    no negatives -> (0.0, 1.0). Otherwise accuracy is evaluated at every ROC
    operating point and the first best one is returned.
    """
    labels = np.asarray(y_true)
    values = np.asarray(scores)
    n_pos = int(labels.sum())
    n_neg = labels.size - n_pos

    if labels.size == 0:
        return 0.5, float("nan")
    if n_pos == 0:
        return 1.0, 1.0
    if n_neg == 0:
        return 0.0, 1.0

    fpr, tpr, thresholds = roc_curve(labels, values)
    # correct = true positives + true negatives at each operating point
    accuracy = (tpr * n_pos + (1 - fpr) * n_neg) / (n_pos + n_neg)
    best = int(np.argmax(accuracy))
    return float(thresholds[best]), float(accuracy[best])

def make_prior(series, labels):
    """Build per-key label priors.

    Groups ``labels`` by the values of ``series`` and returns two dicts keyed by
    those values: the smoothed positive rate ``(sum + 1) / (count + 2)`` and the
    raw occurrence count.
    """
    frame = pd.DataFrame({"k": series, "y": labels})
    stats = frame.groupby("k")["y"].agg(["sum", "count"]).reset_index()
    keys = stats["k"]
    smoothed_rate = (stats["sum"] + 1) / (stats["count"] + 2)
    rate_by_key = {key: rate for key, rate in zip(keys, smoothed_rate)}
    count_by_key = {key: count for key, count in zip(keys, stats["count"])}
    return rate_by_key, count_by_key

def apply_overrides(scores,
                    cnt_host, rate_host, cnt_dom, rate_dom, cnt_tld, rate_tld,
                    cnt_url, rate_url, cnt_pf, rate_pf, cnt_pl, rate_pl, cnt_qk, rate_qk,
                    cnt_h_pf, rate_h_pf, cnt_d_pf, rate_d_pf, cnt_d_pl, rate_d_pl,
                    minc, plo, phi, hi=0.99995, lo=0.00005):
    """Overwrite scores with near-certain values where a training prior is decisive.

    For each key type, a row whose key was seen at least ``minc`` times in training
    (at least ``max(1, minc)`` for the exact URL) gets ``hi`` if its prior rate is
    ``>= phi`` and ``lo`` if its prior rate is ``<= plo``.

    Priority when several key types fire on the same row:
        URL > host+pf > dom+pf > dom+pl > host > dom > tld > pf > pl > qk

    Parameters
    ----------
    scores : numpy.ndarray
        Scores to adjust; not modified (a copy is returned).
    cnt_*, rate_* : numpy.ndarray
        Per-row training count and prior rate for each key type, aligned with ``scores``.
    minc, plo, phi : number
        Minimum count and the low/high rate cut-offs.
    hi, lo : float
        Values written for decisive positive / negative priors.

    Returns
    -------
    numpy.ndarray
        Copy of ``scores`` with overrides applied.
    """
    out = scores.copy()
    # Listed from highest to lowest priority.
    rules = [
        (cnt_url,  rate_url,  max(1, minc)),
        (cnt_h_pf, rate_h_pf, minc),
        (cnt_d_pf, rate_d_pf, minc),
        (cnt_d_pl, rate_d_pl, minc),
        (cnt_host, rate_host, minc),
        (cnt_dom,  rate_dom,  minc),
        (cnt_tld,  rate_tld,  minc),
        (cnt_pf,   rate_pf,   minc),
        (cnt_pl,   rate_pl,   minc),
        (cnt_qk,   rate_qk,   minc),
    ]
    # Later writes overwrite earlier ones, so apply the LOWEST priority first and the
    # highest last. Applying them in listed order would let the weakest rule win.
    for cnt, rate, need in reversed(rules):
        supported = cnt >= need
        out[supported & (rate >= phi)] = hi
        out[supported & (rate <= plo)] = lo
    return out

def bucketize(cnt, BUCKS):
    """Return the index of the first inclusive ``(lo, hi)`` range in ``BUCKS`` containing ``cnt``.

    Falls back to the last bucket index when no range matches.
    """
    matching = (idx for idx, (low, high) in enumerate(BUCKS) if low <= cnt <= high)
    return next(matching, len(BUCKS) - 1)
def bucketize_vec(cnts, bucks):
    """Map every count in ``cnts`` to its bucket index; returns an int32 array."""
    indices = []
    for raw_count in cnts:
        indices.append(bucketize(int(raw_count), bucks))
    return np.array(indices, dtype=np.int32)


In [None]:
# ----------------- Load & split -----------------
train_full = load_csv(TRAIN_CSV)
test_full = load_csv(TEST_CSV)

# Stratified hold-out from the training CSV, then optional per-split row caps.
train_df, val_df = train_test_split(
    train_full,
    test_size=VAL_SIZE,
    random_state=SEED,
    stratify=train_full["label"],
)
train_df = cap_total_stratified(train_df, CAP_TRAIN_TOTAL)
val_df = cap_total_stratified(val_df, CAP_VAL_TOTAL)
test_df = cap_total_stratified(test_full, CAP_TEST_TOTAL)

# add fields
# Each derived column is computed from the normalised URL; composite keys join two
# components with "||".
_URL_COMPONENTS = (
    ("_host", host_only),
    ("_dom", registrable_domain),
    ("_tld", tld),
    ("_pf", path_first),
    ("_pl", path_last),
    ("_qk", query_keys),
)
for df in (train_df, val_df, test_df):
    df["_url_raw"] = df["url"]
    df["url"] = df["url"].map(normalize_url)
    df["_url"] = df["url"]
    for column, extractor in _URL_COMPONENTS:
        df[column] = df["url"].map(extractor)
    for column, left, right in (("_h_pf", "_host", "_pf"), ("_d_pf", "_dom", "_pf"), ("_d_pl", "_dom", "_pl")):
        df[column] = df[left] + "||" + df[right]


In [None]:
# ----------------- urlBERT-CNN (unchanged main model) -----------------
from transformers import BertTokenizerFast, BertModel, BertConfig

# Tokenizer built from the project vocabulary; case is preserved.
tokenizer = BertTokenizerFast(vocab_file=VOCAB_TXT, do_lower_case=False)
# BERT-base sized encoder whose vocabulary matches the tokenizer.
cfg = BertConfig(vocab_size=tokenizer.vocab_size, hidden_size=768,
                 num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072,
                 max_position_embeddings=512, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1)
encoder = BertModel(cfg)

# Best-effort load of the pretrained weights. A failure must be VISIBLE: otherwise the
# pipeline silently continues with a randomly initialised encoder. `print` is used
# because this notebook globally silences the `warnings` module.
# NOTE(security): torch.load unpickles the file; only point URLBERT_PT at a trusted checkpoint.
try:
    sd = torch.load(URLBERT_PT, map_location="cpu")
    if isinstance(sd, dict) and "state_dict" in sd: sd = sd["state_dict"]
    # strict=False tolerates key mismatches, so report how many keys actually lined up.
    _load_report = encoder.load_state_dict(sd, strict=False)
    print(f"[urlBERT] checkpoint loaded: {len(_load_report.missing_keys)} missing key(s), "
          f"{len(_load_report.unexpected_keys)} unexpected key(s)")
except Exception as exc:
    print(f"[urlBERT] WARNING: could not load pretrained weights from {URLBERT_PT} "
          f"({type(exc).__name__}: {exc}); encoder remains randomly initialised")
encoder.config.output_hidden_states = True
encoder.to(device)

class CNNHead(nn.Module):
    """Multi-width 1-D convolution head over token representations.

    Each kernel width in ``ks`` gets its own ``Conv1d``; every conv output is passed
    through ReLU and max-pooled over the sequence axis, the pooled vectors are
    concatenated, dropped out and projected to 2 logits.
    """

    def __init__(self, hidden=768, out_ch=256, ks=(2,3,4,5), dropout=0.10):
        super().__init__()
        conv_layers = []
        for width in ks:
            conv_layers.append(nn.Conv1d(hidden, out_ch, width))
        self.convs = nn.ModuleList(conv_layers)
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(out_ch * len(ks), 2)
        # Re-initialise conv weights (Kaiming) and the classifier (Xavier / zero bias).
        for conv in self.convs:
            nn.init.kaiming_uniform_(conv.weight, a=0.1)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, tok):
        # Conv1d convolves over the last axis, so swap axes 1 and 2 of the input first.
        channels_first = tok.transpose(1, 2)
        pooled = []
        for conv in self.convs:
            activated = torch.relu(conv(channels_first))
            pooled.append(activated.max(dim=-1).values)
        joined = torch.cat(pooled, dim=1)
        return self.fc(self.drop(joined))

class URLBERT_CNN(nn.Module):
    """Encoder plus two heads (CNN over tokens, linear on position 0) with a learned blend.

    The forward pass averages the last four hidden-state tensors returned by the
    encoder, feeds the full sequence to the CNN head and the position-0 vector to a
    linear head, and mixes the two logit sets with ``sigmoid(blend_logit)``.
    """

    def __init__(self, enc):
        super().__init__()
        self.enc = enc
        width = enc.config.hidden_size
        self.cnn = CNNHead(hidden=width)
        self.cls = nn.Linear(width, 2)
        nn.init.xavier_uniform_(self.cls.weight)
        nn.init.zeros_(self.cls.bias)
        # Starts at 0.0, i.e. an even 50/50 blend of the two heads.
        self.blend_logit = nn.Parameter(torch.tensor(0.0))

    def forward(self, input_ids, attention_mask):
        enc_out = self.enc(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
            output_hidden_states=True,
        )
        averaged = torch.stack(enc_out.hidden_states[-4:], dim=0).mean(0)
        first_token = averaged[:, 0, :]
        cnn_logits = self.cnn(averaged)
        cls_logits = self.cls(first_token)
        mix = torch.sigmoid(self.blend_logit)
        return mix * cnn_logits + (1 - mix) * cls_logits

# Wrap the encoder with the CNN/linear heads and move the whole model to the target device.
model = URLBERT_CNN(encoder).to(device)
# Two-class cross-entropy over the blended logits.
criterion = nn.CrossEntropyLoss()

from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import TensorDataset, DataLoader

def pretokenize(df, max_len=MAX_LEN_TRAIN, chunk=20000):
    """Tokenize ``df["url"]`` once, up front, and return a ``TensorDataset``.

    URLs are tokenized in slices of ``chunk`` rows, each padded/truncated to
    ``max_len``. The dataset holds (input_ids, attention_mask, label).
    """
    url_list = df["url"].tolist()
    id_chunks, mask_chunks = [], []
    for start in range(0, len(url_list), chunk):
        encoded = tokenizer(
            url_list[start:start + chunk],
            truncation=True,
            padding="max_length",
            max_length=max_len,
            return_tensors="pt",
        )
        id_chunks.append(encoded["input_ids"])
        mask_chunks.append(encoded["attention_mask"])
    all_ids = torch.cat(id_chunks, 0)
    all_masks = torch.cat(mask_chunks, 0)
    labels = torch.tensor(df["label"].astype(int).values, dtype=torch.long)
    return TensorDataset(all_ids, all_masks, labels)

# shrink Stage-1 if time is already low
# NOTE(review): `remaining()` is not defined in any cell visible in this notebook;
# presumably a time-budget helper from a cell that is missing — confirm it exists
# before Restart & Run All.
if remaining() < 200: TIME_BUDGET_STAGE1 = max(10, int(remaining()*0.1))
if remaining() < 200: MAX_LEN_TRAIN = min(MAX_LEN_TRAIN, 80)

# Tokenize the training split once and batch it for the head-training loop.
train_ds = pretokenize(train_df, MAX_LEN_TRAIN)
pin = torch.cuda.is_available()
train_loader = DataLoader(train_ds, batch_size=BATCH_TRAIN, shuffle=True, num_workers=0, pin_memory=pin)

# Freeze the encoder (no gradients, eval mode); only the heads are put in train mode.
for p in model.enc.parameters(): p.requires_grad=False
model.enc.eval(); model.cnn.train(); model.cls.train()
# Optimise every parameter whose name does not start with "enc." (heads + blend weight).
opt = torch.optim.AdamW([p for n,p in model.named_parameters() if not n.startswith("enc.")], lr=1e-3, weight_decay=1e-4)
# Gradient scaling is only enabled when CUDA is available.
scaler = GradScaler(enabled=torch.cuda.is_available())

def forward_frozen(self, input_ids, attention_mask):
    """Forward pass that runs the encoder under ``no_grad`` and only the heads with autograd.

    Same computation as the regular forward: average of the last four hidden states,
    CNN head on the sequence, linear head on position 0, sigmoid-weighted blend.
    """
    with torch.no_grad():
        enc_out = self.enc(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
            output_hidden_states=True,
        )
        averaged = torch.stack(enc_out.hidden_states[-4:], dim=0).mean(0)
        first_token = averaged[:, 0, :]
    # The heads run outside no_grad so their parameters receive gradients.
    cnn_logits = self.cnn(averaged)
    cls_logits = self.cls(first_token)
    mix = torch.sigmoid(self.blend_logit)
    return mix * cnn_logits + (1 - mix) * cls_logits

import types; model.forward_frozen = types.MethodType(forward_frozen, model)

# Stage-1: train only the heads (encoder frozen) until either the wall-clock budget
# or the step cap is reached, cycling through the loader as many times as needed.
MAX_STAGE1_BATCHES = 120
# Only parameters that can receive gradients need clipping.
trainable_params = [p for p in model.parameters() if p.requires_grad]

deadline = time.time() + TIME_BUDGET_STAGE1
batches = 0
while time.time() < deadline and batches < MAX_STAGE1_BATCHES:
    for ids,attn,y in train_loader:
        if time.time() >= deadline or batches >= MAX_STAGE1_BATCHES: break
        ids,attn,y = ids.to(device), attn.to(device), y.to(device)
        opt.zero_grad(set_to_none=True)
        with autocast(enabled=torch.cuda.is_available()):
            logits = model.forward_frozen(ids,attn); loss = criterion(logits,y)
        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale here; unscale them first so
        # the clipping threshold applies to their true magnitude.
        scaler.unscale_(opt)
        torch.nn.utils.clip_grad_norm_(trainable_params, 1.0)
        scaler.step(opt); scaler.update()
        batches += 1


In [None]:
# ----------------- Priors (incl. composites) -----------------
def prior_tables(df):
    """Build rate/count prior dictionaries for every URL-derived key column of ``df``.

    The result maps ``"rate<col>"`` and ``"cnt<col>"`` to the dicts returned by
    ``make_prior`` and ``"global_rate"`` to the smoothed overall positive rate.
    """
    labels = df["label"]
    tables = {}
    for key in ["_url","_host","_dom","_tld","_pf","_pl","_qk","_h_pf","_d_pf","_d_pl"]:
        rate_map, count_map = make_prior(df[key], labels)
        tables[f"rate{key}"] = rate_map
        tables[f"cnt{key}"] = count_map
    tables["global_rate"] = (labels.sum() + 1) / (len(df) + 2)
    return tables

# Priors are estimated on the training split only.
pri = prior_tables(train_df)

def vec_maps(df, pri):
    """Look up prior rate and count for every row of ``df`` and every key column.

    Keys absent from the prior tables get ``pri["global_rate"]`` and count 0.
    Returns a flat 20-tuple ``(rate, count)`` pairs in the order
    _url, _host, _dom, _tld, _pf, _pl, _qk, _h_pf, _d_pf, _d_pl; rates are float32,
    counts int32.
    """
    fallback_rate = pri["global_rate"]
    flat = []
    for tag in ("_url", "_host", "_dom", "_tld", "_pf", "_pl", "_qk", "_h_pf", "_d_pf", "_d_pl"):
        rate_lookup = pri[f"rate{tag}"]
        count_lookup = pri[f"cnt{tag}"]
        keys = df[tag].values
        flat.append(np.array([rate_lookup.get(key, fallback_rate) for key in keys], dtype=np.float32))
        flat.append(np.array([count_lookup.get(key, 0) for key in keys], dtype=np.int32))
    return tuple(flat)


In [None]:
# ----------------- Lexical + Hinge (shared char transforms) -----------------
# dynamically shrink lexical cap & iters if time is tight
if remaining() > 160:
    CAP_TRAIN_LEX = CAP_TRAIN_LEX_BASE
else:
    CAP_TRAIN_LEX = int(CAP_TRAIN_LEX_BASE*0.6)
if remaining() > 160:
    MAX_ITER_SGD = MAX_ITER_SGD_BASE
else:
    MAX_ITER_SGD = max(8, int(MAX_ITER_SGD_BASE*0.75))

# Lexical models train on a seeded random subset of the training split.
n_lex_rows = min(CAP_TRAIN_LEX, len(train_df))
lex_train = train_df.sample(n_lex_rows, random_state=SEED)

urls_tr, y_tr = lex_train["url"].tolist(), lex_train["label"].values
urls_va, y_va = val_df["url"].tolist(), val_df["label"].values
urls_te = test_df["url"].tolist()

def token_analyzer(s):
    """Split ``s`` on runs of URL punctuation and digits; empty pieces are kept."""
    separator_run = re.compile(r'[/\.\?\&=\-_:%\d]+')
    return separator_run.split(s)

# Char vectorizers (shared)
char_vecs = []
for rng in CHAR_CFGS:
    v = HashingVectorizer(analyzer="char", ngram_range=rng, n_features=N_FEATURES,
                          alternate_sign=False, norm="l2")
    char_vecs.append(v)
# Fit both log-loss and hinge on SAME X
char_log_clfs, char_hinge_clfs = [], []
for v in char_vecs:
    Xtr = v.transform(urls_tr); Xva = v.transform(urls_va); Xte = v.transform(urls_te)
    clf_log   = SGDClassifier(loss="log_loss", alpha=1e-5, max_iter=MAX_ITER_SGD, tol=1e-3,
                              penalty="l2", random_state=SEED).fit(Xtr,y_tr)
    clf_hinge = SGDClassifier(loss="hinge",    alpha=5e-6, max_iter=8,              tol=1e-3,
                              penalty="l2", random_state=SEED).fit(Xtr,y_tr)
    # Cache the transformed val/test matrices on the vectorizer for the scoring helper.
    v._Xva = Xva; v._Xte = Xte
    char_log_clfs.append(clf_log)
    char_hinge_clfs.append(clf_hinge)

# Token vectorizers (log-loss only)
# A callable `analyzer` replaces the whole analysis pipeline, so `ngram_range` would be
# ignored and every TOK_CFGS entry would yield the same features. Passing the splitter
# as `tokenizer` with analyzer="word" keeps scikit-learn's n-gram generation active.
# lowercase=False preserves the previous behaviour of not lower-casing tokens.
tok_vecs, tok_log_clfs = [], []
for rng in TOK_CFGS:
    v = HashingVectorizer(analyzer="word", tokenizer=token_analyzer, token_pattern=None,
                          lowercase=False, ngram_range=rng, n_features=N_FEATURES,
                          alternate_sign=False, norm="l2")
    Xtr = v.transform(urls_tr); Xva = v.transform(urls_va); Xte = v.transform(urls_te)
    clf = SGDClassifier(loss="log_loss", alpha=2e-5, max_iter=MAX_ITER_SGD, tol=1e-3,
                        penalty="l2", random_state=SEED).fit(Xtr,y_tr)
    v._Xva = Xva; v._Xte = Xte
    tok_vecs.append(v); tok_log_clfs.append(clf)

def lex_and_hinge_scores(pre_va=True):
    """Average the lexical model scores on the cached validation or test matrices.

    ``pre_va=True`` scores the validation matrices, ``False`` the test matrices.
    Returns ``(p_lex, p_hinge)`` as float32 arrays: the mean probability of all
    log-loss models (char then token) and the mean sigmoid-squashed margin of the
    hinge models.
    """
    cached_attr = "_Xva" if pre_va else "_Xte"

    def cached(vectorizer):
        return getattr(vectorizer, cached_attr)

    lex_scores = [clf.predict_proba(cached(v))[:, 1] for v, clf in zip(char_vecs, char_log_clfs)]
    hinge_scores = [
        1 / (1 + np.exp(-clf.decision_function(cached(v))))
        for v, clf in zip(char_vecs, char_hinge_clfs)
    ]
    lex_scores.extend(clf.predict_proba(cached(v))[:, 1] for v, clf in zip(tok_vecs, tok_log_clfs))

    p_lex = np.stack(lex_scores, 0).mean(axis=0).astype(np.float32)
    p_hinge = np.stack(hinge_scores, 0).mean(axis=0).astype(np.float32)
    return p_lex, p_hinge

p_lex_val, p_hinge_val = lex_and_hinge_scores(pre_va=True)
p_lex_test, p_hinge_test = lex_and_hinge_scores(pre_va=False)


In [None]:
# ----------------- Prior blend (grid on VAL) -----------------
# Per-row prior rates (r_*) and training counts (c_*) for the validation (_v) and
# test (_t) splits, looked up in the training-derived tables `pri`.
(r_url_v,c_url_v, r_host_v,c_host_v, r_dom_v,c_dom_v, r_tld_v,c_tld_v,
 r_pf_v,c_pf_v, r_pl_v,c_pl_v, r_qk_v,c_qk_v, r_hpf_v,c_hpf_v, r_dpf_v,c_dpf_v, r_dpl_v,c_dpl_v) = vec_maps(val_df, pri)
(r_url_t,c_url_t, r_host_t,c_host_t, r_dom_t,c_dom_t, r_tld_t,c_tld_t,
 r_pf_t,c_pf_t, r_pl_t,c_pl_t, r_qk_t,c_qk_t, r_hpf_t,c_hpf_t, r_dpf_t,c_dpf_t, r_dpl_t,c_dpl_t) = vec_maps(test_df, pri)

def blend_priors(vals_tuple, weights):
    """Return the weighted average of several per-row rate vectors.

    ``vals_tuple`` holds one 1-D array per prior, ``weights`` one weight per prior;
    the result has one value per row.
    """
    per_row = np.vstack(vals_tuple).T          # rows x priors
    weight_vec = np.array(weights, dtype=np.float32)
    total_weight = weight_vec.sum()
    return np.dot(per_row, weight_vec) / total_weight

# Candidate weightings, in the order url, host, dom, tld, pf, pl, qk, host+pf, dom+pf, dom+pl.
PRIOR_WEIGHTS_GRID = [
    (6,10,12,2,3,3,1, 8,6,4),
    (5, 9,12,2,2,2,1, 7,6,4),
    (4, 8,10,2,2,2,1, 7,6,4),
    (3, 7,10,1,1,1,1, 6,5,3),
]
val_prior_rates = (r_url_v,r_host_v,r_dom_v,r_tld_v,r_pf_v,r_pl_v,r_qk_v, r_hpf_v,r_dpf_v,r_dpl_v)
test_prior_rates = (r_url_t,r_host_t,r_dom_t,r_tld_t,r_pf_t,r_pl_t,r_qk_t, r_hpf_t,r_dpf_t,r_dpl_t)

# Grid over (prior weighting, lexical-vs-prior mix); keep the first combination with
# the highest validation accuracy.
best = {"acc":-1}
for w in PRIOR_WEIGHTS_GRID:
    ppri_v = blend_priors(val_prior_rates, w)
    for wl in np.linspace(0.0,1.0,11):
        pv = wl*p_lex_val + (1.0-wl)*ppri_v
        thr, acc = best_acc_threshold(y_va, pv)
        if acc > best["acc"]:
            best = {"acc":acc, "thr":thr, "wl":float(wl), "w":w}

# Rebuild the winning blend for both splits.
lex_weight, prior_weight = best["wl"], 1.0-best["wl"]
ppri_t = blend_priors(test_prior_rates, best["w"])
p_base_val  = lex_weight*p_lex_val  + prior_weight*blend_priors(val_prior_rates, best["w"])
p_base_test = lex_weight*p_lex_test + prior_weight*ppri_t
thr0 = best["thr"]
acc0 = best["acc"]


In [None]:
# ----------------- urlBERT on strict Top-K uncertain (dynamically reduced) -----------------
def select_topk_uncertain(p, max_n):
    """Return the indices of the ``max_n`` scores in ``p`` closest to 0.5.

    A non-positive ``max_n`` yields an empty int64 array.
    """
    if max_n <= 0:
        return np.array([], dtype=np.int64)
    distance_from_half = np.abs(p - 0.5)
    ranked = distance_from_half.argsort()
    return ranked[:max_n]

def bert_probs_fast(urls, init_bs=BATCH_INFER_INIT, min_bs=BATCH_INFER_MIN, max_len=MAX_LEN_INFER):
    """Score ``urls`` with the model and return the class-1 probabilities (float32).

    Batches start at ``init_bs``; when a batch raises a CUDA out-of-memory
    ``RuntimeError`` and the batch size is still above ``min_bs``, the size is
    halved and the same batch retried. Any other error is re-raised.
    """
    model.eval()
    total = len(urls)
    probs = np.empty(total, dtype=np.float32)
    start, batch_size = 0, init_bs
    while start < total:
        try:
            stop = min(start + batch_size, total)
            encoded = tokenizer(urls[start:stop], truncation=True, padding=True,
                                max_length=max_len, return_tensors="pt")
            ids = encoded["input_ids"].to(device, non_blocking=True)
            attn = encoded["attention_mask"].to(device, non_blocking=True)
            with torch.inference_mode(), torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
                batch_probs = torch.softmax(model(ids, attn), 1)[:, 1].float().cpu().numpy()
            probs[start:stop] = batch_probs
            start = stop
        except RuntimeError as err:
            cannot_shrink = batch_size <= min_bs
            if "CUDA out of memory" not in str(err) or cannot_shrink:
                raise
            # Free cached blocks and retry the same slice with half the batch.
            torch.cuda.empty_cache()
            batch_size //= 2
    return probs

# shrink BERT work if time is tight
def pick_k(base_k):
    """Scale the Top-K budget ``base_k`` down according to ``remaining()``.

    Below 70 -> a quarter (at least 1000); below 140 -> half (at least 2000);
    otherwise ``base_k`` unchanged.
    """
    left = remaining()
    # (upper limit for `left`, floor for K, divisor), checked tightest first.
    for limit, floor_k, divisor in ((70, 1000, 4), (140, 2000, 2)):
        if left < limit:
            return max(floor_k, base_k // divisor)
    return base_k

MAX_BERT_VAL  = pick_k(MAX_BERT_VAL_BASE)
MAX_BERT_TEST = pick_k(MAX_BERT_TEST_BASE)
if remaining() < 200:
    MAX_LEN_INFER = min(MAX_LEN_INFER, 80)

# Materialise the URL columns once; rebuilding the list for every selected index made
# the selection cost grow with rows x K.
val_urls  = val_df["url"].to_numpy()
test_urls = test_df["url"].to_numpy()

# Re-score only the K most uncertain validation rows with urlBERT and check whether
# that actually improves the best achievable validation accuracy.
# max_len is passed explicitly: the function's default was bound when it was defined,
# so the shrink applied to MAX_LEN_INFER above would otherwise never take effect.
idx_v = select_topk_uncertain(p_base_val,  MAX_BERT_VAL)
p_val_try = p_base_val.copy()
if idx_v.size:
    p_val_try[idx_v] = bert_probs_fast(val_urls[idx_v].tolist(), max_len=MAX_LEN_INFER)
thr1, acc1 = best_acc_threshold(y_va, p_val_try)
use_bert = (acc1 - acc0) >= VAL_IMPROVE_MARGIN

if use_bert and remaining()>40:   # ensure we have time left for test BERT
    p_val = p_val_try
    idx_t = select_topk_uncertain(p_base_test, MAX_BERT_TEST)
    p_test = p_base_test.copy()
    if idx_t.size:
        p_test[idx_t] = bert_probs_fast(test_urls[idx_t].tolist(), max_len=MAX_LEN_INFER)
else:
    # Keep val and test consistent: either both use BERT-refined scores or neither does.
    p_val, p_test = p_base_val, p_base_test

In [None]:
# ----------------- Engineered features + HGB meta (iterations shrink if needed) -----------------
def featurize(df):
    """Build the hand-engineered numeric feature matrix for a URL frame.

    Reads the columns ``url``, ``_host`` and ``_tld`` of ``df``. Per row the
    features are, in order: URL / host / path / query lengths, dots in host,
    hyphens, slashes, digits and percent signs in the URL, path depth, number of
    query parameters, digit fraction, two-letter-TLD flag, ``.com`` flag, followed
    by one 0/1 flag per keyword in ``kw_list`` (substring match on the lower-cased
    URL).

    Each URL is parsed once. A URL that ``urlparse`` rejects contributes an empty
    path and query instead of aborting. An empty frame yields a ``(0, n_features)``
    array.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(df), 14 + len(kw_list))``.
    """
    kw_list = ["login","verify","account","update","secure","confirm","pay","bank","free","click","signin","reset","gift","offer"]
    n_cols = 14 + len(kw_list)

    rows = []
    for url, host, tld_value in zip(df["url"].values, df["_host"].values, df["_tld"].values):
        try:
            parsed = urlparse(url)
            path, query = parsed.path, parsed.query
        except ValueError:
            # Same best-effort stance as the URL canonicalisers: malformed -> empty parts.
            path, query = "", ""
        url_len = len(url)
        n_digits = sum(ch.isdigit() for ch in url)
        lowered = url.lower()
        tld_text = str(tld_value)
        rows.append([
            url_len, len(host), len(path), len(query),
            host.count("."), url.count("-"), url.count("/"),
            n_digits, url.count("%"),
            sum(1 for seg in path.split("/") if seg),             # path depth
            len(parse_qsl(query, keep_blank_values=True)),         # query parameter count
            n_digits / max(1, url_len),                            # digit fraction
            1 if len(tld_text) == 2 else 0,
            1 if tld_text == "com" else 0,
            *[1 if kw in lowered else 0 for kw in kw_list],
        ])

    if not rows:
        return np.zeros((0, n_cols), dtype=np.float32)
    # Go through float64 first so values round to float32 exactly as a stacked array would.
    return np.asarray(rows, dtype=np.float64).astype(np.float32)

# Engineered features and labels for each split.
X_tr, y_tr = featurize(train_df), train_df["label"].values
X_va, y_va = featurize(val_df), val_df["label"].values
X_te = featurize(test_df)

# Fewer boosting rounds when the time budget is running low.
if remaining() > 120:
    HGB_ITERS = 200
else:
    HGB_ITERS = 120
if remaining() > 90:
    meta_ITERS = 160
else:
    meta_ITERS = 120

hgb = HistGradientBoostingClassifier(
    max_depth=6,
    max_iter=HGB_ITERS,
    learning_rate=0.1,
    validation_fraction=None,
    random_state=SEED,
)
hgb.fit(X_tr, y_tr)

def _positive_proba(features):
    # Column 1 of predict_proba, stored as float32.
    return hgb.predict_proba(features)[:, 1].astype(np.float32)

p_hgb_val = _positive_proba(X_va)
p_hgb_test = _positive_proba(X_te)


In [None]:
# ----------------- Meta stacker (HGB) over scores + cheap priors -----------------
# Fallback rate for keys never seen in training.
gr = pri["global_rate"]
def mapv(df, tag):
    """Return (rate, count) arrays for column ``tag`` of ``df`` from the global priors.

    Unseen keys get the global rate and count 0; counts are clipped to [0, 1000].
    Rates are float32, counts int32.
    """
    rate_lookup = pri[f"rate{tag}"]
    count_lookup = pri[f"cnt{tag}"]
    keys = df[tag].values
    rates = np.array([rate_lookup.get(key, gr) for key in keys], dtype=np.float32)
    counts = np.array([count_lookup.get(key, 0) for key in keys], dtype=np.int32)
    clipped = np.clip(counts, 0, 1000).astype(np.int32)
    return rates, clipped

# Prior rate / clipped count features for the meta stacker: *_v2 on validation, *_t2 on test.
r_host_v2,c_host_v2 = mapv(val_df, "_host");   r_dom_v2,c_dom_v2 = mapv(val_df, "_dom")
r_tld_v2,c_tld_v2   = mapv(val_df, "_tld");    r_pf_v2, c_pf_v2  = mapv(val_df, "_pf")
r_host_t2,c_host_t2 = mapv(test_df, "_host");  r_dom_t2,c_dom_t2 = mapv(test_df, "_dom")
r_tld_t2,c_tld_t2   = mapv(test_df, "_tld");   r_pf_t2, c_pf_t2  = mapv(test_df, "_pf")

def depth_vec(df):
    """Return, as int16, the number of non-empty path segments of each ``df["url"]``."""
    depths = []
    for url in df["url"].values:
        segments = urlparse(url).path.split('/')
        depths.append(sum(1 for segment in segments if segment))
    return np.array(depths, dtype=np.int16)

depth_v = depth_vec(val_df); depth_t = depth_vec(test_df)

# Stacker inputs: model scores, host/domain/TLD priors and counts, path depth.
S_val = np.column_stack([
    p_val,                       # urlBERT-gated
    p_lex_val,                   # lexical
    p_base_val,                  # prior-blend ref
    p_hgb_val,                   # engineered HGB
    p_hinge_val,                 # hinge char
    r_host_v2, r_dom_v2, r_tld_v2,
    c_host_v2, c_dom_v2, c_tld_v2,
    depth_v
]).astype(np.float32)

S_tst = np.column_stack([
    p_test,
    p_lex_test,
    p_base_test,
    p_hgb_test,
    p_hinge_test,
    r_host_t2, r_dom_t2, r_tld_t2,
    c_host_t2, c_dom_t2, c_tld_t2,
    depth_t
]).astype(np.float32)

from sklearn.model_selection import StratifiedKFold, cross_val_predict

meta = HistGradientBoostingClassifier(max_depth=4, max_iter=meta_ITERS, learning_rate=0.12,
                                      validation_fraction=None, random_state=SEED)

# The meta model is trained on the validation split, and its validation scores are
# then used to tune overrides and thresholds. Scoring the same rows it was fitted on
# makes those scores look far better than they generalise, so use out-of-fold
# predictions for validation and the full fit only for test.
_min_class = int(np.bincount(np.asarray(y_va, dtype=np.int64), minlength=2).min())
_n_folds = min(5, _min_class)
if _n_folds >= 2:
    _cv = StratifiedKFold(n_splits=_n_folds, shuffle=True, random_state=SEED)
    p_meta_val = cross_val_predict(meta, S_val, y_va, cv=_cv,
                                   method="predict_proba")[:,1].astype(np.float32)
    meta.fit(S_val, y_va)
else:
    # Too few samples of one class to cross-validate; fall back to in-sample scores.
    meta.fit(S_val, y_va)
    p_meta_val = meta.predict_proba(S_val)[:,1].astype(np.float32)
p_meta_test = meta.predict_proba(S_tst)[:,1].astype(np.float32)

# Use meta output for final stage
p_val, p_test = p_meta_val, p_meta_test

In [None]:
# ----------------- ACC overrides + per-bucket thresholds -----------------
def vec_all_for_override(df):
    """Look up prior rate and training count per row of ``df`` for every override key.

    Returns a flat 20-tuple of ``(rate, count)`` pairs in the order
    _host, _dom, _tld, _url, _pf, _pl, _qk, _h_pf, _d_pf, _d_pl. Unseen keys get the
    global rate ``gr`` and count 0; rates are float32 and counts int32 (unclipped).
    """
    flat = []
    for tag in ("_host", "_dom", "_tld", "_url", "_pf", "_pl", "_qk", "_h_pf", "_d_pf", "_d_pl"):
        rate_lookup = pri[f"rate{tag}"]
        count_lookup = pri[f"cnt{tag}"]
        keys = df[tag].values
        flat.append(np.array([rate_lookup.get(key, gr) for key in keys], dtype=np.float32))
        flat.append(np.array([count_lookup.get(key, 0) for key in keys], dtype=np.int32))
    return tuple(flat)

(v_rh,v_ch,v_rd,v_cd,v_rt,v_ct,v_ru,v_cu,v_rpf,v_cpf,v_rpl,v_cpl,v_rqk,v_cqk,
 v_rhpf,v_chpf, v_rdpf,v_cdpf, v_rdpl,v_cdpl) = vec_all_for_override(val_df)
(t_rh,t_ch,t_rd,t_cd,t_rt,t_ct,t_ru,t_cu,t_rpf,t_cpf,t_rpl,t_cpl,t_rqk,t_cqk,
 t_rhpf,t_chpf, t_rdpf,t_cdpf, t_rdpl,t_cdpl) = vec_all_for_override(test_df)

# Pack the (count, rate) pairs once, in the positional order apply_overrides expects:
# host, dom, tld, url, pf, pl, qk, host+pf, dom+pf, dom+pl. Building the pack in one
# place keeps validation and test in step (the test call used to pass the query-key
# rate where the query-key count belongs).
val_override_args = (v_ch,v_rh, v_cd,v_rd, v_ct,v_rt, v_cu,v_ru, v_cpf,v_rpf, v_cpl,v_rpl, v_cqk,v_rqk,
                     v_chpf,v_rhpf, v_cdpf,v_rdpf, v_cdpl,v_rdpl)
test_override_args = (t_ch,t_rh, t_cd,t_rd, t_ct,t_rt, t_cu,t_ru, t_cpf,t_rpf, t_cpl,t_rpl, t_cqk,t_rqk,
                      t_chpf,t_rhpf, t_cdpf,t_rdpf, t_cdpl,t_rdpl)

# Grid-search (min count, low cut-off, high cut-off) on validation accuracy.
thr_start, _ = best_acc_threshold(y_va, p_val)
ov_best = {"acc":-1, "thr":thr_start}
for mc in MIN_COUNT_LIST:
    for plo in P_LO_LIST:
        for phi in P_HI_LIST:
            pv = apply_overrides(p_val, *val_override_args, mc, plo, phi)
            thr, acc = best_acc_threshold(y_va, pv)
            if acc>ov_best["acc"]:
                ov_best={"acc":acc,"thr":thr,"mc":mc,"plo":plo,"phi":phi}

# Apply the winning setting identically to both splits.
p_val  = apply_overrides(p_val,  *val_override_args,
                         ov_best["mc"], ov_best["plo"], ov_best["phi"])
p_test = apply_overrides(p_test, *test_override_args,
                         ov_best["mc"], ov_best["plo"], ov_best["phi"])

# per-bucket thresholds: host x domain x TLD counts
def _train_counts(frame, column, table_key):
    # Training occurrence count of each row's key (0 when unseen), as int32.
    table = pri[table_key]
    return np.array([table.get(key, 0) for key in frame[column].values], dtype=np.int32)

vh, th = _train_counts(val_df, "_host", "cnt_host"), _train_counts(test_df, "_host", "cnt_host")
vd, td = _train_counts(val_df, "_dom", "cnt_dom"), _train_counts(test_df, "_dom", "cnt_dom")
vt, tt = _train_counts(val_df, "_tld", "cnt_tld"), _train_counts(test_df, "_tld", "cnt_tld")

vhb, thb = bucketize_vec(vh, HOST_BUCKETS), bucketize_vec(th, HOST_BUCKETS)
vdb, tdb = bucketize_vec(vd, DOM_BUCKETS),  bucketize_vec(td, DOM_BUCKETS)
vtb, ttb = bucketize_vec(vt, TLD_BUCKETS),  bucketize_vec(tt, TLD_BUCKETS)

# Flatten the three bucket indices into one combined bucket id.
NHB, NDB, NTB = len(HOST_BUCKETS), len(DOM_BUCKETS), len(TLD_BUCKETS)
NBUCKETS   = NHB * NDB * NTB
val_combo  = (vhb*NDB + vdb)*NTB + vtb
test_combo = (thb*NDB + tdb)*NTB + ttb

# Every bucket starts from the global threshold; buckets holding more than one
# validation row get their own accuracy-optimal threshold.
bucket_thr = np.full(NBUCKETS, ov_best["thr"], dtype=np.float32)
for bi, n_rows in zip(*np.unique(val_combo, return_counts=True)):
    if n_rows > 1:
        m = (val_combo == bi)
        thr_bi, _ = best_acc_threshold(y_va[m], p_val[m])
        bucket_thr[bi] = thr_bi

In [None]:
# ----------------- Final TEST prediction & single-line metrics -----------------
from sklearn.metrics import roc_auc_score
y_test = test_df["label"].values

# Each test row is compared against the threshold of its own combined bucket.
row_threshold = bucket_thr[test_combo]
yhat_test = (p_test >= row_threshold).astype(y_test.dtype)

acc = accuracy_score(y_test, yhat_test)
rec = recall_score(y_test, yhat_test, zero_division=0)
f1 = f1_score(y_test, yhat_test, zero_division=0)
# AUC falls back to NaN when it cannot be computed.
try:
    auc = roc_auc_score(y_test, p_test)
except:
    auc = float("nan")

print(f"FINAL_TEST_METRICS acc={acc:.6f} auc={auc:.6f} f1={f1:.6f} rec={rec:.6f}")
ttotal()

FINAL_TEST_METRICS acc=0.971693 auc=0.996113 f1=0.971573 rec=0.967462
[TIMER] TOTAL: 266.8s
