In [None]:
# ===============================================================
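# SBERT Embedding Generation
# Outputs (under ./embeddings/<model tag>/):
#   raw/sbert_{train,test}.npy                 - sentence embeddings
#   pca/text_pca_{train,test}.parquet          - PCA features (+ optional length features)
#   severity/text_severity_{oof,test}.parquet  - Ridge text-severity predictions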
# ===============================================================

from pathlib import Path
import numpy as np
import pandas as pd

import torch
from sentence_transformers import SentenceTransformer

from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import root_mean_squared_error, r2_score

# --------------------
# Config
# --------------------
# Reproducibility: one seed shared by NumPy and PyTorch.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Input files.
DATA_DIR = Path("./data")
TRAIN_PATH = DATA_DIR / "train_preprocessed.csv"
TEST_PATH = DATA_DIR / "test_preprocessed.csv"

# Encoder checkpoint; the part after the last "/" names the output sub-folder.
SBERT_MODEL = "sentence-transformers/all-mpnet-base-v2"
MODEL_TAG = SBERT_MODEL.rsplit("/", 1)[-1]                # e.g. all-mpnet-base-v2

# Output tree: ./embeddings/<MODEL_TAG>/{raw,pca,severity}
EMB_DIR = Path("./embeddings")
EMB_MODEL_DIR = EMB_DIR / MODEL_TAG
RAW_DIR = EMB_MODEL_DIR / "raw"
PCA_DIR = EMB_MODEL_DIR / "pca"
SEV_DIR = EMB_MODEL_DIR / "severity"
for _out_dir in (EMB_DIR, EMB_MODEL_DIR, RAW_DIR, PCA_DIR, SEV_DIR):
    # Created eagerly so later save calls never hit a missing directory.
    _out_dir.mkdir(parents=True, exist_ok=True)

# Encoding options.
BATCH_SIZE = 512
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
EMB_NORMALIZE = True                 # scale each embedding to unit length

# PCA / feature options.
ADD_LENGTH_FEATURES = True           # append description lengths to the PCA parquet
DO_PCA = True                        # toggle PCA parquet output
N_COMPONENTS_PCA = 128               # PCA components (fit on train, applied to test)

# Column names expected in both CSVs.
ID_COL = "ClaimNumber"
TEXT_COL = "ClaimDescription"

# --------------------
# Utilities
# --------------------
def normalize_text(s: pd.Series) -> pd.Series:
    """Return ``s`` as tidy strings.

    Missing values become the empty string, every value is cast to ``str``,
    each run of whitespace is collapsed to a single space, and leading and
    trailing whitespace is removed. The index of ``s`` is kept.
    """
    as_text = s.fillna("").astype(str)
    single_spaced = as_text.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.strip()

def to_numpy_backed(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with column dtypes settled before it is written out.

    * columns whose name starts with ``text_pca_`` are cast to float32;
    * ``desc_char_len`` / ``desc_token_len`` (when present) are coerced to
      numbers, unparseable or missing entries become 0, then cast to int32;
    * the ``ID_COL`` column (when present) is cast to ``str``;
    * finally pandas is asked to convert dtypes with the ``numpy_nullable``
      backend.

    The input frame is not modified.
    """
    out = df.copy()

    pca_names = [name for name in out.columns if name.startswith("text_pca_")]
    for name in pca_names:
        out[name] = out[name].astype(np.float32)

    for name in ("desc_char_len", "desc_token_len"):
        if name not in out.columns:
            continue
        as_number = pd.to_numeric(out[name], errors="coerce")
        out[name] = as_number.fillna(0).astype(np.int32)

    if ID_COL in out.columns:
        out[ID_COL] = out[ID_COL].astype(str)

    # A TypeError here is tolerated and the frame is returned without this last
    # conversion — presumably for pandas versions whose convert_dtypes does not
    # accept ``dtype_backend`` (TODO confirm).
    try:
        converted = out.convert_dtypes(dtype_backend="numpy_nullable")
    except TypeError:
        return out
    return converted

def load_data(train_path: Path, test_path: Path) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the train and test CSVs and return them as ``(train, test)``.

    Every +inf / -inf value in either frame is replaced by NaN.

    Raises:
        AssertionError: if ``ID_COL`` or ``TEXT_COL`` is missing from either frame.
    """
    tr, te = (
        pd.read_csv(csv_path).replace([np.inf, -np.inf], np.nan)
        for csv_path in (train_path, test_path)
    )
    # Both frames must carry the id and the text column used downstream.
    assert all(ID_COL in frame.columns for frame in (tr, te)), f"Missing {ID_COL}"
    assert all(TEXT_COL in frame.columns for frame in (tr, te)), f"Missing {TEXT_COL}"
    return tr, te

def build_lengths(df: pd.DataFrame, src_col: str) -> tuple[np.ndarray, np.ndarray]:
    """Compute per-row character and whitespace-token counts for a text column.

    Args:
        df: frame holding the text column.
        src_col: name of the column to measure.

    Returns:
        ``(char_len, tok_len)`` — two int32 arrays with one entry per row of
        ``df``. ``tok_len`` counts the pieces produced by splitting on
        whitespace, so an empty string has 0 tokens.

    Missing values (None / NaN) are counted as length 0. Without the
    ``fillna(0)`` the string accessors yield NaN for those rows and the
    int32 cast raises.
    """
    text = df[src_col]
    char_len = text.str.len().fillna(0).astype(np.int32).to_numpy()
    tok_len = text.str.split().str.len().fillna(0).astype(np.int32).to_numpy()
    return char_len, tok_len

def encode_texts(model: SentenceTransformer, texts: pd.Series,
                 batch_size: int, normalize: bool) -> np.ndarray:
    """Embed ``texts`` with ``model`` and return the result as a float32 array.

    Args:
        model: encoder whose ``encode`` method is called.
        texts: strings to embed; passed to the encoder as a plain list.
        batch_size: forwarded to ``encode`` as ``batch_size``.
        normalize: forwarded to ``encode`` as ``normalize_embeddings``.

    A progress bar is always requested from the encoder.
    """
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "convert_to_numpy": True,
        "normalize_embeddings": normalize,
    }
    vectors = model.encode(texts.tolist(), **encode_options)
    return vectors.astype(np.float32)

def run_pca(train_emb: np.ndarray, test_emb: np.ndarray,
            n_components: int, seed: int) -> tuple[np.ndarray, np.ndarray, PCA, float]:
    """Fit PCA on the train embeddings and project both splits.

    Args:
        train_emb: matrix the PCA is fitted on.
        test_emb: matrix projected with the already-fitted PCA (never fitted on).
        n_components: number of components to keep.
        seed: ``random_state`` handed to the PCA.

    Returns:
        ``(train_projection, test_projection, fitted_pca, explained)`` where the
        projections are float32 and ``explained`` is the sum of the fitted
        ``explained_variance_ratio_``.
    """
    reducer = PCA(n_components=n_components, svd_solver="auto", random_state=seed)
    train_proj = reducer.fit_transform(train_emb).astype(np.float32)
    test_proj = reducer.transform(test_emb).astype(np.float32)
    explained = float(reducer.explained_variance_ratio_.sum())
    return train_proj, test_proj, reducer, explained


In [None]:
# --------------------
# Orchestration
# --------------------
# 1) Read both CSVs and add a whitespace-normalized copy of the text column
norm_col = f"{TEXT_COL}_norm"
train, test = load_data(TRAIN_PATH, TEST_PATH)
for split_df in (train, test):
    split_df[norm_col] = normalize_text(split_df[TEXT_COL])

# Length arrays are only built when they will be appended to the PCA parquet.
if ADD_LENGTH_FEATURES:
    (tr_char, tr_tok), (te_char, te_tok) = (
        build_lengths(split_df, norm_col) for split_df in (train, test)
    )

# 2) Instantiate the sentence encoder on the selected device
model = SentenceTransformer(SBERT_MODEL, device=DEVICE)
emb_dim = model.get_sentence_embedding_dimension()
print(f"Using SBERT: {SBERT_MODEL} | Emb dim: {emb_dim} | Device: {DEVICE}")

# 3) Embed the normalized text: train first, then test
train_emb, test_emb = (
    encode_texts(model, split_df[norm_col], BATCH_SIZE, EMB_NORMALIZE)
    for split_df in (train, test)
)
print(f"Embeddings: train {train_emb.shape}, test {test_emb.shape}")

# 4) Persist the raw embedding matrices (.npy); rows follow the CSV row order
raw_train_path = RAW_DIR / "sbert_train.npy"
raw_test_path = RAW_DIR / "sbert_test.npy"
for npy_path, matrix in ((raw_train_path, train_emb), (raw_test_path, test_emb)):
    np.save(npy_path, matrix)
print(f"Saved raw embeddings: {raw_train_path}, {raw_test_path}")

# 5) PCA features → optional parquet with ClaimNumber (+ optional length features)
# The projection itself is always computed: the text-severity model in step 6
# reads tr_pca / te_pca, so leaving them undefined when DO_PCA is False would
# end the run with a NameError. DO_PCA only controls whether the parquet files
# are written.
tr_pca, te_pca, pca, expl = run_pca(train_emb, test_emb, N_COMPONENTS_PCA, SEED)
print(f"PCA: n_components={N_COMPONENTS_PCA} | explained_variance_sum={expl:.4f}")

if DO_PCA:
    pca_cols = [f"text_pca_{i}" for i in range(N_COMPONENTS_PCA)]

    # One row per claim, id first. Rows are aligned with `train` / `test`
    # purely by position, since the embeddings were computed in frame order.
    pca_tr_df = pd.DataFrame(tr_pca, columns=pca_cols)
    pca_tr_df.insert(0, ID_COL, train[ID_COL].astype(str).values)
    pca_te_df = pd.DataFrame(te_pca, columns=pca_cols)
    pca_te_df.insert(0, ID_COL, test[ID_COL].astype(str).values)

    # tr_char / tr_tok / te_char / te_tok exist only under this same flag (step 1).
    if ADD_LENGTH_FEATURES:
        pca_tr_df["desc_char_len"] = tr_char
        pca_tr_df["desc_token_len"] = tr_tok
        pca_te_df["desc_char_len"] = te_char
        pca_te_df["desc_token_len"] = te_tok

    # Settle column dtypes before writing.
    pca_tr_df = to_numpy_backed(pca_tr_df)
    pca_te_df = to_numpy_backed(pca_te_df)

    out_tr = PCA_DIR / "text_pca_train.parquet"
    out_te = PCA_DIR / "text_pca_test.parquet"
    pca_tr_df.to_parquet(out_tr, engine="pyarrow", index=False)
    pca_te_df.to_parquet(out_te, engine="pyarrow", index=False)
    print(f"Saved PCA parquet: {out_tr}, {out_te}")

# 6) Text severity (Ridge OOF on logUICC)
# Fits Ridge regressions of train["logUICC"] on the PCA text features from
# step 5 and saves (a) out-of-fold predictions for every train row and
# (b) test predictions averaged over the fold models.
X = tr_pca
Xtest = te_pca
y = train["logUICC"].values
# Folds are stratified on accident_year, not on the regression target.
y_strat = train["accident_year"].astype(int).values

RIDGE_ALPHAS = np.logspace(-2, 2, 9)  # [0.01..100]
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

# oof[i] is filled by the one fold model that did not train on row i.
oof = np.zeros(len(train), dtype=np.float32)
test_pred_folds = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y_strat), 1):
    Xtr, ytr = X[tr_idx], y[tr_idx]
    Xva, yva = X[va_idx], y[va_idx]

    # Inner 3-fold CV on the fold's training rows picks alpha by MSE ...
    rc = RidgeCV(alphas=RIDGE_ALPHAS, cv=3, scoring="neg_mean_squared_error")
    rc.fit(Xtr, ytr)
    alpha = float(rc.alpha_)

    # ... then a plain Ridge is fitted at that alpha on the same rows.
    mdl = Ridge(alpha=alpha, random_state=SEED)
    mdl.fit(Xtr, ytr)

    oof[va_idx] = mdl.predict(Xva).astype(np.float32)
    # Each fold model also scores the full test set; averaged after the loop.
    test_pred_folds.append(mdl.predict(Xtest).astype(np.float32))

# OOF diagnostics
oof_rmse_log = root_mean_squared_error(y, oof)
# expm1 is applied to both target and prediction before the second RMSE —
# presumably logUICC was built with log1p; TODO confirm against preprocessing.
oof_rmse_u   = root_mean_squared_error(np.expm1(y), np.expm1(oof))
oof_r2       = r2_score(y, oof)
print(f"[text_severity] OOF: RMSE(log)={oof_rmse_log:.4f} | R2={oof_r2:.4f} | RMSE(orig)={oof_rmse_u:,.2f}")

# full-train model and test preds
# NOTE(review): test_pred_full is computed here but is not written by the
# save step below, which stores test_pred_stack (the fold average).
rc_full = RidgeCV(alphas=RIDGE_ALPHAS, cv=5, scoring="neg_mean_squared_error").fit(X, y)
ridge_full = Ridge(alpha=float(rc_full.alpha_), random_state=SEED).fit(X, y)
test_pred_full = ridge_full.predict(Xtest).astype(np.float32)

# Element-wise mean of the per-fold test predictions.
test_pred_stack = np.mean(np.stack(test_pred_folds, axis=0), axis=0).astype(np.float32)

# Save severity parquet
# Ids are attached by position: row i of oof / test_pred_stack goes with row i
# of train / test.
sev_oof_df = pd.DataFrame({ID_COL: train[ID_COL].astype(str).values, "text_sev_oof": oof})
sev_te_df  = pd.DataFrame({ID_COL: test[ID_COL].astype(str).values,  "text_sev_pred": test_pred_stack})

sev_oof_df.to_parquet(SEV_DIR / "text_severity_oof.parquet", index=False)
sev_te_df.to_parquet(SEV_DIR / "text_severity_test.parquet", index=False)
print(f"Saved severity parquet: {SEV_DIR / 'text_severity_oof.parquet'}, {SEV_DIR / 'text_severity_test.parquet'}")


Using SBERT: sentence-transformers/all-mpnet-base-v2 | Emb dim: 768 | Device: cuda


Batches: 100%|██████████| 106/106 [00:39<00:00,  2.71it/s]
Batches: 100%|██████████| 71/71 [00:26<00:00,  2.65it/s]


Embeddings: train (54000, 768), test (36000, 768)
Saved raw embeddings: embeddings\all-mpnet-base-v2\raw\sbert_train.npy, embeddings\all-mpnet-base-v2\raw\sbert_test.npy
PCA: n_components=128 | explained_variance_sum=0.9144
Saved PCA parquet: embeddings\all-mpnet-base-v2\pca\text_pca_train.parquet, embeddings\all-mpnet-base-v2\pca\text_pca_test.parquet
[text_severity] OOF: RMSE(log)=1.1641 | R2=0.4178 | RMSE(orig)=33,381.98
Saved severity parquet: embeddings\all-mpnet-base-v2\severity\text_severity_oof.parquet, embeddings\all-mpnet-base-v2\severity\text_severity_test.parquet
