In [8]:
import numpy as np, pandas as pd, torch
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from sklearn.metrics import classification_report

# Directory passed to from_pretrained() for the tokenizer, config and model.
FINAL_DIR = "artifacts/distilbert_resume_cls_final"
# Held-out split that is scored by infer_predictions_df().
TEST_PARQUET = "../data/processed/classification_test.parquet"
# Training split; read only when label names have to be rebuilt.
TRAIN_PARQUET = "../data/processed/classification_train.parquet"

def load_inference_bundle(final_dir=FINAL_DIR):
    """Load the tokenizer and the classification model stored in ``final_dir``.

    The config is loaded explicitly and handed to the model constructor, and
    the model is switched to eval mode before it is returned.

    Returns:
        A ``(tokenizer, model)`` pair.
    """
    config = AutoConfig.from_pretrained(final_dir)
    tokenizer = AutoTokenizer.from_pretrained(final_dir)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        final_dir, config=config
    )
    classifier.eval()
    return tokenizer, classifier

def resolve_id2label(model, train_parquet=TRAIN_PARQUET):
    """Return an ``{int id: label name}`` mapping for ``model``'s outputs.

    The mapping saved in ``model.config.id2label`` is used when it holds real
    names. If it is missing, empty, or consists only of generic ``LABEL_x``
    placeholders, names are rebuilt from the ``label`` column of
    ``train_parquet``, with ids assigned in sorted order of the raw values.

    Args:
        model: Object exposing a ``config`` attribute.
        train_parquet: Parquet file with a ``label`` column; read only when
            the names have to be rebuilt.

    Returns:
        Dict mapping integer class ids to label names.

    Raises:
        ValueError: If the mapping had to be rebuilt and its size differs
            from ``model.config.num_labels`` (when that attribute exists).
    """
    saved = getattr(model.config, "id2label", None)
    if saved:
        # Ensure integer keys regardless of how the config stored them.
        id2label = {int(k): v for k, v in dict(saved).items()}
        if not all(str(v).startswith("LABEL_") for v in id2label.values()):
            return id2label

    # Single rebuild path for both "missing" and "generic" saved mappings.
    train_df = pd.read_parquet(train_parquet)
    labels = sorted(train_df["label"].dropna().unique().tolist())
    # Sort on the raw values first so the id order is unchanged, then
    # normalise names the same way the evaluation code normalises true labels
    # (str + strip); otherwise predicted and true names can never compare equal.
    id2label = {i: str(lbl).strip() for i, lbl in enumerate(labels)}

    expected = getattr(model.config, "num_labels", None)
    if expected is not None and expected != len(id2label):
        raise ValueError(
            f"Label count mismatch: model has {expected} outputs but "
            f"{train_parquet} yields {len(id2label)} label names."
        )
    return id2label

def infer_predictions_df(sample_n=50, batch_size=64, max_length=256):
    """Run batched inference on the test split and tabulate the predictions.

    Args:
        sample_n: If truthy and smaller than the number of labelled test rows,
            score a random sample of this size (``random_state=42``) instead
            of every row.
        batch_size: Number of texts tokenized and scored per forward pass.
        max_length: ``max_length`` passed to the tokenizer (with truncation).

    Returns:
        ``(pred_df, (probs, true_labels, pred_labels, id2label))``.
        ``pred_df`` has one row per scored text with the columns
        ``text_snippet``, ``true_label``, ``pred_label``, ``confidence`` and
        ``top3``. ``probs`` is the softmax output as a NumPy array and the two
        label lists are aligned with its rows. The same structure is returned
        when there is nothing to score (empty frame, ``(0, n_labels)`` array,
        empty lists) so callers can always unpack the second element.
    """
    tok, model = load_inference_bundle()
    id2label = resolve_id2label(model)
    test_df = pd.read_parquet(TEST_PARQUET)
    # Rows without a label cannot be scored against a ground truth.
    test_df = test_df[test_df["label"].notna()].copy()
    # Optional sub-sample for a quicker view.
    if sample_n and sample_n < len(test_df):
        test_df = test_df.sample(sample_n, random_state=42)

    texts = test_df["text"].tolist()
    true_labels = test_df["label"].astype(str).str.strip().tolist()
    columns = ["text_snippet", "true_label", "pred_label", "confidence", "top3"]

    if not texts:
        # Keep the return shape identical to the non-empty path; the previous
        # (DataFrame, DataFrame) return broke the caller's 4-way unpacking.
        empty_probs = np.zeros((0, len(id2label)))
        return pd.DataFrame(columns=columns), (empty_probs, [], [], id2label)

    # Batched inference without gradient tracking.
    logits_all = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            enc = tok(batch, truncation=True, padding=True,
                      max_length=max_length, return_tensors="pt")
            logits_all.append(model(**enc).logits.cpu())

    probs = softmax(torch.cat(logits_all, dim=0), dim=1).numpy()
    pred_labels = [id2label[int(i)] for i in probs.argmax(axis=1)]

    # Assemble per-row predictions.
    rows = []
    for text, truth, pred, row in zip(texts, true_labels, pred_labels, probs):
        top_idx = row.argsort()[-3:][::-1]  # three most probable classes, best first
        snippet = (text or "")[:200].replace("\n", " ")
        if text and len(text) > 200:
            snippet += "..."
        rows.append({
            "text_snippet": snippet,
            "true_label": truth,
            "pred_label": pred,
            "confidence": float(row.max()),
            "top3": [(id2label[int(j)], float(row[j])) for j in top_idx],
        })
    pred_df = pd.DataFrame(rows)
    return pred_df, (probs, true_labels, pred_labels, id2label)

def metrics_df_from_outputs(probs, true_labels, pred_labels, id2label):
    """Summarise predictions as an overall-metrics frame and a per-class report.

    Args:
        probs: Per-row class probabilities; ``probs[i]`` must support
            ``argsort`` (a 2-D NumPy array in this notebook).
        true_labels: True label names, one per row.
        pred_labels: Predicted label names, aligned with ``true_labels``.
        id2label: Mapping from class index to label name.

    Returns:
        ``(overall_df, report_df)``. ``overall_df`` has ``metric``/``value``
        rows for accuracy, top3 and top5 (rounded to 4 decimals).
        ``report_df`` is sklearn's classification report as a frame whose
        first column is ``label_or_avg``. For empty input all three metrics
        are 0.0 and ``report_df`` is an empty frame with the report columns;
        sklearn is not called in that case.
    """
    def _overall(acc, top3, top5):
        return pd.DataFrame([{"metric": "accuracy", "value": round(acc, 4)},
                             {"metric": "top3", "value": round(top3, 4)},
                             {"metric": "top5", "value": round(top5, 4)}])

    n_rows = len(true_labels)
    if n_rows == 0:
        # np.mean([]) would yield NaN with a RuntimeWarning; report zeros,
        # matching the existing top-k convention for empty input.
        report_columns = ["label_or_avg", "precision", "recall", "f1-score", "support"]
        return _overall(0.0, 0.0, 0.0), pd.DataFrame(columns=report_columns)

    # Accuracy and top-k on strings.
    acc = float(np.mean([p == t for p, t in zip(pred_labels, true_labels)]))

    # Rank each row once and reuse the ranking for both top-3 and top-5.
    hits3, hits5 = [], []
    for i, truth in enumerate(true_labels):
        ranked = [id2label[int(j)] for j in probs[i].argsort()[::-1][:5]]
        hits3.append(truth in ranked[:3])
        hits5.append(truth in ranked)
    top3 = float(np.mean(hits3))
    top5 = float(np.mean(hits5))

    # Classification report to DataFrame.
    report = classification_report(true_labels, pred_labels, zero_division=0, output_dict=True)
    report_df = pd.DataFrame(report).T.reset_index().rename(columns={"index": "label_or_avg"})
    return _overall(acc, top3, top5), report_df

# Run once and display clean DataFrames
# Score a 200-row sample; pass a larger sample_n to cover more rows.
pred_df, bundle = infer_predictions_df(sample_n=200)
probs, y_true_lbls, y_pred_lbls, id2label = bundle
overall_df, report_df = metrics_df_from_outputs(
    probs=probs,
    true_labels=y_true_lbls,
    pred_labels=y_pred_lbls,
    id2label=id2label,
)

# Print the frames here; the to_csv lines below persist them instead.
print("Overall metrics:\n", overall_df)
print("\nPer‑class report (head):\n", report_df.head(12))
print("\nPredictions (head):\n", pred_df.head(12))
# pred_df.to_csv("artifacts/predictions_sample.csv", index=False)
# overall_df.to_csv("artifacts/metrics_overall.csv", index=False)
# report_df.to_csv("artifacts/metrics_report.csv", index=False)


Overall metrics:
      metric  value
0  accuracy  0.905
1      top3  0.955
2      top5  0.970

Per‑class report (head):
                  label_or_avg  precision    recall  f1-score  support
0                  Accountant   1.000000  1.000000  1.000000      8.0
1                    Advocate   0.800000  1.000000  0.888889      4.0
2                 Agriculture   0.750000  0.600000  0.666667      5.0
3                     Apparel   0.600000  1.000000  0.750000      3.0
4                Architecture   0.833333  1.000000  0.909091      5.0
5                        Arts   0.857143  1.000000  0.923077      6.0
6                  Automobile   0.750000  0.500000  0.600000      6.0
7                    Aviation   1.000000  1.000000  1.000000      5.0
8                         BPO   0.666667  0.500000  0.571429      4.0
9                     Banking   1.000000  1.000000  1.000000      5.0
10  Building and Construction   1.000000  0.714286  0.833333      7.0
11           Business Analyst   1.00000

In [9]:
# eval_inference_string_labels_fixed.py
# Use saved model; rebuild id2label if config has generic "LABEL_x" names.

import numpy as np, pandas as pd, torch
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from sklearn.metrics import classification_report

# Saved model directory and the parquet splits used by this cell.
FINAL_DIR = "artifacts/distilbert_resume_cls_final"
TEST_PARQUET = "../data/processed/classification_test.parquet"
TRAIN_PARQUET = "../data/processed/classification_train.parquet"

# 1) Load config, tokenizer and model; the model is put into eval mode
cfg = AutoConfig.from_pretrained(FINAL_DIR)
tok = AutoTokenizer.from_pretrained(FINAL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(FINAL_DIR, config=cfg)
model.eval()

# 2) Build id2label
def load_id2label_from_config(cfg):
    """Return ``cfg.id2label`` with integer keys, or ``None`` if absent or empty."""
    raw = getattr(cfg, "id2label", None)
    if not (raw and len(raw) > 0):
        return None
    return {int(idx): name for idx, name in dict(raw).items()}

# Start from the mapping stored in the loaded config (None when it has none).
id2label = load_id2label_from_config(cfg)

# If mapping is missing or generic (e.g., all start with "LABEL_"), rebuild from training labels
def is_generic(mapping):
    """Return True when every name in a non-empty ``mapping`` starts with ``LABEL_``.

    Always returns a real ``bool``: an empty or ``None`` mapping yields
    ``False`` rather than the falsy mapping object itself.
    """
    return bool(mapping) and all(str(v).startswith("LABEL_") for v in mapping.values())

if id2label is None or is_generic(id2label):
    # Ids follow the sorted order of the training labels (LabelEncoder uses sorted order)
    train_df = pd.read_parquet(TRAIN_PARQUET)
    labels = train_df["label"].dropna().unique().tolist()
    labels.sort()
    id2label = dict(enumerate(labels))
# Reverse lookup: label name -> class id
label2id = {name: idx for idx, name in id2label.items()}

# 3) Load and clean test data
test_df = pd.read_parquet(TEST_PARQUET)
# Keep only rows that carry a label, then normalise labels to stripped strings
has_label = test_df["label"].notna()
test_df = test_df.loc[has_label].copy()
label_strings = test_df["label"].astype(str)
y_true_labels = label_strings.str.strip().tolist()

# 4) Batched inference
def predict_logits(texts, batch_size=64, max_length=256):
    """Return the model's logits for ``texts``, computed in batches.

    Uses the module-level ``tok``, ``model`` and ``id2label``.

    Args:
        texts: Any iterable of strings (pandas Series, list, ...). It is
            materialised into a list once, so batching no longer depends on
            pandas slice semantics or on the slice having ``.tolist()``.
        batch_size: Number of texts per forward pass.
        max_length: ``max_length`` passed to the tokenizer (with truncation).

    Returns:
        A tensor with one row of logits per text; for empty input, an empty
        tensor of shape ``(0, len(id2label))``.
    """
    texts = list(texts)
    outs = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            enc = tok(batch, truncation=True, padding=True,
                      max_length=max_length, return_tensors="pt")
            outs.append(model(**enc).logits.cpu())
    if not outs:
        return torch.empty((0, len(id2label)))
    return torch.cat(outs, dim=0)

logits = predict_logits(test_df["text"], batch_size=64, max_length=256)

# The model's output width must match the number of label names
num_model_labels = logits.size(1)
if len(id2label) != num_model_labels:
    raise ValueError(f"Label count mismatch: model has {num_model_labels} outputs but id2label has {len(id2label)} names.")

# Probabilities, arg-max class ids and their label names
probs = softmax(logits, dim=1).numpy()
pred_ids = np.argmax(probs, axis=1)
y_pred_labels = [id2label[int(class_id)] for class_id in pred_ids]

# 5) Metrics
matches = [pred == truth for pred, truth in zip(y_pred_labels, y_true_labels)]
acc = float(np.mean(matches))


def topk_hit(i, k=3):
    """Return True if row ``i``'s true label is among its ``k`` most probable labels."""
    best = np.argsort(probs[i])[-k:]
    candidates = [id2label[int(j)] for j in best]
    return y_true_labels[i] in candidates


n_eval = len(y_true_labels)
top3 = float(np.mean([topk_hit(row, 3) for row in range(n_eval)]))
top5 = float(np.mean([topk_hit(row, 5) for row in range(n_eval)]))
summary = {"accuracy": round(acc, 4), "top3": round(top3, 4), "top5": round(top5, 4)}
print(summary)

# 6) Per-class report; both sides are string labels
print(classification_report(y_true_labels, y_pred_labels, zero_division=0))

# 7) Compact predictions DataFrame
rows = []
sample_n = min(20, len(test_df))
# Pull the sampled texts once instead of calling .iloc[i] repeatedly.
sample_texts = test_df["text"].iloc[:sample_n].tolist()
for i, text in enumerate(sample_texts):
    # Treat a missing text as empty for both the snippet and the length check;
    # the previous len(...) call raised on None despite the `or ""` guard.
    text = text or ""
    idxs = probs[i].argsort()[-3:][::-1]  # three most probable classes, best first
    rows.append({
        "text_snippet": text[:200].replace("\n", " ") + ("..." if len(text) > 200 else ""),
        "true_label": y_true_labels[i],
        "pred_label": y_pred_labels[i],
        "confidence": float(probs[i].max()),
        "top3": [(id2label[int(j)], float(probs[i][j])) for j in idxs]
    })
pred_df = pd.DataFrame(rows)
print("\nSample predictions (first 20):")
print(pred_df.to_string(index=False))


{'accuracy': 0.885, 'top3': 0.9459, 'top5': 0.9638}
                           precision    recall  f1-score   support

               Accountant       0.93      0.96      0.94        70
                 Advocate       0.92      0.95      0.93        58
              Agriculture       0.84      0.88      0.86        59
                  Apparel       0.90      0.83      0.86        64
             Architecture       0.92      0.71      0.80        69
                     Arts       0.91      0.92      0.92        66
               Automobile       0.77      0.55      0.64        62
                 Aviation       0.99      0.97      0.98        68
                      BPO       0.63      0.66      0.64        41
                  Banking       0.92      0.94      0.93        63
               Blockchain       1.00      1.00      1.00         9
Building and Construction       0.83      0.86      0.84        69
         Business Analyst       0.94      0.88      0.91        68
         