# Setup

In [1]:
import os
from pathlib import Path

import numpy as np, pandas as pd, joblib
from sentence_transformers import SentenceTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import clone
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, classification_report

In [2]:
# Wrap the encoder in a scikit-learn transformer so that it can be pickled with the rest of the pipeline
class SbertEncoder(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that embeds texts with a ``SentenceTransformer``.

    Parameters
    ----------
    model_name : str, default="all-MiniLM-L6-v2"
        Model identifier handed to ``SentenceTransformer``.
    batch_size : int, default=32
        Batch size handed to ``SentenceTransformer.encode``.
    device : str or None, default=None
        Device handed to ``SentenceTransformer``.

    Notes
    -----
    The underlying model is created lazily on the first ``transform`` call and
    cached on the instance. The cache is keyed on ``(model_name, device)`` so
    that changing either hyper-parameter afterwards (e.g. via ``set_params``)
    reloads the model instead of silently reusing the old one.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", batch_size=32, device=None):
        self.model_name = model_name
        self.batch_size = batch_size
        self.device = device
        # Private cache (not hyper-parameters, so ``clone`` does not copy them).
        self._model = None
        self._model_key = None

    def _m(self):
        """Return the cached model, (re)loading it when the configuration changed."""
        wanted = (self.model_name, self.device)
        cached = getattr(self, "_model", None)
        cached_for = getattr(self, "_model_key", None)
        if cached is not None and cached_for is None:
            # Instance created/pickled before the cache key existed: trust the
            # model it already carries rather than forcing a reload.
            cached_for = self._model_key = wanted
        if cached is None or cached_for != wanted:
            self._model = SentenceTransformer(self.model_name, device=self.device)
            self._model_key = wanted
        return self._model

    def fit(self, X, y=None):
        """No-op: the encoder has nothing to learn. Returns ``self``."""
        return self

    def transform(self, X):
        """Encode ``X`` and return the embeddings as a NumPy array.

        ``X`` may be any iterable of strings; it is materialised as a list so
        that containers with non-positional indexing (e.g. a filtered pandas
        Series) are encoded correctly. A single bare string is treated as one
        document, so the result has one row per document.
        """
        texts = [X] if isinstance(X, str) else list(X)
        return np.asarray(
            self._m().encode(texts, batch_size=self.batch_size,
                             show_progress_bar=False, convert_to_numpy=True)
        )



In [3]:
# --- Paths -----------------------------------------------------------------
DATA_PATH  = Path("../data/craft-ml-data.jsonl")
ART_DIR    = Path("artifacts")
# parents=True so the cell keeps working if ART_DIR is ever made a nested path.
ART_DIR.mkdir(parents=True, exist_ok=True)
PIPE_PATH  = ART_DIR / "pipeline.joblib"
MLB_PATH   = ART_DIR / "mlb.joblib"
# Matches the file the save cell actually writes (ART_DIR / "tau_vec.npy");
# the previous value ("tau.txt") was never written anywhere.
TAU_PATH   = ART_DIR / "tau_vec.npy"

# --- Experiment settings ----------------------------------------------------
RANDOM     = 42                           # seed shared by every splitter / estimator
CV_FOLDS   = 5                            # number of outer cross-validation folds
TAU_GRID   = np.linspace(0.1, 0.90, 20)   # candidate decision thresholds per class
WRITE = True                              # persist the fitted artifacts at the end
FILL_LABELS = False                       # True: replace empty label lists with ["none"]

# --- Library options --------------------------------------------------------
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option('display.max_colwidth', None)


 # Preprocessing

In [4]:
# Load the JSONL corpus and keep only rows that have both a text and a labels value.
df = pd.read_json(DATA_PATH, lines=True)
has_text_and_labels = df["text"].notna() & df["labels"].notna()
df = df.loc[has_text_and_labels].reset_index(drop=True)

# Optionally turn empty / non-list label values into an explicit "none" class.
if FILL_LABELS:
    df["labels"] = df["labels"].apply(
        lambda lbls: ["none"] if not (isinstance(lbls, list) and len(lbls) > 0) else lbls
    )

X_raw = df["text"].to_list()
y_raw = df["labels"].to_list()

# Binarise the label lists into a (documents x classes) indicator matrix.
mlb = MultiLabelBinarizer()
Y_all = mlb.fit_transform(y_raw)

# Documents with at least one positive label form the labelled set ...
mask = Y_all.sum(axis=1) > 0
X_lab = [doc for doc, keep in zip(X_raw, mask) if keep]
Y_lab = Y_all[mask]

# ... and the remaining documents form the unlabelled set.
unlab_mask = ~mask
X_unlab = [doc for doc, keep in zip(X_raw, unlab_mask) if keep]

print(f"{len(X_lab):,} labelled docs | {Y_lab.shape[1]} classes")

# Stratification key: the class index for single-label docs, -1 for all others.
strat_y = np.where(Y_lab.sum(axis=1) == 1, Y_lab.argmax(axis=1), -1)

261 labelled docs | 8 classes


In [5]:
# Build the pipeline: sentence embeddings -> feature scaling -> one-vs-rest logistic regression
base_pipe = Pipeline(steps=[
    ("sbert", SbertEncoder(batch_size=32)),
    # with_mean=False: scale each feature without centring it
    ("scale", StandardScaler(with_mean=False)),
    # one binary logistic regression per label
    ("clf", OneVsRestClassifier(
        estimator=LogisticRegression(C=0.1, max_iter=1000, random_state=RANDOM),
    )),
])


# Cross fold training

In [6]:

CLASS_COUNT  = Y_lab.shape[1]
VAL_FRAC     = 0.20
DEFAULT_TAU  = 0.5   # neutral fallback when no grid threshold yields a positive F1


def _tune_thresholds(y_true, proba, grid, default):
    """Pick, per class, the grid threshold with the best binary F1.

    Parameters
    ----------
    y_true : array of shape (n_docs, n_classes), 0/1 indicators.
    proba  : array of shape (n_docs, n_classes), predicted probabilities.
    grid   : 1-D array of candidate thresholds.
    default: threshold used for a class when every candidate scores F1 == 0
             (e.g. no positives in the validation split). Without this,
             ``argmax`` over an all-zero list would silently return the
             smallest -- i.e. most permissive -- threshold.

    Returns
    -------
    1-D float array with one threshold per class.
    """
    n_classes = y_true.shape[1]
    taus = np.full(n_classes, default, dtype=float)
    for c in range(n_classes):
        f1_by_t = [
            f1_score(y_true[:, c],
                     (proba[:, c] >= t).astype(int),
                     average="binary", zero_division=0)
            for t in grid
        ]
        best = int(np.argmax(f1_by_t))
        if f1_by_t[best] > 0:
            taus[c] = grid[best]
    return taus


skf = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM)

# Out-of-fold containers, one row per labelled document.
oof_proba = np.zeros_like(Y_lab, dtype=float)
oof_pred  = np.zeros_like(Y_lab, dtype=int)
oof_tau   = np.zeros_like(Y_lab, dtype=float)

taus_folds   = []
fold_metrics = []

for fold, (tr_idx, te_idx) in enumerate(skf.split(X_lab, strat_y)):

    # Fold level split
    X_tr_all  = [X_lab[i] for i in tr_idx]
    Y_tr_all  = Y_lab[tr_idx]
    X_te      = [X_lab[i] for i in te_idx]
    Y_te      = Y_lab[te_idx]

    # Within fold split for threshold tuning
    # (same stratification key as the outer split: class index for
    #  single-label docs, -1 otherwise)
    strat_inner = np.where(Y_tr_all.sum(1)==1,
                           Y_tr_all.argmax(1), -1)
    X_tr, X_val, Y_tr, Y_val = train_test_split(
        X_tr_all, Y_tr_all,
        test_size   = VAL_FRAC,
        stratify    = strat_inner,
        random_state= RANDOM)

    # Pipeline.fit returns self, so fitting base_pipe directly would make
    # pipe_inner, pipe and base_pipe the same mutable object. Clone so each
    # fit is an independent estimator and base_pipe stays an unfitted template.
    pipe_inner  = clone(base_pipe).fit(X_tr, Y_tr)
    proba_val   = pipe_inner.predict_proba(X_val)

    # learn best threshold per class
    best_taus_fold = _tune_thresholds(Y_val, proba_val, TAU_GRID, DEFAULT_TAU)

    # Fit on full-fold train with that vector
    pipe = clone(base_pipe).fit(X_tr_all, Y_tr_all)
    proba_te = pipe.predict_proba(X_te)
    Y_pred   = (proba_te >= best_taus_fold).astype(int)

    # Save Predictions
    oof_proba[te_idx] = proba_te
    oof_pred [te_idx] = Y_pred
    oof_tau  [te_idx] = best_taus_fold   # broadcast: same vector for every test row

    macro_f1 = f1_score(Y_te, Y_pred,
                        average="macro", zero_division=0)
    micro_f1 = f1_score(Y_te, Y_pred,
                        average="micro", zero_division=0)

    fold_metrics.append((macro_f1, micro_f1))
    taus_folds.append(best_taus_fold)

    print(f"fold {fold}:  macro-F1={macro_f1:.3f}  micro-F1={micro_f1:.3f}")


# Final per-class thresholds: mean of the per-fold thresholds.
taus_folds = np.stack(taus_folds)
tau_vec    = taus_folds.mean(0)
print("\nPer-class τ:", np.round(tau_vec, 2))

macro_cv, micro_cv = np.mean(fold_metrics, axis=0)
print(f"CV macro-F1={macro_cv:.3f}  micro-F1={micro_cv:.3f}")



fold 0:  macro-F1=0.626  micro-F1=0.621
fold 1:  macro-F1=0.536  micro-F1=0.552
fold 2:  macro-F1=0.479  micro-F1=0.522
fold 3:  macro-F1=0.578  micro-F1=0.590
fold 4:  macro-F1=0.572  micro-F1=0.565

Per-class τ: [0.25 0.15 0.18 0.46 0.19 0.13 0.13 0.4 ]
CV macro-F1=0.558  micro-F1=0.570


# Evaluation

In [7]:
# Out of fold evaluation
# zero_division=0 matches the f1_score calls and avoids the
# undefined-metric warning that classification_report emits by default.
print(classification_report(
        Y_lab,
        oof_pred,
        target_names=mlb.classes_,
        digits=3,
        zero_division=0
))

# Aggregate scores over all out-of-fold predictions (not the mean of per-fold scores).
macro_f1 = f1_score(Y_lab, oof_pred, average="macro", zero_division=0)
micro_f1 = f1_score(Y_lab, oof_pred, average="micro", zero_division=0)

print(f"\nMacro-F1: {macro_f1:.3f}   |   Micro-F1: {micro_f1:.3f}")

                                         precision    recall  f1-score   support

                epidemics_and_pandemics      0.528     0.559     0.543        34
                     financial_distress      0.373     0.564     0.449        39
                      financial_success      0.471     0.706     0.565        34
                    geopolitical_issues      0.655     0.528     0.585        36
                            investments      0.414     0.600     0.490        40
labor_workforce_compliance_human_rights      0.558     0.774     0.649        31
                      natural_disasters      0.809     0.927     0.864        41
                            supplychain      0.462     0.439     0.450        41

                              micro avg      0.519     0.635     0.571       296
                              macro avg      0.534     0.637     0.574       296
                           weighted avg      0.534     0.635     0.574       296
                          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Fit the pipeline on every labelled document.
final_pipe = base_pipe.fit(X_lab, Y_lab)

# Score the documents that carry no label, using the thresholds averaged over
# the CV folds. Skipped when FILL_LABELS is on, and guarded against an empty
# unlabelled set, which would otherwise fail while encoding an empty list and
# while dividing by n_unlab == 0.
if not FILL_LABELS and len(X_unlab) > 0:
    proba_unlab = final_pipe.predict_proba(X_unlab)
    pred_unlab  = (proba_unlab >= tau_vec).astype(int)

    n_unlab = pred_unlab.shape[0]
    hits_per_doc = pred_unlab.sum(1)
    n_predicted_any = int((hits_per_doc > 0).sum())
    print(f"{n_predicted_any}/{n_unlab} unlabelled docs "
          f"({n_predicted_any / n_unlab:.1%}) received at least one tag")
elif not FILL_LABELS:
    print("No unlabelled docs to score")


117/137 unlabelled docs (85.4%) received at least one tag


# Save

In [9]:
# Refit the pipeline on all labelled documents before persisting it.
final_pipe = base_pipe.fit(X_lab, Y_lab)

# This will only run on a fresh kernel. Run all cells at once for the model to be saved properly.
if WRITE:
    # Pipeline and label binarizer go through joblib with the same compression level.
    for artifact, target in ((final_pipe, PIPE_PATH), (mlb, MLB_PATH)):
        joblib.dump(artifact, target, compress=3)
    # Per-class thresholds are stored as a NumPy array next to them.
    np.save(ART_DIR / "tau_vec.npy", tau_vec)

# Notes
- The model is currently too "excited": it assigns tags too readily. We need to take steps to make classification more conservative. This is probably due to the imbalance of the data.
- More advanced methods would likely perform better here, but we do not have a large amount of data to use with those methods.
- More work on word embeddings could be done. Only TF-IDF and a sentence transformer have been tried so far.
- There may be some value in feature reduction here as well 
- Need to understand business context on "cost" of misclassification to inform how model is tuned and evaluated
- Approaching this as a document-similarity task instead of one-vs-all classification may be one way to handle the issues arising from the diverse unlabelled class.