In [1]:
from datasets import load_dataset

ds = load_dataset("google/civil_comments")
label_cols = [
    "toxicity",
    "severe_toxicity",
    "obscene",
    "threat",
    "insult",
    "identity_attack",
    "sexual_explicit",
]


In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

splits = list(ds.keys())
train_ds = ds["train"] if "train" in ds else ds[splits[0]]
val_ds = (
    ds["validation"] if "validation" in ds else (ds["test"] if "test" in ds else None)
)
test_ds = ds["test"] if ("test" in ds and val_ds is not ds.get("test")) else None

# # only 10k samples for faster experimentation
# train_ds = train_ds.select(range(100000))
# if val_ds is not None:
#     val_ds = val_ds.select(range(1000))
# if test_ds is not None:
#     test_ds = test_ds.select(range(1000))

# Prepare texts for vectorization
texts_train = np.array(train_ds["text"])
if val_ds is None:
    base_strat = (np.array(train_ds[label_cols[0]]) >= 0.5).astype(int)
    idx_train, idx_val = train_test_split(
        np.arange(len(texts_train)), test_size=0.2, random_state=42, stratify=base_strat
    )
    tr_texts = texts_train[idx_train]
    val_texts = texts_train[idx_val]
    use_split_indices = (idx_train, idx_val)
else:
    tr_texts = texts_train
    val_texts = np.array(val_ds["text"])
    use_split_indices = None

test_texts = np.array(test_ds["text"]) if test_ds is not None else None


In [3]:
# Vectorize text once and reuse for all labels
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=100_000,
    min_df=2,
    stop_words="english",
    dtype=np.float32,
)
X_train = tfidf_vectorizer.fit_transform(tr_texts)
X_val = tfidf_vectorizer.transform(val_texts)
X_test = tfidf_vectorizer.transform(test_texts) if test_texts is not None else None


def safe_auc(y_true, y_score):
    y_true = np.asarray(y_true)
    if len(np.unique(y_true)) < 2:
        return float("nan")
    return roc_auc_score(y_true, y_score)


# Train one model per label
logreg_models = {}
metrics_rows = []
print(
    f"Vectorized: train={X_train.shape}, val={X_val.shape}"
    + (f", test={X_test.shape}" if X_test is not None else "")
)

for label in label_cols:
    # Building binary targets (threshold at 0.5)
    y_full_train = np.array(train_ds[label])
    if use_split_indices is None:
        y_train = (y_full_train >= 0.4).astype(int)
        y_val = (np.array(val_ds[label]) >= 0.4).astype(int)
    else:
        idx_tr, idx_v = use_split_indices
        y_train = (y_full_train[idx_tr] >= 0.4).astype(int)
        y_val = (y_full_train[idx_v] >= 0.4).astype(int)

    y_test = (
        (np.array(test_ds[label]) >= 0.5).astype(int) if X_test is not None else None
    )

    # Logistic Regression model
    clf = LogisticRegression(
        solver="liblinear",
        class_weight="balanced",
        max_iter=1000,
    )
    clf.fit(X_train, y_train)
    logreg_models[label] = clf

    # Validation metrics
    val_scores = clf.decision_function(X_val)
    val_pred = (val_scores >= 0).astype(int)
    val_auc = safe_auc(y_val, val_scores)
    val_f1 = f1_score(y_val, val_pred)
    val_acc = accuracy_score(y_val, val_pred)

    msg = f"{label}: val_auc={val_auc:.3f} val_f1={val_f1:.3f} val_acc={val_acc:.3f}"

    row = {
        "label": label,
        "val_auc": float(val_auc) if not np.isnan(val_auc) else np.nan,
        "val_f1": float(val_f1),
        "val_acc": float(val_acc),
    }

    # Test metrics
    if X_test is not None:
        test_scores = clf.decision_function(X_test)
        test_pred = (test_scores >= 0).astype(int)
        test_auc = safe_auc(y_test, test_scores)
        test_f1 = f1_score(y_test, test_pred)
        test_acc = accuracy_score(y_test, test_pred)
        msg += (
            f" | test_auc={test_auc:.3f} test_f1={test_f1:.3f} test_acc={test_acc:.3f}"
        )

        row.update(
            {
                "test_auc": float(test_auc) if not np.isnan(test_auc) else np.nan,
                "test_f1": float(test_f1),
                "test_acc": float(test_acc),
            }
        )
    else:
        row.update(
            {
                "test_auc": np.nan,
                "test_f1": np.nan,
                "test_acc": np.nan,
            }
        )

    metrics_rows.append(row)
    print(msg)


Vectorized: train=(1804874, 100000), val=(97320, 100000), test=(97320, 100000)
toxicity: val_auc=0.938 val_f1=0.625 val_acc=0.893 | test_auc=0.949 test_f1=0.543 test_acc=0.882
severe_toxicity: val_auc=nan val_f1=0.000 val_acc=1.000 | test_auc=nan test_f1=0.000 test_acc=1.000
obscene: val_auc=0.948 val_f1=0.380 val_acc=0.979 | test_auc=0.972 test_f1=0.320 test_acc=0.979
threat: val_auc=0.965 val_f1=0.195 val_acc=0.976 | test_auc=0.965 test_f1=0.120 test_acc=0.975
insult: val_auc=0.951 val_f1=0.620 val_acc=0.914 | test_auc=0.961 test_f1=0.523 test_acc=0.903
identity_attack: val_auc=0.962 val_f1=0.346 val_acc=0.960 | test_auc=0.971 test_f1=0.217 test_acc=0.956
sexual_explicit: val_auc=0.967 val_f1=0.319 val_acc=0.988 | test_auc=0.973 test_f1=0.245 test_acc=0.987


In [4]:
# Build metrics table
metrics_df = pd.DataFrame(
    metrics_rows,
    columns=[
        "label",
        "val_auc",
        "val_f1",
        "val_acc",
        "test_auc",
        "test_f1",
        "test_acc",
    ],
)

# Compute mean of numeric columns
avg_vals = metrics_df.drop(columns=["label"]).mean(numeric_only=True)
avg_row = {**{"label": "AVG"}, **avg_vals.to_dict()}
metrics_df = pd.concat([metrics_df, pd.DataFrame([avg_row])], ignore_index=True)

# Round for readability
for col in metrics_df.columns:
    if col != "label":
        metrics_df[col] = metrics_df[col].round(3)

metrics_df

Unnamed: 0,label,val_auc,val_f1,val_acc,test_auc,test_f1,test_acc
0,toxicity,0.938,0.625,0.893,0.949,0.543,0.882
1,severe_toxicity,,0.0,1.0,,0.0,1.0
2,obscene,0.948,0.38,0.979,0.972,0.32,0.979
3,threat,0.965,0.195,0.976,0.965,0.12,0.975
4,insult,0.951,0.62,0.914,0.961,0.523,0.903
5,identity_attack,0.962,0.346,0.96,0.971,0.217,0.956
6,sexual_explicit,0.967,0.319,0.988,0.973,0.245,0.987
7,AVG,0.955,0.355,0.958,0.965,0.281,0.955


In [5]:
import joblib
import os
 
# Create a directory to save the models if it doesn't exist
os.makedirs('inference/models', exist_ok=True)

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'inference/models/tfidf_vectorizer.joblib')

# Save each logistic regression model
for label, model in logreg_models.items():
    model_filename = f'inference/models/logreg_{label}.joblib'
    joblib.dump(model, model_filename)

print("Models and vectorizer saved in the 'models' directory")

Models and vectorizer saved in the 'models' directory


In [8]:
# CPU inference benchmarking on test set (LogReg one-vs-rest)
import time
import numpy as np
import pandas as pd


# Optional warm-up to avoid one-time overheads
_ = logreg_models[label_cols[0]].decision_function(X_test)

per_label_timings = []
t_total_start = time.perf_counter()
for label in label_cols:
    clf = logreg_models[label]
    t0 = time.perf_counter()
    _ = clf.decision_function(X_test)
    t1 = time.perf_counter()
    per_label_timings.append({
        "label": label,
        "test_infer_seconds": t1 - t0,
    })
t_total_end = time.perf_counter()

total_seconds = t_total_end - t_total_start
n_samples = X_test.shape[0]

# Build timing table
time_metrics_df = pd.DataFrame(per_label_timings)
time_metrics_df["per_sample_ms"] = (time_metrics_df["test_infer_seconds"] / n_samples) * 1000.0

# Append AVG row
avg_vals = time_metrics_df.drop(columns=["label"]).mean(numeric_only=True)
avg_row = {**{"label": "AVG"}, **avg_vals.to_dict()}
time_metrics_df = pd.concat([time_metrics_df, pd.DataFrame([avg_row])], ignore_index=True)

# Round for readability
time_metrics_df["test_infer_seconds"] = time_metrics_df["test_infer_seconds"].round(6)
time_metrics_df["per_sample_ms"] = time_metrics_df["per_sample_ms"].round(6)

# Summary
throughput = n_samples / total_seconds if total_seconds > 0 else float("inf")
per_sample_ms_all = (total_seconds / n_samples) * 1000.0

print("Benchmark (CPU) on test set:")
print(f" - samples: {n_samples}")
print(f" - total_inference_seconds_all_labels: {total_seconds:.6f}")
print(f" - throughput_samples_per_sec_all_labels: {throughput:.2f}")
print(f" - avg_per_sample_latency_ms_all_labels: {per_sample_ms_all:.6f}")

time_metrics_df

Benchmark (CPU) on test set:
 - samples: 97320
 - total_inference_seconds_all_labels: 0.070732
 - throughput_samples_per_sec_all_labels: 1375895.81
 - avg_per_sample_latency_ms_all_labels: 0.000727


Unnamed: 0,label,test_infer_seconds,per_sample_ms
0,toxicity,0.011617,0.000119
1,severe_toxicity,0.011108,0.000114
2,obscene,0.010488,0.000108
3,threat,0.009916,0.000102
4,insult,0.009777,0.0001
5,identity_attack,0.009187,9.4e-05
6,sexual_explicit,0.008437,8.7e-05
7,AVG,0.010076,0.000104
