In [1]:
# ================================================
# Baseline: TF-IDF + Logistic Regression
# ================================================
from pathlib import Path
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import json

# ---- config ----
# Folder that holds the cpc_cls_{train,val,test}.csv splits read by this cell.
CSV_DIR: Path = Path("../data/cpc_cls")
# JSON file that receives this baseline's evaluation metrics.
OUT_METRICS: Path = Path("../artifacts/baseline_tfidf_logreg.json")
# Vocabulary cap handed to the TF-IDF vectorizer; lower it if memory is tight.
MAX_FEATURES: int = 50_000

# ---- load data ----
def _read_split(filename):
    """Read one split CSV located under CSV_DIR into a DataFrame."""
    return pd.read_csv(CSV_DIR / filename)

train_df = _read_split("cpc_cls_train.csv")
val_df = _read_split("cpc_cls_val.csv")
test_df = _read_split("cpc_cls_test.csv")

# ---- vectorize ----
# The unigram+bigram vocabulary is fitted on the training text only; the
# validation and test text are transformed with that already-fitted vectorizer.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=MAX_FEATURES)
X_train = tfidf.fit_transform(train_df["text"])
X_val = tfidf.transform(val_df["text"])
X_test = tfidf.transform(test_df["text"])

# ---- train model ----
# Fit logistic regression on the training features and the "label" column.
clf = LogisticRegression(n_jobs=-1, max_iter=500)
clf.fit(X_train, train_df["label"])

# ---- evaluate ----
def eval_split(X, y_true):
    """Score the module-level ``clf`` on one data split.

    Args:
        X: Feature matrix handed to ``clf.predict``.
        y_true: Reference labels for the rows of ``X``.

    Returns:
        dict with the keys ``accuracy``, ``macro_f1`` and ``weighted_f1``.
    """
    y_pred = clf.predict(X)
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    weighted_f1 = f1_score(y_true, y_pred, average="weighted")
    return {"accuracy": accuracy, "macro_f1": macro_f1, "weighted_f1": weighted_f1}

# Score the validation split first, then the test split.
metrics = {
    split: eval_split(features, frame["label"])
    for split, features, frame in (("val", X_val, val_df), ("test", X_test, test_df))
}

# ---- save ----
OUT_METRICS.parent.mkdir(parents=True, exist_ok=True)
# Serialise once so the file on disk and the printed report are the same text.
report = json.dumps(metrics, indent=2)
OUT_METRICS.write_text(report)
print(report)


{
  "val": {
    "accuracy": 0.7927686332531904,
    "macro_f1": 0.7466847279673036,
    "weighted_f1": 0.79089364393868
  },
  "test": {
    "accuracy": 0.7968374329572776,
    "macro_f1": 0.7604504286491491,
    "weighted_f1": 0.7958749623320344
  }
}


In [2]:
# ================================================
# Baseline: DistilRoBERTa fine-tune on CPC (A–H)
# ================================================
import json
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from transformers import (DistilBertTokenizerFast, RobertaTokenizerFast,
                          AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

# ---- config ----
CSV_DIR: Path = Path("../data/cpc_cls")  # folder with the cpc_cls_*.csv splits
MODEL_NAME: str = "distilroberta-base"  # checkpoint id passed to from_pretrained
OUT_METRICS: Path = Path("../artifacts/baseline_distilroberta.json")
MAX_LEN: int = 128  # max_length used when tokenizing with truncation
BATCH_SIZE: int = 64  # per-device batch size for both training and evaluation
EPOCHS: int = 3
LR: float = 2e-5

# Label letters plus the two lookup tables between letters and integer ids.
LABELS: list = list("ABCDEFGH")
id2label: dict = dict(enumerate(LABELS))
label2id: dict = {letter: idx for idx, letter in id2label.items()}

# ---- load CSVs into HF datasets ----
def load_split(name):
    """Load one CSV split as a Hugging Face ``Dataset`` with integer labels.

    Reads ``CSV_DIR / f"cpc_cls_{name}.csv"``, maps the ``label`` column
    through ``label2id`` into a new ``labels`` column, and keeps only the
    ``text`` and ``labels`` columns.

    Args:
        name: Split suffix used in the file name (e.g. ``"train"``).

    Returns:
        A ``Dataset`` built from the ``text`` and ``labels`` columns, without
        the pandas index.

    Raises:
        ValueError: If any row's ``label`` is missing or is not a key of
            ``label2id``. ``Series.map`` would otherwise turn such rows into
            NaN and silently upcast the whole column to float.
    """
    df = pd.read_csv(CSV_DIR / f"cpc_cls_{name}.csv")
    label_ids = df["label"].map(label2id)

    # Fail fast on labels the mapping does not know instead of passing NaN on.
    unmapped = label_ids.isna()
    if unmapped.any():
        bad_values = sorted(df.loc[unmapped, "label"].astype(str).unique())
        raise ValueError(
            f"{int(unmapped.sum())} row(s) in split {name!r} have labels outside "
            f"{sorted(label2id)}: {bad_values[:10]}"
        )

    # Pin an integer dtype so the column always holds class ids, never floats.
    df["labels"] = label_ids.astype("int64")
    return Dataset.from_pandas(df[["text", "labels"]], preserve_index=False)

# Load the three splits in order; the CSV suffix "val" is stored under the
# "validation" key.
splits = {
    key: load_split(suffix)
    for key, suffix in (("train", "train"), ("validation", "val"), ("test", "test"))
}
ds = DatasetDict(splits)

# ---- tokenize ----
tok = AutoTokenizer.from_pretrained(MODEL_NAME)

def enc(batch):
    """Tokenize the ``text`` field of a batch, truncating at ``MAX_LEN``."""
    texts = batch["text"]
    return tok(texts, max_length=MAX_LEN, truncation=True)

# Apply the tokenizer to every split in batched mode.
ds = ds.map(enc, batched=True)

# ---- model ----
# One output class per entry in LABELS; both label lookup tables are passed
# through to from_pretrained alongside the checkpoint name.
model_kwargs = dict(num_labels=len(LABELS), label2id=label2id, id2label=id2label)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **model_kwargs)

# ---- metrics ----
def compute_metrics(p):
    """Compute accuracy and F1 scores from a prediction object.

    Args:
        p: Object exposing ``predictions`` (scores whose argmax over axis 1 is
            taken as the predicted class) and ``label_ids`` (reference ids).

    Returns:
        dict with the keys ``accuracy``, ``macro_f1`` and ``weighted_f1``.
    """
    y_pred = np.argmax(p.predictions, axis=1)
    y_true = p.label_ids
    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    weighted_f1 = f1_score(y_true, y_pred, average="weighted")
    return {"accuracy": accuracy, "macro_f1": macro_f1, "weighted_f1": weighted_f1}

# ---- train ----
# Evaluation, logging and checkpointing all happen once per epoch, and
# load_best_model_at_end is keyed on the "macro_f1" value returned by
# compute_metrics (higher is better).
args = TrainingArguments(
    output_dir="./tmp",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    num_train_epochs=EPOCHS,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    report_to="none",
    # Hard-coding fp16=True makes this cell fail on machines without a GPU.
    # Enable mixed precision only when CUDA is available so the notebook
    # still runs (slowly) on CPU-only hosts.
    # NOTE(review): other accelerators that support fp16 now fall back to
    # full precision; confirm that is acceptable.
    fp16=torch.cuda.is_available(),
)

# Wire the model, the training arguments, the tokenizer and the metric
# function into the Trainer; the "train" split is used for training and the
# "validation" split for evaluation.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    processing_class=tok,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

trainer.train()

# ---- evaluate & save ----
# Evaluate the validation split first, then the test split.
metrics = {
    split_key: trainer.evaluate(ds[ds_key])
    for split_key, ds_key in (("val", "validation"), ("test", "test"))
}

OUT_METRICS.parent.mkdir(parents=True, exist_ok=True)
# Serialise once so the file on disk and the printed report are the same text.
report = json.dumps(metrics, indent=2)
OUT_METRICS.write_text(report)
print(report)


Map:   0%|          | 0/194656 [00:00<?, ? examples/s]

Map:   0%|          | 0/10814 [00:00<?, ? examples/s]

Map:   0%|          | 0/10814 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 