In [None]:
# !pip install transformers datasets accelerate evaluate scikit-learn

In [None]:
from collections import Counter
from pathlib import Path
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline

import numpy as np
import random
import evaluate
import yaml

## Data Loading / Splitting

In [None]:

def split_jsonl(input_file,
                train_file="train.jsonl",
                valid_file="valid.jsonl",
                test_file="test.jsonl",
                ratios=(0.8, 0.1, 0.1),
                seed=42):
    """
    Shuffle the lines of a JSONL file and split them into train/valid/test files.

    Blank (whitespace-only) lines are dropped; every other line is copied
    verbatim -- the content is never parsed as JSON.

    Args:
        input_file: Path of the JSONL file to split.
        train_file: Output path for the training split.
        valid_file: Output path for the validation split.
        test_file: Output path for the test split.
        ratios: Relative sizes of the splits. They are normalized by their
            sum, so they need not add up to 1.0. Only the first two entries
            determine a size (rounded down); the test split receives every
            remaining line.
        seed: Seed for the shuffle, so the same input yields the same split.

    Returns:
        A tuple with the line counts ``(train, valid, test)``.

    Raises:
        ValueError: If the ratios do not sum to a positive number.
        FileNotFoundError: If ``input_file`` does not exist.
    """
    # normalize ratios
    total = sum(ratios)
    if total <= 0:
        raise ValueError("ratios must sum to a positive number")
    fractions = [x / total for x in ratios]

    source = Path(input_file)
    if not source.exists():
        raise FileNotFoundError(f"{input_file} not found")

    # read all non-empty lines (preserve original JSON lines)
    with source.open("r", encoding="utf-8") as f:
        lines = [ln.rstrip("\n") for ln in f if ln.strip()]

    rng = random.Random(seed)
    rng.shuffle(lines)
    # Preview one shuffled record; an input without any records must not
    # crash here, it simply produces three empty output files.
    if lines:
        print(lines[0])

    n = len(lines)
    n_train = int(n * fractions[0])
    n_valid = int(n * fractions[1])

    train_lines = lines[:n_train]
    valid_lines = lines[n_train:n_train + n_valid]
    test_lines = lines[n_train + n_valid:]

    def write_lines(path, arr):
        # One record per line, each newline-terminated; an empty split
        # yields an empty file.
        Path(path).write_text("".join(ln + "\n" for ln in arr), encoding="utf-8")

    write_lines(train_file, train_lines)
    write_lines(valid_file, valid_lines)
    write_lines(test_file, test_lines)

    return (len(train_lines), len(valid_lines), len(test_lines))

# Example usage:
# split_jsonl("dataset.jsonl", "train.jsonl", "valid.jsonl", "test.jsonl", ratios=(0.8,0.1,0.1), seed=42)

In [None]:

# Load the three JSONL files as the named splits of one dataset object.
dataset = load_dataset(
    "json",
    data_files=dict(
        train="train.jsonl",
        validation="valid.jsonl",
        test="test.jsonl",
    ),
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")


def tokenize(batch):
    """Tokenize the "text" field of a batch, padded/truncated to 512 tokens."""
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(batch["text"], **encode_options)


# Tokenize every split in batches.
tokenized = dataset.map(tokenize, batched=True)
# tokenized = tokenized.rename_column("label", "labels")

# Expose only the model inputs and the label, as torch tensors.
tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [1]:
# Build a deterministic mapping from string labels to integers and apply it to `tokenized`

# `tokenized` is in torch format at this point. In that format integer labels
# come back as tensors, which hash by identity: every example would look like
# its own class and the `isinstance(lab, int)` check below would never match.
# Switch back to plain Python objects while labels are inspected and rewritten;
# the torch format is restored further down.
tokenized.reset_format()

unique_labels = set()
for split in tokenized:
    unique_labels.update(tokenized[split]["label"])

label_list = sorted(unique_labels)  # deterministic order
label2id = {lab: i for i, lab in enumerate(label_list)}

def _map_label(example):
    """Replace the example's label with its integer id (integers pass through)."""
    lab = example["label"]
    # if already integer, keep as is
    if isinstance(lab, int):
        return example
    example["label"] = label2id[lab]
    return example

tokenized = tokenized.map(_map_label)

# restore torch format now that every label is an integer
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# update num_labels variable
num_labels = len(label2id)

print("label2id:", label2id)

# Lookup from "LABEL_<id>" strings back to the original label values
# (presumably the default label names the model reports at inference -- confirm).
id2label = {"LABEL_" + str(v): k for k, v in label2id.items()}
print(id2label)

NameError: name 'tokenized' is not defined

In [None]:
# Tally how often each label occurs in the training split.
counts = Counter(label for label in dataset["train"]["label"])
print(counts)

## Training

In [None]:


# Load the same checkpoint as the tokenizer, with a classification head
# sized to the number of distinct labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased", num_labels=num_labels
)

In [None]:

training_args = TrainingArguments(
    # where outputs go, and how often to evaluate / checkpoint / log
    output_dir="model_out",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    # schedule length and batch sizes
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # optimizer settings
    learning_rate=3e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
)

In [None]:

# Train on the "train" split; the "validation" split is the default
# evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized["validation"],
    train_dataset=tokenized["train"],
)

In [None]:
# Run training with the model, arguments and datasets given to `trainer`.
trainer.train()

## Testing / Evaluation

In [None]:
# Evaluate on the test split, passed explicitly here instead of the
# validation split that `trainer` was constructed with.
trainer.evaluate(tokenized["test"])

In [None]:

accuracy = evaluate.load("accuracy")

# Run inference over the test split once; accuracy and F1 below both reuse
# these predictions instead of repeating the full prediction pass.
preds = trainer.predict(tokenized["test"])
y_pred = preds.predictions.argmax(-1)  # highest-scoring class per example
y_true = preds.label_ids

print(y_pred)
print(y_true)

acc = accuracy.compute(predictions=y_pred, references=y_true)
print(acc)

# Macro-averaged F1 over the same predictions.
f1 = evaluate.load("f1")
f1_score = f1.compute(predictions=y_pred,
                      references=y_true,
                      average="macro")
print(f1_score)

In [None]:
# Save the model and the tokenizer into the same "final_model" directory.
trainer.save_model("final_model")
tokenizer.save_pretrained("final_model")