In [1]:
# Reference table: label name -> integer class id.
# The same mapping is printed as `label2id` further down, where it is built from the data.
{"artificial_intelligence": 0,
 "computer_architecture": 1, 
 "computer_networks": 2, 
 "computer_vision": 3, 
 "databases": 4, 
 "machine_learning": 5, 
 "nlp": 6, 
 "prog_languages": 7, 
 "security": 8}

{'artificial_intelligence': 0,
 'computer_architecture': 1,
 'computer_networks': 2,
 'computer_vision': 3,
 'databases': 4,
 'machine_learning': 5,
 'nlp': 6,
 'prog_languages': 7,
 'security': 8}

In [2]:
!pip install transformers datasets accelerate evaluate

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downl

In [4]:
from pathlib import Path
import random

def split_jsonl(input_file,
                train_file="train.jsonl",
                valid_file="valid.jsonl",
                test_file="test.jsonl",
                ratios=(0.8, 0.1, 0.1),
                seed=42):
    """
    Split a single JSONL file into train/valid/test files.

    Blank lines are skipped; every other line is copied verbatim (it is never
    parsed as JSON). Lines are shuffled with a private RNG seeded by `seed`,
    so the same input and seed always produce the same split.

    Args:
        input_file: path of the JSONL file to split.
        train_file, valid_file, test_file: output paths; they are overwritten.
        ratios: three non-negative weights (train, valid, test). They are
            normalized by their sum, so (8, 1, 1) behaves like (0.8, 0.1, 0.1).
        seed: seed for the shuffle.

    Returns:
        Tuple (n_train, n_valid, n_test). The train and valid sizes are
        truncated toward zero; the test split receives every remaining line.

    Raises:
        ValueError: if `ratios` does not hold exactly three values, contains a
            negative value, or does not sum to a positive number.
        FileNotFoundError: if `input_file` does not exist.
    """
    # Validate before touching the filesystem: a negative weight would turn into a
    # negative slice bound below and silently produce a wrong split.
    weights = tuple(ratios)
    if len(weights) != 3:
        raise ValueError("ratios must contain exactly three values (train, valid, test)")
    if any(w < 0 for w in weights):
        raise ValueError("ratios must be non-negative")
    total = sum(weights)
    if total <= 0:
        raise ValueError("ratios must sum to a positive number")
    train_frac = weights[0] / total
    valid_frac = weights[1] / total

    source = Path(input_file)
    if not source.exists():
        raise FileNotFoundError(f"{input_file} not found")

    # read all non-empty lines (preserve original JSON lines)
    with source.open("r", encoding="utf-8") as f:
        lines = [ln.rstrip("\n") for ln in f if ln.strip()]

    # A dedicated Random instance keeps the global random state untouched.
    random.Random(seed).shuffle(lines)

    n = len(lines)
    n_train = int(n * train_frac)
    n_valid = int(n * valid_frac)
    splits = (
        lines[:n_train],
        lines[n_train:n_train + n_valid],
        lines[n_train + n_valid:],  # test takes the remainder, so no line is lost
    )

    # One line per record with a trailing newline; an empty split yields an empty file.
    for out_path, chunk in zip((train_file, valid_file, test_file), splits):
        Path(out_path).write_text("".join(f"{ln}\n" for ln in chunk), encoding="utf-8")

    return tuple(len(chunk) for chunk in splits)

# Example usage: 80/10/10 split of dataset.jsonl with a fixed seed.
split_jsonl(
    "dataset.jsonl",
    train_file="train.jsonl",
    valid_file="valid.jsonl",
    test_file="test.jsonl",
    ratios=(0.8, 0.1, 0.1),
    seed=42,
)

(108, 13, 14)

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the three JSONL split files into a single dataset object keyed by split name.
dataset = load_dataset(
    "json",
    data_files=dict(
        train="train.jsonl",
        validation="valid.jsonl",
        test="test.jsonl",
    ),
)

Generating train split: 108 examples [00:00, 14595.93 examples/s]
Generating validation split: 13 examples [00:00, 2164.33 examples/s]
Generating test split: 14 examples [00:00, 2332.39 examples/s]


In [54]:
# Tokenizer for the SciBERT checkpoint; the classifier below is loaded from the same checkpoint.
SCIBERT_CHECKPOINT = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(SCIBERT_CHECKPOINT)

def tokenize(batch):
    """Encode the "text" field of a batch with the notebook-level `tokenizer`.

    Sequences are truncated and padded to a fixed length of 512 tokens, so
    every encoded example has the same length.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(batch["text"], **encoding_options)

# Tokenize every split in batches. The label column keeps the name "label"
# (the rename to "labels" is disabled).
tokenized = dataset.map(tokenize, batched=True)
# Return only the listed columns, formatted as torch tensors.
tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [55]:
# Build a deterministic mapping from string labels to integers and apply it to `tokenized`.
#
# The label set is read from the raw `dataset` rather than from `tokenized`: once the
# labels in `tokenized` have been converted, the torch format hands them back as tensors,
# so re-running this cell would otherwise build the mapping from tensor objects and fail.
unique_labels = set()
for split in dataset:
    unique_labels.update(dataset[split]["label"])

label_list = sorted(unique_labels)  # deterministic order
label2id = {lab: i for i, lab in enumerate(label_list)}

def _map_label(example):
    """Replace a string label with its integer id; leave any other value untouched."""
    lab = example["label"]
    # Only strings still need mapping. Already-converted labels arrive here as torch
    # tensors (not Python ints), so the guard tests for `str` to keep the cell re-runnable.
    if isinstance(lab, str):
        example["label"] = label2id[lab]
    return example

tokenized = tokenized.map(_map_label)

# ensure torch format (re-apply to be safe)
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# number of classes for the classification head
num_labels = len(label2id)

print("label2id:", label2id)

label2id: {'artificial_intelligence': 0, 'computer_architecture': 1, 'computer_networks': 2, 'computer_vision': 3, 'databases': 4, 'machine_learning': 5, 'nlp': 6, 'prog_languages': 7, 'security': 8}


In [74]:
from collections import Counter

# Class balance of the training split, counted on the raw (string) labels.
counts = Counter()
counts.update(dataset["train"]["label"])
print(counts)

Counter({'prog_languages': 15, 'nlp': 14, 'computer_networks': 14, 'security': 12, 'computer_vision': 12, 'artificial_intelligence': 11, 'machine_learning': 11, 'databases': 10, 'computer_architecture': 9})


In [57]:
# Sanity check: take the first formatted training example and show each field's dtype.
batch = next(iter(tokenized["train"]))
for field_name, field_value in batch.items():
    print(field_name, field_value.dtype)

label torch.int64
input_ids torch.int64
attention_mask torch.int64


In [58]:
from transformers import AutoModelForSequenceClassification

# Load SciBERT with a newly initialised classification head sized for our label set.
# Passing id2label/label2id stores the class names in the model config, so the saved
# model reports label names instead of generic LABEL_<n> placeholders.
id2label = {idx: name for name, idx in label2id.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="model_out",
    # optimisation schedule
    num_train_epochs=5,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    # batch sizes (per device)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint once per epoch; log the training loss every 50 steps
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
)

In [85]:
from transformers import Trainer

# Train on the train split and evaluate on the validation split;
# the test split is not passed to the Trainer here.
train_split = tokenized["train"]
validation_split = tokenized["validation"]

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

In [86]:
# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,1.607599
2,No log,1.437704
3,No log,1.324036
4,0.494100,1.263627
5,0.494100,1.261231




TrainOutput(global_step=70, training_loss=0.40813522338867186, metrics={'train_runtime': 706.3584, 'train_samples_per_second': 0.764, 'train_steps_per_second': 0.099, 'total_flos': 142088899645440.0, 'train_loss': 0.40813522338867186, 'epoch': 5.0})

In [87]:
# Evaluate on the test split (no metric function is configured, so the reported quality metric is the loss).
trainer.evaluate(eval_dataset=tokenized["test"])



{'eval_loss': 1.289337158203125,
 'eval_runtime': 2.6917,
 'eval_samples_per_second': 5.201,
 'eval_steps_per_second': 0.372,
 'epoch': 5.0}

In [88]:
import evaluate
# Score the test split with accuracy and macro-F1.
# Inference runs once; both metrics reuse the same predictions and references.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

preds = trainer.predict(tokenized["test"])
y_pred = preds.predictions.argmax(-1)  # class with the highest logit
y_true = preds.label_ids

acc = accuracy.compute(predictions=y_pred, references=y_true)
print(acc)

f1_score = f1.compute(predictions=y_pred,
                      references=y_true,
                      average="macro")
print(f1_score)

{'accuracy': 0.5}




{'f1': 0.425}


In [89]:
# Save the fine-tuned model and the tokenizer into the same folder.
FINAL_MODEL_DIR = "final_model"
trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

('final_model\\tokenizer_config.json',
 'final_model\\special_tokens_map.json',
 'final_model\\vocab.txt',
 'final_model\\added_tokens.json',
 'final_model\\tokenizer.json')