In [6]:
from collections import Counter
from pathlib import Path
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline

import numpy as np
import random
import evaluate
import yaml

In [None]:
# !pip install transformers datasets accelerate evaluate scikit-learn

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.3.5-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading pyyaml-6.0.3-cp311-cp311-win_amd64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp311-cp311-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.5

In [None]:

def split_jsonl(input_file,
                train_file="train.jsonl",
                valid_file="valid.jsonl",
                test_file="test.jsonl",
                ratios=(0.8, 0.1, 0.1),
                seed=42):
    """
    Split a single JSONL file into train/valid/test files.
    ratios must sum to 1.0 (otherwise they'll be normalized).
    Returns a tuple with the counts (train, valid, test).
    """
    # normalize ratios
    total = sum(ratios)
    if total <= 0:
        raise ValueError("ratios must sum to a positive number")
    r = [x / total for x in ratios]

    p = Path(input_file)
    if not p.exists():
        raise FileNotFoundError(f"{input_file} not found")

    # read all non-empty lines (preserve original JSON lines)
    with p.open("r", encoding="utf-8") as f:
        lines = [ln.rstrip("\n") for ln in f if ln.strip()]

    rng = random.Random(seed)
    rng.shuffle(lines)
    print(lines[0])

    n = len(lines)
    n_train = int(n * r[0])
    n_valid = int(n * r[1])

    train_lines = lines[:n_train]
    valid_lines = lines[n_train:n_train + n_valid]
    test_lines = lines[n_train + n_valid:]

    # write out files (ensure trailing newline if non-empty)
    def write_lines(path, arr):
        path = Path(path)
        if arr:
            path.write_text("\n".join(arr) + "\n", encoding="utf-8")
        else:
            # create empty file
            path.write_text("", encoding="utf-8")

    write_lines(train_file, train_lines)
    write_lines(valid_file, valid_lines)
    write_lines(test_file, test_lines)

    return (len(train_lines), len(valid_lines), len(test_lines))

# Example usage:
# split_jsonl("dataset.jsonl", "train.jsonl", "valid.jsonl", "test.jsonl", ratios=(0.8,0.1,0.1), seed=42)

In [None]:

dataset = load_dataset("json", data_files={"train": "train.jsonl",
                                           "validation": "valid.jsonl",
                                           "test": "test.jsonl"})

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

def tokenize(batch):
    return tokenizer(batch["text"],
                     padding="max_length",
                     truncation=True,
                     max_length=512)

tokenized = dataset.map(tokenize, batched=True)
# tokenized = tokenized.rename_column("label", "labels")
tokenized.set_format("torch",
                     columns=["input_ids", "attention_mask", "label"])

In [4]:
# Build a deterministic mapping from string labels to integers and apply it to `tokenized`
unique_labels = set()
for split in tokenized:
    unique_labels.update(set(tokenized[split]["label"]))

label_list = sorted(unique_labels)  # deterministic order
label2id = {lab: i for i, lab in enumerate(label_list)}

def _map_label(example):
    lab = example["label"]
    # if already integer, keep as is
    if isinstance(lab, int):
        return example
    example["label"] = label2id[lab]
    return example

tokenized = tokenized.map(_map_label)

# ensure torch format (re-apply to be safe)
tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# update num_labels variable
num_labels = len(label2id)

print("label2id:", label2id)

id2label = {"LABEL_" + str(v): k for k, v in label2id.items()}
print(id2label)

label2id: {'artificial_intelligence': 0, 'computer_architecture': 1, 'computer_networks': 2, 'computer_vision': 3, 'databases': 4, 'machine_learning': 5, 'nlp': 6, 'prog_languages': 7, 'security': 8}
{'LABEL_0': 'artificial_intelligence', 'LABEL_1': 'computer_architecture', 'LABEL_2': 'computer_networks', 'LABEL_3': 'computer_vision', 'LABEL_4': 'databases', 'LABEL_5': 'machine_learning', 'LABEL_6': 'nlp', 'LABEL_7': 'prog_languages', 'LABEL_8': 'security'}


In [None]:
counts = Counter(dataset["train"]["label"])
print(counts)

Counter({'prog_languages': 15, 'nlp': 14, 'computer_networks': 14, 'security': 12, 'computer_vision': 12, 'artificial_intelligence': 11, 'machine_learning': 11, 'databases': 10, 'computer_architecture': 9})


In [17]:
batch = next(iter(tokenized["train"]))
for k, v in batch.items():
    print(k, v.dtype)

label torch.int64
input_ids torch.int64
attention_mask torch.int64


In [None]:

# num_labels = 1

model = AutoModelForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=num_labels
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

training_args = TrainingArguments(
    output_dir="model_out",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=50
)

In [None]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
)

In [26]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,1.746937
2,No log,1.498726
3,No log,1.601284
4,0.998200,1.331737
5,0.998200,1.300024




TrainOutput(global_step=70, training_loss=0.8461762428283691, metrics={'train_runtime': 1504.974, 'train_samples_per_second': 0.359, 'train_steps_per_second': 0.047, 'total_flos': 142088899645440.0, 'train_loss': 0.8461762428283691, 'epoch': 5.0})

In [27]:
trainer.evaluate(tokenized["test"])



{'eval_loss': 1.3104392290115356,
 'eval_runtime': 2.3601,
 'eval_samples_per_second': 5.932,
 'eval_steps_per_second': 0.424,
 'epoch': 5.0}

In [None]:

accuracy = evaluate.load("accuracy")

preds = trainer.predict(tokenized["test"])
y_pred = preds.predictions.argmax(-1)
y_true = preds.label_ids

print(y_pred)
print(y_true)

acc = accuracy.compute(predictions=y_pred, references=y_true)
print(acc)

f1 = evaluate.load("f1")
preds = trainer.predict(tokenized["test"])
f1_score = f1.compute(predictions=preds.predictions.argmax(-1),
                      references=preds.label_ids,
                      average="macro")
print(f1_score)

[8 6 0 0 3 6 1 1 2 6 4 4 0 2]
[1 0 0 0 3 5 1 1 2 3 4 4 0 1]
{'accuracy': 0.6428571428571429}




{'f1': 0.4821428571428571}


In [None]:


# Read YAML file
with open("test_CA.yaml", "r", encoding="utf8") as file:
    test_CA = yaml.safe_load(file)

clf = pipeline("text-classification",
               model="final_model",
               tokenizer="final_model",
               top_k=2)

tp = 0
fn = 0
fn_counter = Counter()
for paper in test_CA['papers']:
    text = paper['title'] + " " + paper['abstract']

    # print(clf(text))
    # print(sorted(clf(text)[0], key=lambda x: x["score"], reverse=True))
    
    correct_label = False
    preds = sorted(clf(text)[0], key=lambda x: x["score"], reverse=True)
    for pred in preds:
        if pred['label'] == "LABEL_1" :
            tp += 1
            correct_label = True
    if not correct_label:
        fn_counter[id2label[preds[0]['label']]] += 1
        fn += 1


print("Results on Computer Architecture Test Set:")
print(f"True Positives: {tp}")
print(f"False Negatives: {fn}")
print(fn_counter)

with open("test_AI.yaml", "r", encoding="utf8") as file:
    test_AI = yaml.safe_load(file)


tp = 0
fn = 0
fn_counter = Counter()
for paper in test_AI['papers']:
    text = paper['title'] + " " + paper['abstract']

    # print(clf(text))
    # print(sorted(clf(text)[0], key=lambda x: x["score"], reverse=True))
    
    correct_label = False
    preds = sorted(clf(text)[0], key=lambda x: x["score"], reverse=True)
    for pred in preds:
        if pred['label'] == "LABEL_0" :
            tp += 1
            correct_label = True
    if not correct_label:
        fn_counter[id2label[preds[0]['label']]] += 1
        fn += 1

print("Results on Artificial Intelligence Test Set:")
print(f"True Positives: {tp}")
print(f"False Negatives: {fn}")
print(fn_counter)
        
with open("test_CN.yaml", "r", encoding="utf8") as file:
    test_CN = yaml.safe_load(file)


tp = 0
fn = 0
fn_counter = Counter()
for paper in test_CN['papers']:
    text = paper['title'] + " " + paper['abstract']

    # print(clf(text))
    # print(sorted(clf(text)[0], key=lambda x: x["score"], reverse=True))
    
    correct_label = False
    preds = sorted(clf(text)[0], key=lambda x: x["score"], reverse=True)
    for pred in preds:
        if pred['label'] == "LABEL_0" :
            tp += 1
            correct_label = True
    if not correct_label:
        fn_counter[id2label[preds[0]['label']]] += 1
        fn += 1

print("Results on Computer Networks Test Set:")
print(f"True Positives: {tp}")
print(f"False Negatives: {fn}")
print(fn_counter)

    

Device set to use cpu


Results on Computer Architecture Test Set:
True Positives: 13
False Negatives: 3
Counter({'machine_learning': 3})
Results on Artificial Intelligence Test Set:
True Positives: 4
False Negatives: 15
Counter({'machine_learning': 5, 'computer_architecture': 4, 'nlp': 2, 'computer_vision': 2, 'security': 1, 'databases': 1})
Results on Computer Networks Test Set:
True Positives: 4
False Negatives: 14
Counter({'computer_networks': 5, 'machine_learning': 5, 'computer_architecture': 2, 'security': 1, 'databases': 1})


In [30]:
trainer.save_model("final_model")
tokenizer.save_pretrained("final_model")

SafetensorError: Error while serializing: I/O error: The requested operation cannot be performed on a file with a user-mapped section open. (os error 1224)