In [1]:
from collections import Counter
from pathlib import Path
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline

import numpy as np
import random
import evaluate
import yaml

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the three JSON-lines splits into a single dataset dictionary keyed by split name.
dataset = load_dataset(
    "json",
    data_files=dict(
        train="train.jsonl",
        validation="valid.jsonl",
        test="test.jsonl",
    ),
)

# Tokenizer that belongs to the SciBERT (scivocab, uncased) checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "allenai/scibert_scivocab_uncased"
)

def tokenize(batch):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Args:
        batch: Mapping whose ``"text"`` entry holds the text(s) to encode.

    Returns:
        Whatever the tokenizer returns for those texts, padded to and
        truncated at 512 tokens.
    """
    texts = batch["text"]
    # Fixed-length encoding: always pad up to, and cut off at, 512 tokens.
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(texts, **encode_options)

# Tokenize every split in batches using `tokenize`.
tokenized = dataset.map(tokenize, batched=True)
# tokenized = tokenized.rename_column("label", "labels")

# NOTE: the torch output format is intentionally NOT enabled at this point.
# Rows handed to `.map` follow the dataset's active format, so with the torch
# format switched on an integer label would reach `_map_label` as a tensor,
# fail its `isinstance(lab, int)` guard and be used as a dictionary key.
# The format is applied once, after the labels have been mapped.

# Build a deterministic mapping from string labels to integers and apply it to `tokenized`
unique_labels = set()
for split in tokenized:
    # Labels are gathered from every split so each class receives an id.
    unique_labels.update(tokenized[split]["label"])

label_list = sorted(unique_labels)  # deterministic order
label2id = {lab: i for i, lab in enumerate(label_list)}

def _map_label(example):
    lab = example["label"]
    # if already integer, keep as is
    if isinstance(lab, int):
        return example
    example["label"] = label2id[lab]
    return example

# Swap the string labels for their integer ids, one example at a time.
tokenized = tokenized.map(_map_label)

# Return torch tensors, restricted to the three listed columns.
tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

# Number of distinct classes found across the splits.
num_labels = len(label2id)

print("label2id:", label2id)

# Maps "LABEL_<id>" strings to the readable class name for that id.
id2label = {f"LABEL_{idx}": name for name, idx in label2id.items()}
print(id2label)

Generating train split: 108 examples [00:00, 1316.33 examples/s]
Generating validation split: 13 examples [00:00, 2166.91 examples/s]
Generating test split: 14 examples [00:00, 2333.32 examples/s]
Map: 100%|██████████| 108/108 [00:00<00:00, 1277.95 examples/s]
Map: 100%|██████████| 13/13 [00:00<00:00, 1444.43 examples/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 1555.59 examples/s]
Map: 100%|██████████| 108/108 [00:00<00:00, 4318.95 examples/s]
Map: 100%|██████████| 13/13 [00:00<00:00, 1624.29 examples/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 1750.34 examples/s]

label2id: {'artificial_intelligence': 0, 'computer_architecture': 1, 'computer_networks': 2, 'computer_vision': 3, 'databases': 4, 'machine_learning': 5, 'nlp': 6, 'prog_languages': 7, 'security': 8}
{'LABEL_0': 'artificial_intelligence', 'LABEL_1': 'computer_architecture', 'LABEL_2': 'computer_networks', 'LABEL_3': 'computer_vision', 'LABEL_4': 'databases', 'LABEL_5': 'machine_learning', 'LABEL_6': 'nlp', 'LABEL_7': 'prog_languages', 'LABEL_8': 'security'}





In [None]:
def load_test_set(path):
    """Parse a YAML test-set file and return its contents.

    Args:
        path: Path of the YAML file to read (decoded as UTF-8).

    Returns:
        The object produced by ``yaml.safe_load`` for that file.
    """
    with open(path, "r", encoding="utf8") as file:
        return yaml.safe_load(file)


def evaluate_top_k(classifier, papers, expected_label, label_names):
    """Count papers whose expected label is among the returned predictions.

    Args:
        classifier: Callable such that ``classifier(text)[0]`` is a list of
            ``{"label": ..., "score": ...}`` dictionaries.
        papers: Iterable of mappings with ``'title'`` and ``'abstract'`` strings.
        expected_label: Pipeline label (for example ``"LABEL_1"``) that counts
            as a hit.
        label_names: Mapping from pipeline label to readable class name; used
            to key the miss counter.

    Returns:
        A ``(hits, misses, miss_counter)`` tuple. ``miss_counter`` tallies, for
        every missed paper, the class name of its highest-scoring prediction.
    """
    hits = 0
    misses = 0
    miss_counter = Counter()
    for paper in papers:
        text = paper['title'] + " " + paper['abstract']
        preds = sorted(classifier(text)[0], key=lambda x: x["score"], reverse=True)
        # A paper is a hit when the expected label appears anywhere in the
        # returned predictions, not only in first place.
        if any(pred['label'] == expected_label for pred in preds):
            hits += 1
        else:
            miss_counter[label_names[preds[0]['label']]] += 1
            misses += 1
    return hits, misses, miss_counter


# top_k=2 makes the pipeline return the two best-scoring classes per text.
# Truncation mirrors the 512-token limit used when tokenizing the training
# data, so an over-long title + abstract is cut instead of overflowing the model.
clf = pipeline("text-classification",
               model="final_model",
               tokenizer="final_model",
               top_k=2,
               truncation=True,
               max_length=512)

# Test on CA papers
test_CA = load_test_set("test_CA.yaml")
# Test on AI papers
test_AI = load_test_set("test_AI.yaml")
# Test on CN papers
test_CN = load_test_set("test_CN.yaml")

# (title used in the report, loaded test set, class name as it appears in label2id)
test_sets = [
    ("Computer Architecture", test_CA, "computer_architecture"),
    ("Artificial Intelligence", test_AI, "artificial_intelligence"),
    ("Computer Networks", test_CN, "computer_networks"),
]

for display_name, test_set, class_name in test_sets:
    # Derive the pipeline label from label2id rather than hard-coding "LABEL_<n>".
    expected_label = f"LABEL_{label2id[class_name]}"
    tp, fn, fn_counter = evaluate_top_k(clf, test_set['papers'], expected_label, id2label)

    print(f"Results on {display_name} Test Set:")
    print(f"True Positives: {tp}")
    print(f"False Negatives: {fn}")
    print(fn_counter)

    

Device set to use cpu


Results on Computer Architecture Test Set:
True Positives: 13
False Negatives: 3
Counter({'machine_learning': 3})
Results on Artificial Intelligence Test Set:
True Positives: 4
False Negatives: 15
Counter({'machine_learning': 5, 'computer_architecture': 4, 'nlp': 2, 'computer_vision': 2, 'security': 1, 'databases': 1})
Results on Computer Networks Test Set:
True Positives: 9
False Negatives: 9
Counter({'machine_learning': 3, 'computer_architecture': 3, 'artificial_intelligence': 2, 'databases': 1})
