# SciBERT Topic Classification Pipeline
This notebook sets up a standard local environment for fine-tuning SciBERT to classify scientific abstracts by topic.


## 1. Environment Setup
Install necessary packages. Run this in your terminal or add `!` to run in notebook.

In [None]:
pip install transformers datasets torch sentencepiece

In [None]:
pip install --upgrade transformers tokenizers huggingface-hub

## 2. Imports

In [12]:
import pandas as pd
import numpy as np
import transformers
import inspect
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset #seamless integration with PyTorch/Transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline, BertTokenizer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import torch

In [3]:
print("Transformers version:", transformers.__version__)
print("TrainingArguments class:", TrainingArguments)
print("Defined in module:", TrainingArguments.__module__)
print("Signature:", inspect.signature(TrainingArguments.__init__))

Transformers version: 4.51.3
TrainingArguments class: <class 'transformers.training_args.TrainingArguments'>
Defined in module: transformers.training_args


## 3. Load and Prepare Data

In [None]:
df = pd.read_csv(r"C:\Users\maxwell.bicking\texts.csv")  # Update path as needed

In [None]:
# 1. Load the raw strings
ds = Dataset.from_pandas(df[['TEXT_CONTENT', 'TOPIC_NEW']])

# 2. Convert TOPIC_NEW → ClassLabel, creating a new column "labels"
ds = ds.class_encode_column('TOPIC_NEW').rename_column('TOPIC_NEW','labels')

# 3. Do a stratified split on the ClassLabel
ds = ds.train_test_split(test_size=0.2, stratify_by_column='labels')

train_ds = ds['train']
val_ds   = ds['test']

Casting to class labels: 100%|██████████| 6529/6529 [00:00<00:00, 91734.29 examples/s]


## 4. Load SciBERT Model & Tokenizer

In [3]:
MODEL_NAME = 'allenai/scibert_scivocab_uncased'
NUM_LABELS  = train_ds.features['labels'].num_classes
LABEL_NAMES = train_ds.features['labels'].names

First clone the scibert github repo to use locally (if using VPN)

**git lfs install** <br>
**git clone https://huggingface.co/allenai/scibert_scivocab_uncased [insert your path here after the space]**

In [None]:
'''
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, do_lower_case=True)
model     = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS
)
'''

LOCAL = r"C:\Users\maxwell.bicking\scibert_local"
tokenizer = BertTokenizer.from_pretrained(LOCAL, do_lower_case=True)
model     = AutoModelForSequenceClassification.from_pretrained(
    LOCAL,
    num_labels=NUM_LABELS
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at C:\Users\maxwell.bicking\scibert_local and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 5. Tokenize Dataset

In [None]:
def tokenize_fn(batch):
    return tokenizer(
        batch["TEXT_CONTENT"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

# Map and remove the raw text column afterwards
train_tok = train_ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=["TEXT_CONTENT"]
)
val_tok = val_ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=["TEXT_CONTENT"]
)

# Now we should have: ['labels','input_ids','token_type_ids','attention_mask']
print(train_tok.column_names)

# Tell the datasets to return PyTorch tensors for these three columns:
train_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_tok.set_format  ("torch", columns=["input_ids", "attention_mask", "labels"])


'''
old

def tokenize_fn(batch):
    return tokenizer(
        batch['TEXT_CONTENT'],
        padding='max_length',
        truncation=True,
        max_length=512
    )

train_tok = train_ds.map(tokenize_fn, batched=True)
val_tok = val_ds.map(tokenize_fn, batched=True)
train_tok = train_tok.rename_column('label_id', 'labels')
val_tok = val_tok.rename_column('label_id', 'labels')
train_tok.set_format('torch', columns=['input_ids','attention_mask','labels'])
val_tok.set_format('torch', columns=['input_ids','attention_mask','labels'])
'''

Map: 100%|██████████| 5223/5223 [00:38<00:00, 135.74 examples/s]
Map: 100%|██████████| 1306/1306 [00:09<00:00, 138.44 examples/s]

['labels', 'input_ids', 'token_type_ids', 'attention_mask']





"\nold\n\ndef tokenize_fn(batch):\n    return tokenizer(\n        batch['ABSTRACT_CONTENT'],\n        padding='max_length',\n        truncation=True,\n        max_length=512\n    )\n\ntrain_tok = train_ds.map(tokenize_fn, batched=True)\nval_tok = val_ds.map(tokenize_fn, batched=True)\ntrain_tok = train_tok.rename_column('label_id', 'labels')\nval_tok = val_tok.rename_column('label_id', 'labels')\ntrain_tok.set_format('torch', columns=['input_ids','attention_mask','labels'])\nval_tok.set_format('torch', columns=['input_ids','attention_mask','labels'])\n"

## 6. Fine-Tuning with Trainer

## Next Steps

- Pull representative subsample to use here (locally) and train
- Research different fine-tuning options with Azure Foundry
- Figure out how deployment works with Foundry

In [9]:
import os
print("CWD files:", os.listdir(os.getcwd()))

CWD files: ['$ spend Grid.xlsx', '01ae53c7-0004-b5a6-0037-fd070261102e.csv (1).gz', '01ae53c7-0004-b5a6-0037-fd070261102e.csv.gz', '038264fe-bf8d-4723-9edb-19ce3e12c9c1.csv', '06c7bcfbd8ca.csv', '10YearGivingHistory.xlsx', '11-1-2022.csv', '110316112.webp', '1687273277904.xlsx', '1d064a71f2d2.csv', '2004-2005 Program Comm Roster.pdf', '2004-2005 Program Comm Roster.xlsx', '2014-2015 Education Committee Suggested Members_ Alpha Order.csv', '2015 Education Comm Suggestions as of 7.23.14.csv', '2015 Education Committee Member Invitation Tracking Report 9.17.14.csv', '2015 Education Committee Member Invitation Tracking Report 9.3.14.csv', '2015 Education Committee Members - FINAL.csv', '2015 Education Committee Suggested Members_ Approved 8.11.14.csv', '2015 Montly E-Blast Reports.xlsx', '2016-2017 AM Education Committee Roster (Final).csv', '2017 Annual Meeting Education Comm Nominations.csv', '2023 Advances in Kidney Cancer Research.csv', '2023 Meetings & Attendees.xlsx', '2023 Time Card

In [7]:
pip install accelerate

Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.6.0-py3-none-any.whl (354 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.6.0
Note: you may need to restart the kernel to use updated packages.




In [None]:
training_args = TrainingArguments(
    output_dir='./scibert-finetuned',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'f1': f1_score(p.label_ids, preds, average='macro')
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()

In [None]:
# save the best model (or last checkpoint)
trainer.save_model("./scibert-finetuned/final_model")

# also save the tokenizer
tokenizer.save_pretrained("./scibert-finetuned/final_model")

In [None]:
#Recall the model later

MODEL_DIR = "./scibert-finetuned/final_model"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model     = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)

## 7. Evaluation on Validation Set

In [None]:
metrics = trainer.evaluate(eval_dataset=val_tok)
print(metrics)

pred_output = trainer.predict(test_dataset=val_tok)
preds = np.argmax(pred_output.predictions, axis=1)
labels = pred_output.label_ids
print(classification_report(labels, preds, target_names=le.classes_))

cm = confusion_matrix(labels, preds)
print(cm)

## 8. Inference Pipeline

In [None]:
classifier = pipeline(
    'text-classification',
    model=trainer.state.best_model_checkpoint,
    tokenizer=tokenizer,
    return_all_scores=True
)

def classify_abstract(text):
    res = classifier(text)[0]
    top = max(res, key=lambda x: x['score'])
    return le.inverse_transform([int(top['label'])])[0], top['score']

# Example Usage
example = "Sample abstract text here"
print(classify_abstract(example))