In [10]:
import os
import random
import urllib.request
import warnings

from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    TrainerCallback,
    SchedulerType,
)


In [11]:
import torch

In [12]:
# Report, one value per line: whether CUDA is usable, the installed torch
# version, and the CUDA version torch reports.
for env_fact in (
    torch.cuda.is_available(),
    torch.__version__,
    torch.version.cuda,
):
    print(env_fact)

True
2.6.0+cu126
12.6


In [13]:
# Map-style dataset that downloads one PMC file per item and tokenises it on the fly
class PMCDataset(Dataset):
    """Lazily download PMC files and tokenise them for causal-LM training.

    Each item is fetched from ``base_url + file_list[idx]`` at the moment it is
    requested. Nothing is cached, so every access performs a new download.

    Args:
        file_list: Sequence of paths; each one is appended to ``base_url``.
        base_url: URL prefix, joined to the path by plain string concatenation.
        tokenizer: Callable invoked with ``truncation=True``, ``max_length``,
            ``padding='max_length'`` and ``return_tensors='pt'``; it must return
            a mapping of tensors with a leading batch dimension of 1.
        max_length: Length every sample is truncated / padded to.
        timeout: Seconds to wait on the network before giving up on a file.

    NOTE(review): the downloaded bytes are decoded directly as UTF-8 text. If
    the listed paths point at archives (e.g. ``.tar.gz`` packages) rather than
    plain-text files, the decoded "text" will be garbage — confirm the payload
    format against the file list being used.
    """

    def __init__(self, file_list, base_url, tokenizer, max_length=512, timeout=30):
        self.file_paths = file_list
        self.base_url = base_url
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.timeout = timeout

    def __len__(self):
        """Return the number of files in the list (one sample per file)."""
        return len(self.file_paths)

    def _fetch_text(self, url):
        """Download ``url`` and return its body as text, or ``""`` on failure."""
        try:
            # The context manager closes the connection even if read() fails;
            # the timeout keeps a stalled transfer from blocking forever.
            with urllib.request.urlopen(url, timeout=self.timeout) as response:
                return response.read().decode('utf-8', errors='ignore')
        except Exception as exc:
            # Best-effort by design: a single bad download must not abort
            # training, but it should be visible rather than silent.
            warnings.warn(
                f"Could not fetch {url!r} ({exc!r}); using empty text instead.",
                RuntimeWarning,
            )
            return ""

    def __getitem__(self, idx):
        """Fetch, tokenise and return sample ``idx`` as a dict of 1-D tensors.

        Besides the tokenizer's own outputs, the dict carries a ``labels``
        entry (a copy of ``input_ids`` with padded positions set to -100) so
        that a causal-LM model can compute its loss. A failed download yields
        a sample whose labels are entirely -100.
        """
        url = self.base_url + self.file_paths[idx]
        article_text = self._fetch_text(url)

        encoded = self.tokenizer(
            article_text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        # Drop the batch dimension the tokenizer adds for return_tensors='pt'.
        sample = {key: value.squeeze(0) for key, value in encoded.items()}

        if "input_ids" in sample and "labels" not in sample:
            labels = sample["input_ids"].clone()
            attention_mask = sample.get("attention_mask")
            if attention_mask is not None:
                # -100 is the index the loss ignores; mask via attention_mask
                # rather than the pad id, since pad and EOS ids may coincide.
                labels[attention_mask == 0] = -100
            sample["labels"] = labels
        return sample

# Trainer callback that echoes the training loss with a plain print()
class TqdmLoggingCallback(TrainerCallback):
    """Print the current step and loss each time the trainer emits a log entry."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print ``logs['loss']`` with the global step; ignore entries without a loss."""
        # Nothing to report when there is no log payload or no loss in it.
        if logs is None or "loss" not in logs:
            return
        print(f"Step {state.global_step}: loss = {logs['loss']:.4f}")

In [14]:
# Fine-tune a causal LM on a small random sample of PMC files.
#
# Environment note: Trainer needs `accelerate>=0.26.0` installed next to
# transformers and raises ImportError without it. Install it into the kernel's
# environment first, e.g.  %pip install "accelerate>=0.26.0"

# Configurable parameters
FILE_LIST = "oa_file_list.txt"
BASE_URL = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/"
NUMBER_OF_ARTICLES = 10  # For proof-of-concept
MAX_LENGTH = 2048  # Increased to capture more from each article
SEED = 42  # Fixes which articles get sampled so reruns are comparable

# Read the file list: the first tab-separated field of each non-blank line is
# the path. strip() removes the trailing newline that would otherwise stay
# attached to lines containing no tab.
# NOTE(review): if the list begins with a header/timestamp line it is treated
# as a path too — confirm the file format.
with open(FILE_LIST, "r", encoding="utf-8") as f:
    file_paths = [line.split('\t')[0].strip() for line in f if line.strip()]
if not file_paths:
    raise ValueError(f"No file paths found in {FILE_LIST!r}")

# Sample with a private, seeded generator so the global `random` state is
# left alone and the same articles are picked on every run.
if len(file_paths) > NUMBER_OF_ARTICLES:
    file_paths = random.Random(SEED).sample(file_paths, NUMBER_OF_ARTICLES)

# Load tokenizer and model from Hugging Face
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # The dataset pads to max_length, which requires a pad token.
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create the streaming dataset
dataset = PMCDataset(file_paths, BASE_URL, tokenizer, max_length=MAX_LENGTH)

# Define training arguments. Notice the lr_scheduler_type set to cosine.
# No evaluation is configured: "no" is already the default strategy, so the
# keyword (renamed across transformers releases) is simply omitted.
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=5,
    save_steps=50,
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    lr_scheduler_type=SchedulerType.COSINE,  # Cosine annealing scheduler
    warmup_steps=10,
    seed=SEED,
)

# Create the Trainer instance with our custom callback for additional logging.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    callbacks=[TqdmLoggingCallback()],
)

# Start fine-tuning
trainer.train()



ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`