In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import DataCollatorForCompletionOnlyLM, SFTConfig, SFTTrainer

In [2]:
model_id = "HuggingFaceTB/SmolLM-135M-Instruct"

In [3]:
dataset = load_dataset("medalpaca/medical_meadow_medical_flashcards")
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 33955
    })
})

In [4]:
dataset["train"][0]

{'input': 'What is the relationship between very low Mg2+ levels, PTH levels, and Ca2+ levels?',
 'output': 'Very low Mg2+ levels correspond to low PTH levels which in turn results in low Ca2+ levels.',
 'instruction': 'Answer this question truthfully'}

In [5]:
def format_dataset(dataset):
    dataset = dataset.remove_columns(['instruction'])
    dataset = dataset.rename_column("output", "response")
    dataset = dataset.rename_column("input", "instruction")
    return dataset

formatted_dataset = format_dataset(dataset)
formatted_dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'response'],
        num_rows: 33955
    })
})

In [6]:
split_dataset = formatted_dataset["train"].train_test_split(test_size=0.2)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]
print(f"{train_dataset=}")
print(f"{eval_dataset=}")

train_dataset=Dataset({
    features: ['instruction', 'response'],
    num_rows: 27164
})
eval_dataset=Dataset({
    features: ['instruction', 'response'],
    num_rows: 6791
})


In [7]:
def print_example(example):
    print(f"Instruction: {example['instruction']}")
    print(f"Response: {example['response']}")
    
example1 = train_dataset[0]
print_example(example1)

Instruction: What type of stain can be used to visualize Cryptosporidium oocysts?
Response: Cryptosporidium oocysts can be visualized with an Acid-Fast (Ziehl-Neelsen) stain.


In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

messages = [{"role": "user", "content": example1['instruction']}]
input_text=tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


user
What type of stain can be used to visualize Cryptosporidium oocysts?
assistant
Cryptosporidium oocysts are a type of parasite that can be visualized using various stains, including:

1. **Fluorescent Inhibitors (FIs)**: FIs are a class of fluorescent dyes that can be


In [9]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"<|im_start|>user\n{example['instruction'][i]}<|im_end|>\n<|im_start|>assistant\n{example['response'][i]}<|im_end|>"
        output_texts.append(text)
    return output_texts

response_template = "<|im_start|>assistant\n"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [12]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none"
)

output_dir = f"fine-tuned-{model_id.split('/')[-1]}"


sft_config = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=5,
    max_seq_length=512,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    save_steps=500,
    logging_steps=500,
    learning_rate=1e-3,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    warmup_ratio=0.05,
    lr_scheduler_type="constant",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    peft_config=peft_config,
    args=sft_config,
)

Map:   0%|          | 0/27164 [00:00<?, ? examples/s]

Map:   0%|          | 0/6791 [00:00<?, ? examples/s]

In [13]:
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
500,1.2299
1000,1.1677
1500,1.1378
2000,1.1017
2500,1.0904
3000,1.0535
3500,1.0398
4000,1.0168


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  return fn(*args, **kwargs)
  with torch.enab

TrainOutput(global_step=4245, training_loss=1.0997838535915416, metrics={'train_runtime': 3474.1434, 'train_samples_per_second': 39.095, 'train_steps_per_second': 1.222, 'total_flos': 1.860542916305357e+16, 'train_loss': 1.0997838535915416, 'epoch': 5.0})

In [14]:
trainer.save_model()

In [16]:
ft_model = AutoModelForCausalLM.from_pretrained(output_dir).to(device)

messages = [{"role": "user", "content": example1['instruction']}]
input_text=tokenizer.apply_chat_template(messages, tokenize=False)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
outputs = ft_model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
print_example(example1)
print("Fine-tuned model response:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Instruction: What type of stain can be used to visualize Cryptosporidium oocysts?
Response: Cryptosporidium oocysts can be visualized with an Acid-Fast (Ziehl-Neelsen) stain.
Fine-tuned model response:
user
What type of stain can be used to visualize Cryptosporidium oocysts?
assistant
Cryptosporidium oocysts can be visualized using a silver-stain. Cryptosporidium oocysts are a type of parasite that can cause cryptoquiron-resistant infections in humans. Silver-stain is a type
