In [None]:
# Check that PyTorch is using the GPU.
import torch
torch.cuda.is_available()

In [None]:
# Set environment variables, this must be done before importing transformers.
from dotenv import load_dotenv
load_dotenv(override=True)

import os
import torch

# Report where models will be read from / written to.
# Hugging Face treats these values (case-insensitively) as "offline mode on";
# parsing with int() would crash on e.g. TRANSFORMERS_OFFLINE=true.
offline_flag = os.environ.get("TRANSFORMERS_OFFLINE", "0").strip().upper()
is_offline = offline_flag in {"1", "ON", "YES", "TRUE"}
# HF_HOME is optional: fall back to the default cache location instead of raising KeyError.
hf_home = os.environ.get("HF_HOME", os.path.join(os.path.expanduser("~"), ".cache", "huggingface"))

if is_offline:
    print("Using cached models from", hf_home)
else:
    print("Loading model from huggingface hub and saving to", hf_home)

# Optional: disable warnings.
# import warnings
# warnings.filterwarnings("ignore")

In [None]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

# Introduction to Llama-3

Meta's Llama-3 is one of the best-performing open-source language models. We will use the 8 billion parameter instruction-tuned version in this tutorial.

Llama-3 can be prompted using a chat template: a prompt is a list of messages, each of which has a role (one of "system", "user" or "assistant") and content. System messages are meta-prompts that are used to define instructions that are hidden from the end user, whereas the user and assistant (the LLM) are taking turns in a chat dialogue.

**Note:** Make sure to set the `HF_TOKEN` environment variable to access Llama-3 (unless you are running on Puhti or Mahti, where Llama-3-8B-Instruct is already downloaded).

In [None]:
# Checkpoint used throughout the notebook.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Text-generation pipeline for the checkpoint: weights are requested in bfloat16 and
# device placement is delegated to device_map="auto".
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

# A chat prompt is a list of {"role", "content"} messages.
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Token ids that end generation: the tokenizer's EOS token and the "<|eot_id|>" token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Sample up to 256 new tokens (temperature 0.6, top_p 0.9).
outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
# The last entry of "generated_text" is printed; presumably this is the assistant's reply — confirm.
print(outputs[0]["generated_text"][-1])

In [None]:
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]
outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][-1])

# Fine-tuning Llama-3

We will now fine-tune Llama-3-8B on a new dataset. Specifically, we will use human preference data to train the model to better reflect the preferences of human users.

Huggingface contains a vast library of datasets, which you can explore at https://huggingface.co/datasets.

Instead of fine-tuning all parameters, we will use LoRA, low-rank adaptation, to reduce the computational cost of fine-tuning: https://arxiv.org/abs/2106.09685.

## Preparing data


In [None]:
import functools
import json

from datasets import load_dataset

In [None]:
dataset_name = "lmsys/lmsys-arena-human-preference-55k"
dataset = load_dataset(dataset_name, split='train[:2000]')

dataset

In [None]:
# Drop row IDs and model names.
# dataset = dataset.remove_columns(["id", "model_a", "model_b"])

In [None]:
dataset.info.features

In [None]:
import json

def parse_list_from_str_column(text):
    """Parse a JSON-encoded list stored in a string column.

    Args:
        text: JSON text expected to encode a list, e.g. '["hello", null]'.

    Returns:
        The decoded list with one trailing ``None`` entry removed, or an empty
        list when ``text`` is not valid JSON or does not decode to a list.
    """
    try:
        parsed = json.loads(text)
    except (TypeError, ValueError) as e:
        # json.JSONDecodeError is a ValueError; TypeError covers non-string input such as None.
        # Report the offending value and fall back to an empty list (best effort).
        print(e)
        print(text)
        print()
        return []

    if not isinstance(parsed, list):
        # e.g. the JSON literal `null` or a bare string: not a list of turns.
        return []
    if len(parsed) > 0 and parsed[-1] is None:
        parsed = parsed[:-1]
    return parsed

def format_prompt(example):
    """Attach a preference-judging prompt and its gold label to ``example``.

    Reads "winner_model_a", "winner_model_b", "prompt", "response_a" and
    "response_b" from ``example``; writes "preference_prompt" (the instruction
    followed by the user prompt and both responses) and "preference_label"
    ("A", "B" or "tie"). The same mapping is mutated and returned.
    """
    if example["winner_model_a"]:
        label = "A"
    elif example["winner_model_b"]:
        label = "B"
    else:
        label = "tie"

    # Each column holds a JSON-encoded list of turns.
    user_turns, a_turns, b_turns = (
        parse_list_from_str_column(example[column])
        for column in ("prompt", "response_a", "response_b")
    )

    pieces = [
        'Which response to the following prompt is better? Answer with "A", "B" or "tie" only.',
        "\n\nUser prompt:\n", "\n\n".join(user_turns),
        "\n\nResponse A:\n", "\n\n".join(a_turns),
        "\n\nResponse B:\n", "\n\n".join(b_turns),
    ]
    example["preference_prompt"] = "".join(pieces)
    example["preference_label"] = label
    return example

def is_single_turn_dialogue(example):
    """Return True when ``example`` is a usable one-turn dialogue.

    The "prompt", "response_a" and "response_b" columns must each parse to a
    list of exactly one entry, and none of those entries may be the string
    "null". Examples rejected for containing "null" are printed.
    """
    turns = [
        parse_list_from_str_column(example[column])
        for column in ("prompt", "response_a", "response_b")
    ]
    if not all(len(column_turns) == 1 for column_turns in turns):
        return False

    has_null = any(column_turns[0] == "null" for column_turns in turns)
    if has_null:
        print("IS NULL")
        print(*turns)
    return not has_null

# Simplify the task to one-turn dialogues.
prompt_dataset = dataset.filter(is_single_turn_dialogue)
# Format the dataset into (prompt, answer) pairs.
prompt_dataset = prompt_dataset.map(format_prompt)
print('Original dataset length', len(dataset), 'filtered length', len(prompt_dataset))

In [None]:
prompt_dataset[0]

In [None]:
print(prompt_dataset[0]["preference_prompt"])

In [None]:
print(prompt_dataset[0]["preference_label"])

In [None]:
# Prompt model before fine-tuning
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": prompt_dataset[0]["preference_prompt"]},
]
outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
print(outputs[0]["generated_text"][-1])

In [None]:
prompt_dataset_split = prompt_dataset.train_test_split(train_size=0.8, seed=42)

In [None]:
for k, v in prompt_dataset_split.items():
    print(k, len(v))

In [None]:
def predict_with_pipeline(example, pipeline):
    """Add the model's answer to the preference prompt(s) in ``example``.

    Handles both a single example and a batch as delivered by
    ``Dataset.map(..., batched=True)``, where every column is a list.

    Args:
        example: Mapping with a "preference_prompt" entry holding either one
            prompt string or a list of prompt strings.
        pipeline: Callable chat text-generation pipeline. Called with one chat
            (a list of message dicts) it returns a list of results; called with
            a list of chats it returns one such list per chat.

    Returns:
        The same mapping with "initial_prediction" added: a string for a single
        example, a list of strings (one per prompt) for a batch.
    """
    def build_messages(prompt):
        return [
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": prompt},
        ]

    def last_message_content(chat_outputs):
        return chat_outputs[0]["generated_text"][-1]["content"]

    prompts = example["preference_prompt"]
    if isinstance(prompts, (list, tuple)):
        # Batched call: one chat per prompt, one prediction per chat.
        if len(prompts) == 0:
            example["initial_prediction"] = []
        else:
            outputs = pipeline([build_messages(prompt) for prompt in prompts])
            example["initial_prediction"] = [last_message_content(o) for o in outputs]
    else:
        example["initial_prediction"] = last_message_content(pipeline(build_messages(prompts)))
    return example

# Bind the generation settings once so predict_with_pipeline only has to pass the messages:
# one new token per prompt, no sampling.
pipeline_with_kwargs = functools.partial(
    pipeline,
    max_new_tokens=1,
    eos_token_id=terminators,
    do_sample=False,
)

# Run predict_with_pipeline over prompt_dataset in a single process (num_proc=1).
# NOTE(review): with batched=True the mapped function receives lists of column values
# rather than one example — confirm predict_with_pipeline is meant to handle batches.
dataset_with_predictions = prompt_dataset.map(
    predict_with_pipeline,
    batched=True,
    num_proc=1,
    fn_kwargs={"pipeline": pipeline_with_kwargs})

In [None]:
# from transformers.utils import send_example_telemetrya

# send_example_telemetry("question_answering_notebook", framework="pytorch")

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer for the checkpoint named by `model_id`
# (`model_name` is never defined in this notebook, so the original call raised NameError).
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Padding is requested when tokenizing below, so a pad token is needed; reuse EOS.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_fn(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer``.

    Sets the tokenizer's pad token to its EOS token and its truncation side to
    "left" on every call, then encodes ``text`` as PyTorch tensors with padding
    and truncation to at most 256 tokens.

    Returns:
        Whatever the tokenizer call returns for these options.
    """
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.truncation_side = "left"

    encode_options = dict(
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
    )
    return tokenizer(text, **encode_options)

def tokenize_columns(dataset, text_columns):
    """Replace each column named in ``text_columns`` with its tokenized form.

    ``dataset`` is modified in place (item assignment per column) and returned.
    """
    for column_name in text_columns:
        raw_text = dataset[column_name]
        dataset[column_name] = tokenize_fn(raw_text)
    return dataset

In [None]:
from functools import partial

In [None]:
# Tokenize the three text columns, one example per call (batched=True with batch_size=1).
# NOTE(review): `tokenized_dataset` is assigned again by the next `dataset.map(...)` cell,
# so this result is overwritten — confirm whether this cell is still needed.
tokenized_dataset = dataset.map(
    partial(tokenize_columns, text_columns=('prompt', 'response_a', 'response_b')),
    batched=True,
    batch_size=1,
    drop_last_batch=True)

In [None]:
def tokenize_function(batch):
    """Tokenize the "prompt" column of ``batch`` with the shared ``tokenize_fn`` settings.

    NOTE(review): `tokenize_function` was referenced here but never defined in
    the notebook. This definition assumes the raw "prompt" column is the
    training text — confirm, or swap in the intended column.
    """
    return tokenize_fn(batch["prompt"])


# Keep only the tokenizer outputs: every original column is removed.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True,
    remove_columns=dataset.column_names,
)

print(tokenized_dataset)

In [None]:
tokenized_dataset.train_test_split(test_size=0.3)

In [None]:
# Seed the split so train/eval membership is reproducible across kernel restarts
# (same seed as the prompt_dataset split earlier in the notebook).
split = tokenized_dataset.train_test_split(test_size=0.3, seed=42)
train_dataset, eval_dataset = split["train"], split["test"]

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling


In [None]:
# Collator built from the notebook tokenizer with masked-language-model masking disabled (mlm=False).
data_collator=DataCollatorForLanguageModeling(tokenizer,mlm=False)

# Both loaders share the collator and use batches of 2 with pinned memory.
train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=2, pin_memory=True
    )
# NOTE(review): the evaluation loader shuffles as well — confirm this is intended.
eval_dataloader = DataLoader(
        eval_dataset, shuffle=True, collate_fn=data_collator, batch_size=2, pin_memory=True
    )

# Model parallel
- Manually
- Huggingface


In [None]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    """Two-layer MLP (10000 -> 10 -> 5) used to demonstrate manual model parallelism.

    Each linear layer may live on its own device. ``forward`` moves the
    activations to whichever device holds the next layer's weights, so the
    model works on CPU, on a single GPU, or split over two GPUs.

    Args:
        first_device: Optional device for ``linear1`` (e.g. "cuda:0").
        second_device: Optional device for ``linear2`` (e.g. "cuda:1").
            When omitted, a layer stays where PyTorch created it.
    """

    def __init__(self, first_device=None, second_device=None):
        super().__init__()
        # First linear layer
        self.linear1 = nn.Linear(10000, 10)
        # ReLU activation
        self.relu = nn.ReLU()
        # Second linear layer
        self.linear2 = nn.Linear(10, 5)

        # Optional explicit placement: this is what makes the model "parallel".
        if first_device is not None:
            self.linear1.to(first_device)
        if second_device is not None:
            self.linear2.to(second_device)

    def forward(self, x):
        """Apply linear1 -> ReLU -> linear2, hopping devices as needed.

        The result lives on the device of ``linear2``.
        """
        # Follow the weights rather than hard-coding "cuda:0"/"cuda:1": hard-coded
        # devices break as soon as the layers are placed anywhere else.
        x = self.linear1(x.to(self.linear1.weight.device))  # Apply the first linear layer
        x = self.relu(x)  # Apply the ReLU activation
        x = self.linear2(x.to(self.linear2.weight.device))  # Apply the second linear layer
        return x

# Example of creating an instance of the model
model = MyModel()
print(model)

In [None]:
next(model.linear1.parameters()).device

In [None]:
next(model.linear2.parameters()).device

In [None]:
model = model.to('cuda:0')

In [None]:
import torch
torch.cuda.current_device()

In [None]:

# Define a loss function, for example, Mean Squared Error for a regression task
loss_function = nn.MSELoss()

# Define an optimizer, e.g., Stochastic Gradient Descent, with a learning rate of 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)


batch_size = 3
input_size = 10000
output_size = 5
dummy_input = torch.randn(batch_size, input_size)
# The label is placed on the device that holds linear2.
dummy_label = torch.randn(batch_size, output_size).to(next(model.linear2.parameters()).device)

# Forward step
model.train()  # Set the model to training mode
optimizer.zero_grad()  # Clear any gradients from the previous step
output = model(dummy_input)  # Compute the model's output
# The target tensor is named `dummy_label`; `dummy_target` was never defined.
loss = loss_function(output, dummy_label)  # Compute the loss

# Backward step
loss.backward()  # Compute gradients
optimizer.step()  # Update parameters

print(f"Loss: {loss.item()}")


In [None]:
from transformers import DistilBertModel, DistilBertConfig

class ModelParallelDistilBERT(nn.Module):
    """DistilBERT split in two: embeddings + transformer layers [:3] on GPU 0, layers [3:] on GPU 1.

    NOTE(review): this class looks like it will not run as written — see the
    notes in ``forward``. A fix depends on the call signature and mask format
    of the DistilBERT layers in the installed transformers version.
    """
    def __init__(self, *args, **kwargs):
        super(ModelParallelDistilBERT, self).__init__()

        # Load the configuration and create a DistilBERT model
        # NOTE(review): the model is built from the config only; presumably the weights
        # are freshly initialised rather than the pretrained ones — confirm.
        config = DistilBertConfig.from_pretrained('distilbert-base-uncased', *args, **kwargs)
        distilbert = DistilBertModel(config)

        # Split the model into two parts
        self.part1 = nn.Sequential(
            distilbert.embeddings,
            *distilbert.transformer.layer[:3]  # First half of the layers
        )

        self.part2 = nn.Sequential(
            *distilbert.transformer.layer[3:]  # Second half of the layers
        )

        # Place each part on a different GPU
        self.part1.cuda(0)
        self.part2.cuda(1)

    def forward(self, input_ids, attention_mask=None):
        """Run the inputs through part1 on cuda:0, then part2 on cuda:1."""
        # Forward pass through the first part
        # NOTE(review): attention_mask defaults to None but .to() is called on it unconditionally.
        # NOTE(review): part1 is an nn.Sequential called with two positional arguments;
        # verify that nn.Sequential accepts this and forwards the mask to every layer.
        output = self.part1(input_ids.to('cuda:0'), attention_mask.to('cuda:0'))

        # Forward pass through the second part
        # NOTE(review): the moved output is unpacked with `*` into separate positional
        # arguments — confirm this is the intended way to call part2.
        output = self.part2(*output.to('cuda:1'))

        return output

# Example usage
model = ModelParallelDistilBERT()

# Create dummy input data
input_ids = torch.randint(0, 30522, (1, 512)).cuda(0)  # Example input token IDs
attention_mask = torch.ones((1, 512)).cuda(0)          # Example attention mask

# Forward pass
output = model(input_ids, attention_mask)


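**`device_map`** (Hugging Face): instead of placing layers by hand, pass a placement strategy such as `"auto"` or `"balanced"` when loading the model.

Placement priority: GPUs > CPU > disk.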

In [None]:
# # Example usage
# prompt = "Once upon a time"  # Replace with your own prompt
# # Encode the prompt
# input_ids = tokenizer.encode(prompt, return_tensors='pt')

# # Generate text
# output = base_model.generate(input_ids, max_length=20, num_return_sequences=3, no_repeat_ngram_size=2)

# # Decode and return the generated text
# for text in [tokenizer.decode(generated_id, skip_special_tokens=True) for generated_id in output]:
#     print(text)

In [None]:
# import os
# import csv
# import torch


#  DataCollatorForSeq2Seq

# from datasets import load_dataset
# from peft import LoraConfig, TaskType, get_peft_model
# from peft.utils.other import fsdp_auto_wrap_policy


# PEFT: Parameter-efficient fine-tuning

In [None]:
from transformers import AutoModelForCausalLM
from peft import PeftModel, LoraConfig, get_peft_model
import torch

In [None]:
# Load the checkpoint named by `model_id`; `model_name` is never defined in this
# notebook, so the original call raised NameError on a fresh kernel.
base_model = AutoModelForCausalLM.from_pretrained(model_id,
#                                                  load_in_8bit=True,
#                                                   torch_dtype=torch.float16,
#                                                  device_map="auto",
                                                  # device_map = {"": "cuda:" + str(int(os.environ.get("LOCAL_RANK") or 0))}
                                                 )

In [None]:
base_model

In [None]:
# for i, param in enumerate(base_model.named_parameters()):
#     print(f'{i},{param[0]}\t {param[1].device} \t{param[1].dtype}')

In [None]:
# Freeze all parameters
for param in base_model.parameters():
    param.requires_grad = False

In [None]:
def print_trainable_parameters(model):
    """
    Print how many parameter elements of a model are trainable and how many it has in total.

    Args:
    model: A model exposing ``named_parameters()`` (e.g. a Hugging Face model).
    """
    # (element count, is trainable) for every parameter tensor.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_params = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)
    print(f"trainable_params: {trainable_params}")
    print(f"all_params: {all_params}")
    
print_trainable_parameters(base_model)

In [None]:
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

LoRA: Low-Rank Adaptation of Large Language Models
https://arxiv.org/abs/2106.09685

In [None]:
base_model

In [None]:
peft_model = get_peft_model(base_model, config)

In [None]:
peft_model

**Different training tools from Huggingface**:

- Huggingface accelerate library

- Trainer: this API supports distributed training on multiple GPUs/TPUs and mixed precision through NVIDIA Apex for NVIDIA GPUs, ROCm APEX for AMD GPUs, and native AMP for PyTorch.


## Huggingface accelerate library

You may want to use different settings, resources or environments for model training in different phases of your research. Different APIs and libraries each provide their own interface for launching training:
```bash
# A single GPU/CPU
python your_script.py
```
or 

```bash
# Multiple GPUs
torchrun --nnode=1 --nproc_per_node=4 your_script.py
```
or 

```bash
# Multiple GPUs
deepspeed --num_gpus=4 your_script.py
```
or

......

This often means many lines of code changed. 

Is there a better way of doing this? 

Yes, the accelerate library solves this and ensures the same code can be run on different computing resources.

In [None]:
from accelerate import Accelerator

In [None]:
accelerator = Accelerator()

# Training hyper-parameters.
batch_size = 1
gradient_accumulation_steps = 8
max_length = 512
lr = 1e-4
num_epochs = 3

# print_trainable_parameters() prints its summary itself and returns None, so
# wrapping it in accelerator.print() only added a stray "None" line.
# Report once per server, which is what accelerator.print() would have done.
if accelerator.is_local_main_process:
    peft_model.print_trainable_parameters()

In [None]:
# for batch in train_dataloader:
#     print(batch)
#     input_ids, attention_mask = batch
#     outputs = model(input_ids, attention_mask=attention_mask)
#     # Now, you can use outputs for your task


In [None]:
from torch import optim
from transformers import get_linear_schedule_with_warmup

In [None]:
import math

optimizer = optim.AdamW(peft_model.parameters(), lr=lr)

# The optimizer (and with it the scheduler) steps once per accumulation window,
# not once per batch, so the schedule length is the number of optimizer updates.
# Counting batches would leave the learning rate far from zero when training ends.
updates_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=updates_per_epoch * num_epochs,
)

In [None]:
peft_model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
        peft_model, train_dataloader, eval_dataloader, optimizer, lr_scheduler
    )

In [None]:
# fsdp_auto_wrap_policy was only mentioned in a commented-out import, so bring it into scope here.
from peft.utils.other import fsdp_auto_wrap_policy

# When FSDP is active, derive the wrap policy from the model being trained
# (`peft_model`); `model` at this point is the DistilBERT demo object.
# NOTE(review): the policy normally has to be set before accelerator.prepare()
# wraps the model — confirm the cell order.
if getattr(accelerator.state, "fsdp_plugin", None) is not None:
    accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(peft_model)

In [None]:
from tqdm import tqdm

for epoch in range(num_epochs):
    peft_model.train()
    total_loss = 0
    num_batches = len(train_dataloader)
    for step, batch in enumerate(tqdm(train_dataloader)):
        outputs = peft_model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        # Scale the loss so the accumulated gradient is the mean over the window
        # rather than the sum, keeping the effective learning rate unchanged.
        accelerator.backward(loss / gradient_accumulation_steps)

        # Step after every full window (step + 1, so not after the very first batch)
        # and flush the final, possibly partial, window of the epoch.
        window_complete = (step + 1) % gradient_accumulation_steps == 0
        if window_complete or step + 1 == num_batches:
            optimizer.step()
            lr_scheduler.step()
            # Every trainable parameter belongs to the optimizer, so this clears all gradients.
            optimizer.zero_grad()

    mean_loss = float(total_loss) / max(num_batches, 1)
    accelerator.print(f"epoch {epoch}: mean train loss {mean_loss:.4f}")

#         capture_batch_analytics(epoch, 'train', step, loss.detach().float(), total_loss, batch["input_ids"], batch["labels"])

#     peft_model.eval()
#     eval_loss = 0
#     for step, batch in enumerate(tqdm(eval_dataloader)):
#         with torch.no_grad():
#             outputs = model(**batch)
#         loss = outputs.loss
#         eval_loss += loss.detach().float()
#         capture_batch_analytics(epoch, 'eval', step, loss.detach().float(), eval_loss, batch["input_ids"], batch["labels"])

# #     model.save_pretrained(f"trained_model-{epoch}")

In [None]:
model

In [None]:
# import torch
# device_count = torch.cuda.device_count()
# if device_count > 0:
# #     logger.debug("Select GPU device")
#     device = torch.device("cuda")
# else:
# #     logger.debug("Select CPU device")
#     device = torch.device("cpu")

# transformers Trainer API

In [None]:
from transformers import Trainer, TrainingArguments
# DataCollatorForLanguageModeling

In [None]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=1,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=30,
  output_dir='out',
  # Batch size for training
  per_device_train_batch_size=1,
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, 
  logging_steps=1,
  gradient_accumulation_steps = 4,
  ddp_find_unused_parameters=False,
)

In [None]:
# Fine-tune the LoRA-wrapped model with the Trainer API, using the arguments defined above.
# NOTE(review): training uses the full `tokenized_dataset`, not the `train_dataset`
# split created earlier, and no eval_dataset is passed — confirm this is intended.
trainer = Trainer(
    model=peft_model,
    train_dataset=tokenized_dataset,
#     eval_dataset=test_dataset,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer,mlm=False)
    
)

In [None]:
result = trainer.train()

In [None]:

peft_model.save_pretrained("./llama_7b_peft", save_adapter=True, save_config=True)

# model_to_merge = PeftModel.from_pretrained(AutoModelForCausalLM.from_pretrained(base_model).to("cuda"), "./llama_7b_peft")

# merged_model = model_to_merge.merge_and_unload()
# merged_model.save_pretrained(merged_model)

In [None]:
# Reload the base checkpoint (half precision, automatic device placement with
# disk offload) so the saved adapter can be attached to it.
# `model_name` is never defined in this notebook; the checkpoint id is `model_id`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="offload",
)


In [None]:
model = PeftModel.from_pretrained(
    model, 
    "./llama_7b_peft", 
    torch_dtype=torch.float16,
    device_map="auto",
    offload_folder="offload", 
)

In [None]:
model = model.merge_and_unload()

In [None]:
model.save_pretrained("./finetuned_llama2-7b")

In [None]:
######  TEST THIS ##########
# transformers.Trainer has no generate() method, so `trainer.generate()` raised
# AttributeError. Generate from the merged model instead.
# NOTE(review): the sample prompt and the token budget are placeholders — adjust as needed.
sample_prompt = prompt_dataset[0]["preference_prompt"]
sample_inputs = tokenizer(sample_prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    sample_output = model.generate(**sample_inputs, max_new_tokens=8)
# Decode only the tokens produced after the prompt.
prompt_length = sample_inputs["input_ids"].shape[1]
print(tokenizer.decode(sample_output[0][prompt_length:], skip_special_tokens=True))

In [None]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    """Generate a continuation of ``text`` and return only the newly generated part.

    Assumes a causal (decoder-only) model whose ``generate`` output starts with
    the prompt tokens, as loaded elsewhere in this notebook.

    Args:
        text: Prompt string.
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer exposing ``encode`` and ``batch_decode``.
        max_input_tokens: The prompt is truncated to at most this many tokens.
        max_output_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        The decoded generated text, without the prompt and without special tokens.
    """
    # Tokenize
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )

    # Generate. `max_new_tokens` bounds only the continuation; the previous
    # `max_length` also counted the prompt, so prompts longer than
    # max_output_tokens left no room for any output.
    input_ids = input_ids.to(model.device)
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_output_tokens,
    )

    # Strip the prompt at the token level. Slicing the decoded string by len(text)
    # is wrong whenever decoding does not reproduce the prompt character for
    # character (truncation, special tokens, whitespace normalisation).
    prompt_length = input_ids.shape[1]
    generated_answer_tokens = generated_tokens_with_prompt[:, prompt_length:]

    # Decode
    generated_text_answer = tokenizer.batch_decode(generated_answer_tokens, skip_special_tokens=True)[0]

    return generated_text_answer