<a href="https://colab.research.google.com/github/louiezzang/next-gpt/blob/main/examples/huggingface_sft_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Supervised Fine-tuning with Hugging Face


Build a supervised fine-tuned (SFT) model that answers questions well.

- SFT (Supervised Fine-Tuning)
- Fine-tune a pretrained LLM on a specific domain or corpus of instructions and human demonstrations

- Dataset example
```json
[
    {
        "prompt": "",
        "completion": ""        
    }, ...
]
```

# Environment setup

#### Installation (python>=3.8)

In [None]:
# Install the next-gpt library from source.
# Remove any previous checkout first so the clone below starts from a clean directory.
!rm -rf ./next-gpt/
!git clone https://github.com/louiezzang/next-gpt.git
# Install the package from inside the cloned repository, then step back out
# so the rest of the notebook runs from the original working directory.
%cd next-gpt/
!pip install .
%cd ../

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import json
import yaml
import argparse

import numpy as np
import pandas as pd

import torch
from datasets import load_dataset
import transformers

from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForCausalLM, pipeline, 
    TrainingArguments, AutoModelWithLMHead,
    ProgressCallback
)
from nextgpt.dataset import (
    SupervisedDataset, DataCollatorForSupervisedDataset
)
from nextgpt.finetuning import (
    SupervisedTrainer, LoggingCallback
)

In [None]:
# Hyperparameters for this notebook, declared through argparse.
# Each entry is (flag, keyword options passed to add_argument).
parser = argparse.ArgumentParser()
_argument_specs = (
    ("--model", {"type": str, "default": "gpt2", "choices": ["gpt2", "bloom", "opt"]}),
    ("--max_epochs", {"type": int, "default": 1}),
    ("--train_batch_size", {"type": int, "default": 4}),
    ("--output_dir", {"type": str, "default": "./output_1_sft"}),
)
for _flag, _options in _argument_specs:
    parser.add_argument(_flag, **_options)

# Parse an explicit empty argument list: every option takes its default and
# the command line of the running process is never consulted.
args = parser.parse_args(args=[])
print(args)

In [None]:
# Load the tokenizer that belongs to the selected pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(args.model)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer)

In [None]:
# Load the first 20% of the "train" split of the openai/webgpt_comparisons dataset.
dataset_webgpt_comp = load_dataset("openai/webgpt_comparisons", split="train[:20%]")

In [None]:
# Convert every comparison row into an instruction/completion record:
# the question's full text becomes the instruction and the row's
# "answer_0" field becomes the completion.
data_list = [
    {
        "instruction": record["question"]["full_text"],
        "completion": record["answer_0"],
    }
    for record in dataset_webgpt_comp
]

In [None]:
# Prompt skeleton rendered once per example. "{instruction}" is the only
# placeholder, and the text stops right after "### Response:".
# NOTE(review): the preamble mentions "an input that provides further
# context", yet the template has no input placeholder — confirm the wording
# is intentional before changing it.
_PROMPT_PREAMBLE = (
    "Below is an instruction that describes a task, paired with an input "
    "that provides further context. Write a response that appropriately "
    "completes the request."
)
PROMPT_TEMPLATE = "\n\n".join([
    _PROMPT_PREAMBLE,
    "### Instruction:\n{instruction}",
    "### Response:",
])

In [None]:
# Wrap the instruction/completion records in the library's SupervisedDataset.
# Prompts are built from PROMPT_TEMPLATE and the target text is read from the
# "completion" field (presumably tokenized here — see nextgpt.dataset).
dataset = SupervisedDataset(
    dataset=data_list,
    tokenizer=tokenizer,
    prompt_template=PROMPT_TEMPLATE,
    completion_field="completion",
    verbose=True)

# Split train and val dataset (80% / 20%).
# The split uses its own seeded generator so the same examples land in each
# subset on every run, independent of the global RNG state. Without it the
# train/val membership changes between runs and results are not comparable.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, eval_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Data collator handed to the trainer (presumably pads and batches examples
# using the tokenizer — see nextgpt.dataset).
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

In [None]:
# Load the pretrained causal language model for the selected checkpoint.
model = AutoModelForCausalLM.from_pretrained(args.model)

# Size the token embeddings to the tokenizer's vocabulary length.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

In [None]:
# Train arguments, gathered in one mapping so each setting sits next to its note.
# NOTE(review): the trainer below receives eval_dataset=None, so the
# evaluation-related settings (per_device_eval_batch_size, eval_steps)
# presumably have no effect in this run — confirm before relying on them.
sft_training_options = {
    "output_dir": "./checkpoint_1_sft",  # the output directory
    "overwrite_output_dir": True,  # overwrite the content of the output directory
    "num_train_epochs": args.max_epochs,  # number of training epochs
    "per_device_train_batch_size": args.train_batch_size,  # batch size for training
    "per_device_eval_batch_size": 4,  # batch size for evaluation
    "eval_steps": 3,  # number of update steps between two evaluations
    "save_steps": 100,  # a checkpoint is saved every this many steps
    "warmup_steps": 5,  # number of warmup steps for the learning rate scheduler
    "prediction_loss_only": True,
}
training_args = TrainingArguments(**sft_training_options)

# Build the trainer. No evaluation set is supplied.
# Optional extra reporting: callbacks=[ProgressCallback, LoggingCallback(logger=None)]
trainer = SupervisedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,
    data_collator=data_collator,
)

# Run fine-tuning, then persist the trainer state and write the model to
# args.output_dir, where the inference cell loads it from.
trainer.train()
trainer.save_state()
trainer.safe_save_model(output_dir=args.output_dir)

In [None]:
# Inference test.
# Build a text-generation pipeline from the model saved in args.output_dir,
# reusing the tokenizer object created earlier in the notebook.
generator = pipeline("text-generation", model=args.output_dir, tokenizer=tokenizer)

# Keyword arguments unpacked into every call of the generation pipeline.
generation_args = {
    # Beam search and repetition control.
    "num_beams": 4,
    "repetition_penalty": 2.0,
    "no_repeat_ngram_size": 4,
    # Length budget for the generated continuation.
    "max_new_tokens": 64,
    # Sampling settings.
    "do_sample": True,
    "top_k": 30,
    "top_p": 0.95,
    "temperature": 1.9,
    "early_stopping": True,
}

In [None]:
# Use the last five records as a quick smoke test.
# NOTE(review): these records come from data_list, which also feeds the
# training split, so they are not guaranteed to be held-out examples.
test_list = data_list[-5:]

# Render each record into its prompt and keep the reference completion alongside.
test_prompt_list = [PROMPT_TEMPLATE.format_map(example) for example in test_list]
actual_completion_list = [example["completion"] for example in test_list]

# Generate for all prompts in one call, then print the first candidate
# returned for each prompt followed by the reference answer.
result_list = generator(test_prompt_list, **generation_args)
for candidates, reference in zip(result_list, actual_completion_list):
    generated_text = candidates[0]["generated_text"]
    print()
    print("-" * 70)
    print(f"completion: {generated_text}")
    print(f"\n### Actual answer:\n{reference}")