### Install packages

In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
%pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
%pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes triton xformers

### Initialize model and tokenizer

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings. These three names are reused by later cells (trainer setup
# and the reload-for-inference cell), so they stay at notebook scope.
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None           # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True    # Use 4bit quantization to reduce memory usage. Can be False.

# Checkpoint that gets fine-tuned below.
BASE_MODEL_NAME = "unsloth/Phi-3-mini-4k-instruct"

# Load the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

### Add LoRA adapters

In [27]:
# Names of the submodules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # LoRA rank; any number > 0 (suggested 8, 16, 32, 64, 128)
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,                        # any value is supported, 0 is the optimized path
    bias="none",                           # any value is supported, "none" is the optimized path
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,                      # rank-stabilized LoRA is available but not used here
    loftq_config=None,                     # LoftQ is available but not used here
)

### Data Prep

In [None]:
# Prompt template with three positional `{}` slots, filled in order with the
# instruction, the input, and the response. Training fills all three; the
# inference cells pass an empty string for the response slot.
my_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence string taken from the loaded tokenizer; appended to every
# training example by formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of examples into full training prompts.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        A dict with a single "text" key holding one rendered prompt per
        example, each terminated with EOS_TOKEN.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must be appended, otherwise generation will go on forever.
    rendered = [
        my_prompt.format(instruction_text, input_text, output_text) + EOS_TOKEN
        for instruction_text, input_text, output_text in rows
    ]
    return {"text": rendered}

from datasets import load_dataset
# Fetch the training split, then add the rendered "text" column in batches.
raw_dataset = load_dataset("trucnhi160703/dspp-phase-01", split="train")
dataset = raw_dataset.map(formatting_prompts_func, batched=True)

In [None]:
# Display the dataset's repr as the cell output.
dataset

In [None]:
# Print the first rendered training prompt. Index the row first, then the
# column, so only one row is read instead of the whole "text" column.
print(dataset[0]['text'])

In [None]:
# Print the rendered training prompt at index 44. Index the row first, then the
# column, so only one row is read instead of the whole "text" column.
print(dataset[44]['text'])

<a name="Train"></a>
### Train the model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Pick the mixed-precision mode once: bf16 when the GPU supports it, fp16 otherwise.
bf16_supported = torch.cuda.is_bf16_supported()

# Optimisation settings for the run (60 optimizer steps, 5 warmup steps).
training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    max_steps=60,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_8bit",
    fp16=not bf16_supported,
    bf16=bf16_supported,
    logging_steps=1,
    seed=3407,
)

# Supervised fine-tuning trainer over the rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

In [None]:
# Snapshot the GPU's capacity and the memory reserved before training starts.
# `start_gpu_memory` and `max_memory` are read again by the post-training report.
gpu_stats = torch.cuda.get_device_properties(0)
reserved_bytes_before = torch.cuda.max_memory_reserved()
start_gpu_memory = round(reserved_bytes_before / 1024 / 1024 / 1024, 3)  # bytes -> GB
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)       # bytes -> GB
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

In [None]:
# Run fine-tuning; the returned object's `.metrics` are read by the cells below.
trainer_stats = trainer.train()

In [None]:
# Display the metrics recorded by the training run (includes 'train_runtime').
trainer_stats.metrics

In [None]:
# Report training time and peak reserved GPU memory, both in absolute terms
# and relative to the pre-training baseline / total GPU memory.
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 / 1024 / 1024, 3)  # bytes -> GB
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)   # growth since the baseline snapshot
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

report_lines = (
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
)
print(*report_lines, sep="\n")

### Inference

In [None]:
# my_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Labelling instruction for the model. Built from adjacent string literals so
# that every literal stays on a single source line.
instruction_text = (
    "Based on the content of the Data Safety and Privacy Policy. "
    "Please label according to the following rule: "
    "Label 1: Incorrect if the content is mentioned in Data Safety but not in the Privacy Policy. "
    "Correct if the content is mentioned in both documents. "
    "Label 2: Incomplete if the content is mentioned in Data Safety but not in the Privacy Policy. "
    "Complete if the content is mentioned in both documents. "
    "The contents of the two documents are injected in the following section. "
    "Note that don't explain and only give me a Json response in this format: "
    "{label 1: Incorrect or Correct, label 2: Incomplete or Complete}"
)

# Example input: a Data Safety declaration followed by the Privacy Policy summary.
# The line break after "4. " is written as an explicit "\n" escape; a raw line
# break inside a double-quoted literal is an unterminated string.
input_text = (
    "Data Safety: {'data_shared': [{'category': 'Device or other IDs', 'sub_info': [{'data_type': 'Device or other IDs', 'purpose': 'Advertising or marketing', 'optional': False}]}], "
    "'data_collected': [], "
    "'security_practices': [{'category': 'Data is encrypted in transit', 'sub_info': []}, {'category': 'You can request that data be deleted', 'sub_info': []}]}. "
    "Privacy Policy: Data Share: The Privacy Policy of the Dua e Masura Urdu app notes that user information may be shared under certain circumstances: "
    "1. The personal information collected is used to enhance the service provided, with the assurance that it will not be utilized or disseminated outside the stipulations stated in the Privacy Policy. "
    "2. Third-party services embedded in the app may gather information that could be used to identify the user. "
    "3. Third-party companies and individuals may be used to facilitate the Service, provide the Service, perform related services, or assist in analyzing the effectiveness of the Service. "
    "This might grant them access to users' personal information. "
    "However, they are contractually required to maintain confidentiality and not use it for any unrelated purpose. "
    "Data Collect: The Privacy Policy of the Dua e Masura Urdu app notes several circumstances in which user data may be collected: "
    "1. To improve the user experience, users may be asked to provide personally identifiable information such as Internet Access. "
    "Such requested information will remain stored on the user's device, and is not gathered in any other way. "
    "2. In situations where the app encounters an error, data (referred to as Log Data) is collected via third-party applications. "
    "This may include information like the device’s Internet protocol (“IP”) address, device name, OS version, the app's configuration during the service use, the use time and date, and other statistical data. "
    "3. In some cases, third-party code and libraries that use cookies might collect information to improve their services. "
    "4. \n"
    "The app does not intentionally collect personally identifiable information from children under 13. "
    "If such information is unknowingly collected, it is promptly deleted upon discovery."
)

inputs = tokenizer(
[
    my_prompt.format(
        instruction_text, # instruction
        input_text, # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
print(tokenizer.batch_decode(outputs)[0])

You can also use a `TextStreamer` for continuous inference, so you can watch the generation token by token instead of waiting for the whole output.

In [None]:
# my_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Labelling instruction for the model. Built from adjacent string literals so
# that every literal stays on a single source line.
instruction_text = (
    "Based on the content of the Data Safety and Privacy Policy. "
    "Please label according to the following rule: "
    "Label 1: Incorrect if the content is mentioned in Data Safety but not in the Privacy Policy. "
    "Correct if the content is mentioned in both documents. "
    "Label 2: Incomplete if the content is mentioned in Data Safety but not in the Privacy Policy. "
    "Complete if the content is mentioned in both documents. "
    "The contents of the two documents are injected in the following section. "
    "Note that don't explain and only give me a Json response in this format: "
    "{label 1: Incorrect or Correct, label 2: Incomplete or Complete}"
)

# Example input: a Data Safety declaration followed by the Privacy Policy summary.
# The line break after "4. " is written as an explicit "\n" escape; a raw line
# break inside a double-quoted literal is an unterminated string.
input_text = (
    "Data Safety: {'data_shared': [{'category': 'Device or other IDs', 'sub_info': [{'data_type': 'Device or other IDs', 'purpose': 'Advertising or marketing', 'optional': False}]}], "
    "'data_collected': [], "
    "'security_practices': [{'category': 'Data is encrypted in transit', 'sub_info': []}, {'category': 'You can request that data be deleted', 'sub_info': []}]}. "
    "Privacy Policy: Data Share: The Privacy Policy of the Dua e Masura Urdu app notes that user information may be shared under certain circumstances: "
    "1. The personal information collected is used to enhance the service provided, with the assurance that it will not be utilized or disseminated outside the stipulations stated in the Privacy Policy. "
    "2. Third-party services embedded in the app may gather information that could be used to identify the user. "
    "3. Third-party companies and individuals may be used to facilitate the Service, provide the Service, perform related services, or assist in analyzing the effectiveness of the Service. "
    "This might grant them access to users' personal information. "
    "However, they are contractually required to maintain confidentiality and not use it for any unrelated purpose. "
    "Data Collect: The Privacy Policy of the Dua e Masura Urdu app notes several circumstances in which user data may be collected: "
    "1. To improve the user experience, users may be asked to provide personally identifiable information such as Internet Access. "
    "Such requested information will remain stored on the user's device, and is not gathered in any other way. "
    "2. In situations where the app encounters an error, data (referred to as Log Data) is collected via third-party applications. "
    "This may include information like the device’s Internet protocol (“IP”) address, device name, OS version, the app's configuration during the service use, the use time and date, and other statistical data. "
    "3. In some cases, third-party code and libraries that use cookies might collect information to improve their services. "
    "4. \n"
    "The app does not intentionally collect personally identifiable information from children under 13. "
    "If such information is unknowingly collected, it is promptly deleted upon discovery."
)

inputs = tokenizer(
[
    my_prompt.format(
        instruction_text, # instruction
        input_text, # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

# Stream decoded tokens to stdout as they are generated.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

### Saving, loading finetuned models

In [None]:
# Save the fine-tuned adapters and tokenizer locally, then publish both to the Hub.
import os

LOCAL_ADAPTER_DIR = "final-ft-phi-3-4k"
HUB_REPO_ID = "trucnhi160703/final-ft-phi-3-4k"

# Never hardcode an access token in a notebook: read it from the environment.
# NOTE(review): with token=None the Hub client presumably falls back to the
# credentials stored by `huggingface-cli login` - confirm for your setup.
HF_TOKEN = os.environ.get("HF_TOKEN")

model.save_pretrained(LOCAL_ADAPTER_DIR) # Local saving
tokenizer.save_pretrained(LOCAL_ADAPTER_DIR)
model.push_to_hub(HUB_REPO_ID, token = HF_TOKEN) # Online saving
tokenizer.push_to_hub(HUB_REPO_ID, token = HF_TOKEN) # Online saving

To load the LoRA adapters we just saved for inference, run the cell below; it is enabled by `if True:` (change it to `if False:` to keep using the model already in memory):

In [None]:
# Reload the locally saved adapters and switch the model to inference mode.
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "final-ft-phi-3-4k", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# my_prompt = You MUST copy from above!

# Labelling instruction for the model. Built from adjacent string literals so
# that every literal stays on a single source line.
instruction_text = (
    "Based on the content of the Data Safety and Privacy Policy. "
    "Please label according to the following rule: "
    "Label 1: Incorrect if the content is mentioned in Data Safety but not in the Privacy Policy. "
    "Correct if the content is mentioned in both documents. "
    "Label 2: Incomplete if the content is mentioned in Data Safety but not in the Privacy Policy. "
    "Complete if the content is mentioned in both documents. "
    "The contents of the two documents are injected in the following section. "
    "Note that don't explain and only give me a Json response in this format: "
    "{label 1: Incorrect or Correct, label 2: Incomplete or Complete}"
)

# Example input: a Data Safety declaration followed by the Privacy Policy summary.
# The line break after "4. " is written as an explicit "\n" escape; a raw line
# break inside a double-quoted literal is an unterminated string.
input_text = (
    "Data Safety: {'data_shared': [{'category': 'Device or other IDs', 'sub_info': [{'data_type': 'Device or other IDs', 'purpose': 'Advertising or marketing', 'optional': False}]}], "
    "'data_collected': [], "
    "'security_practices': [{'category': 'Data is encrypted in transit', 'sub_info': []}, {'category': 'You can request that data be deleted', 'sub_info': []}]}. "
    "Privacy Policy: Data Share: The Privacy Policy of the Dua e Masura Urdu app notes that user information may be shared under certain circumstances: "
    "1. The personal information collected is used to enhance the service provided, with the assurance that it will not be utilized or disseminated outside the stipulations stated in the Privacy Policy. "
    "2. Third-party services embedded in the app may gather information that could be used to identify the user. "
    "3. Third-party companies and individuals may be used to facilitate the Service, provide the Service, perform related services, or assist in analyzing the effectiveness of the Service. "
    "This might grant them access to users' personal information. "
    "However, they are contractually required to maintain confidentiality and not use it for any unrelated purpose. "
    "Data Collect: The Privacy Policy of the Dua e Masura Urdu app notes several circumstances in which user data may be collected: "
    "1. To improve the user experience, users may be asked to provide personally identifiable information such as Internet Access. "
    "Such requested information will remain stored on the user's device, and is not gathered in any other way. "
    "2. In situations where the app encounters an error, data (referred to as Log Data) is collected via third-party applications. "
    "This may include information like the device’s Internet protocol (“IP”) address, device name, OS version, the app's configuration during the service use, the use time and date, and other statistical data. "
    "3. In some cases, third-party code and libraries that use cookies might collect information to improve their services. "
    "4. \n"
    "The app does not intentionally collect personally identifiable information from children under 13. "
    "If such information is unknowingly collected, it is promptly deleted upon discovery."
)

inputs = tokenizer(
[
    my_prompt.format(
        instruction_text, # instruction
        input_text, # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)


In [None]:
# Decode the generated token ids back to text and print the first (only) sequence.
print(tokenizer.batch_decode(outputs)[0])