# Finetuning on a New Dataset - Paris, Texas (finetune on new dataset) Versus Paris, France (pre-trained model)

### Data Handling and Visualization

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

### LLM model training

In [None]:
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer, DataCollatorForLanguageModeling
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
from datasets import Dataset
from unsloth import is_bfloat16_supported

# Saving model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Warnings
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

### Study response of pre-trained model

#### Loading the model

We are going to use Llama 3.2 with only 1 billion parameters.

(You can use the 3, 11 or 90 billion version as well.)

- Max Sequence Length:
    We used max_seq_length 5020.

- Loading Llama 3.2 Model:

    - The model and tokenizer are loaded using `FastLanguageModel.from_pretrained` with a specific pre-trained model, "unsloth/Llama-3.2-1B-bnb-4bit".
    - This is optimized for 4-bit precision, which reduces memory usage and increases training speed without significantly compromising performance.  
    - load_in_4bit=True 

- Applying PEFT (Parameter-Efficient Fine-Tuning):

    - We then configure the model using get_peft_model, which applies LoRA (Low-Rank Adaptation) techniques.
    - This approach focuses on fine-tuning only specific layers or parts of the model, rather than the entire network.
    - This drastically reduces the computational resources needed.

- Parameters:

    - r=16
    - lora_alpha=16
    - target_modules: the attention projections (q_proj, k_proj, v_proj, o_proj) and the MLP projections (gate_proj, up_proj, down_proj)
    - use_rslora=True (activates Rank-Stabilized LoRA)
    - use_gradient_checkpointing="unsloth" (memory usage optimized during training)

- Verifying Trainable Parameters:
    We used `model.print_trainable_parameters()`.

In [None]:
# Maximum sequence length (in tokens); reused later by the trainer.
max_seq_length = 5020

# Load the 4-bit quantized Llama 3.2 1B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-bnb-4bit",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
    trust_remote_code=True,
)

# Wrap the base model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
    random_state = 32,
    loftq_config = None,
)

# print_trainable_parameters() writes its summary to stdout itself and returns
# None, so wrapping it in print() only added a stray "None" line to the output.
model.print_trainable_parameters()

#### Executing an example prompt

In [None]:
# Prompt template with two positional placeholders: the first `{}` receives the
# input text (the question), the second `{}` receives the response (left empty
# at inference time so the model completes it).
data_prompt = """Analyze the text based on what are the top 5 tourist attractions in Paris. Sort results based on popularity.

### Input:
{}

### Response:
{}"""

In [None]:
# Question sent to the pre-trained model. It has to be defined in this cell:
# on a fresh kernel no earlier cell assigns `text`, so formatting the prompt
# below raised a NameError. The wording is the same as the question used in the
# Inference section, so both answers can be compared directly.
text = "What are the top 5 attractions of Paris? Sort by popularity."

# Switch the model to Unsloth's inference mode before generating.
model = FastLanguageModel.for_inference(model)

inputs = tokenizer(
[
    data_prompt.format(
        #instructions
        text,
        #answer
        "",
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 5020, use_cache = True)
answer=tokenizer.batch_decode(outputs)
# Keep only what follows the last "### Response:" marker, i.e. the generated answer.
answer = answer[0].split("### Response:")[-1]
print("Answer of the question is:", answer)

### Calling the dataset for finetuning

NOTE: REPLACE DATASET BELOW WITH DATASET ON PARIS, TEXAS!!!

In [None]:
# import sys, pathlib, pymupdf
# fname = sys.argv[1]  # get document filename
# with pymupdf.open(fname) as doc:  # open document
#     text = chr(12).join([page.get_text() for page in doc])
# # write as a binary file to support non-ASCII characters
# pathlib.Path(fname + ".txt").write_bytes(text.encode())

In [None]:
import glob
import os

def get_filenames_with_glob(directory):
    """
    Gets the regular files directly inside a directory using the glob module.

    Args:
        directory: The path to the directory.

    Returns:
        A sorted list of file paths (each path includes the directory prefix).
        Sub-directories are skipped, and names starting with a dot are not
        matched by the '*' pattern.
    """
    # Ensure the directory path ends with a separator
    if not directory.endswith(os.path.sep):
        directory += os.path.sep

    # Escape glob metacharacters ('[', ']', '*', '?') in the directory part so a
    # path such as "docs[v1]" is taken literally instead of being interpreted as
    # a character class that silently matches nothing.
    pattern = glob.escape(directory) + '*'

    # glob returns entries in arbitrary order; sort so that anything built from
    # this list (e.g. concatenated text) is reproducible across runs.
    return sorted(f for f in glob.glob(pattern) if os.path.isfile(f))

In [None]:
import sys, pathlib, pymupdf

# Folder that holds the source documents.
absolute_path = "/myapp/local/text_files"
directory_path = absolute_path  # Replace with the actual path
filenames = get_filenames_with_glob(directory_path)

print("Filenames in directory:")
page_separator = chr(12)  # form feed between the pages of one document
# Start with an empty string so the space-joined result begins with a single
# space before the first document's text.
collected = [""]
for fname in filenames:
    print(fname)
    with pymupdf.open(fname) as doc:  # open document
        text = page_separator.join([page.get_text() for page in doc])
    collected.append(text)

# Documents are separated from each other by a single space.
all_text = " ".join(collected)

# write as a binary file to support non-ASCII characters
file_name = "/myapp/local/paris_texas_sites"
output_path = pathlib.Path(file_name + ".txt")
output_path.write_bytes(all_text.encode())

In [None]:
from datasets import load_dataset

# Read the combined text file written above through the "text" dataset builder
# and keep only its "train" split.
file_path = "/myapp/local/paris_texas_sites.txt"
data_files = {"train": [file_path]}
train_dataset = load_dataset("text", data_files=data_files, split="train")

In [None]:
# Sanity check: dataset shape, then the first and last ten entries of "text".
print(train_dataset.shape)
text_column = train_dataset['text']
print(text_column[:10])
print(text_column[-10:])
# type(train_dataset)

### Prepare dataset for finetuning in form of question-answer pairs

**Instructions:** 

Fill in the missing code to convert the generated text file into a pandas DataFrame named `filtered_data` (the name the training cells below expect), from which question-answer pairs can be extracted:

- Column **'Context'** (e.g. "What is a tourist attraction in Paris? How large was the number of visitors to this site last year?")
- Column **'Response'** (e.g. "The Eiffel Tower is a main attraction in Paris. About 1.5 million visitors visited it last year (2024).")

...

### Model training

#### Loading the model

We are going to use Llama 3.2 with only 1 billion parameters.

(You can use the 3, 11 or 90 billion version as well.)

- Max Sequence Length:
    We used max_seq_length 5020.

- Loading Llama 3.2 Model:

    - The model and tokenizer are loaded using `FastLanguageModel.from_pretrained` with a specific pre-trained model, "unsloth/Llama-3.2-1B-bnb-4bit".
    - This is optimized for 4-bit precision, which reduces memory usage and increases training speed without significantly compromising performance.  
    - load_in_4bit=True 

- Applying PEFT (Parameter-Efficient Fine-Tuning):

    - We then configure the model using get_peft_model, which applies LoRA (Low-Rank Adaptation) techniques.
    - This approach focuses on fine-tuning only specific layers or parts of the model, rather than the entire network.
    - This drastically reduces the computational resources needed.

- Parameters:

    - r=16
    - lora_alpha=16
    - target_modules: the attention projections (q_proj, k_proj, v_proj, o_proj) and the MLP projections (gate_proj, up_proj, down_proj)
    - use_rslora=True (activates Rank-Stabilized LoRA)
    - use_gradient_checkpointing="unsloth" (memory usage optimized during training)

- Verifying Trainable Parameters:
    We used `model.print_trainable_parameters()`.

In [None]:
# max_seq_length = 5020
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name="unsloth/Llama-3.2-1B-bnb-4bit",
#     max_seq_length=max_seq_length,
#     load_in_4bit=True,
#     dtype=None,
#     trust_remote_code=True,
# )

# model = FastLanguageModel.get_peft_model(
#     model,
#     r=16,
#     lora_alpha=16,
#     lora_dropout=0,
#     target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
#     use_rslora=True,
#     use_gradient_checkpointing="unsloth",
#     random_state = 32,
#     loftq_config = None,
# )
# print(model.print_trainable_parameters())

#### Prepare data for model feed

Main points to remember:

- Data Prompt Structure:
The data_prompt is a formatted string template designed to guide the model in analyzing the provided text. It includes placeholders for the input text (the context) and the model's response. This template specifically prompts the model to list the top 5 tourist attractions in Paris sorted by popularity, so the fine-tuning examples and the inference prompt share the same format.

- End-of-Sequence Token:
The EOS_TOKEN is retrieved from the tokenizer to signify the end of each text sequence. This token is essential for the model to recognize when a prompt has ended, helping to maintain the structure of the data during training or inference.

- Formatting Function:
The formatting_prompt function takes a batch of examples and formats them according to the data_prompt. It iterates over the input and output pairs, inserting them into the template and appending the EOS token at the end. The function then returns a dictionary containing the formatted text, ready for model training or evaluation.

- Function Output:
The function outputs a dictionary where the key is "text" and the value is a list of formatted strings. Each string represents a fully prepared prompt for the model, combining the context, response and the structured prompt template.

In [None]:
# Prompt template: the first `{}` takes the input text, the second the response.
data_prompt = """Analyze the text based on what are the top 5 tourist attractions in Paris. Sort results based on popularity.

### Input:
{}

### Response:
{}"""

# End-of-sequence token, appended to every formatted training example.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompt(examples):
    """Render a batch of Context/Response pairs into training prompts.

    Args:
        examples: Batch mapping with a "Context" sequence and a "Response"
            sequence; entries are paired up by position.

    Returns:
        A dict with the single key "text" holding one formatted prompt per
        pair, each terminated by EOS_TOKEN.
    """
    contexts = examples["Context"]
    responses = examples["Response"]
    return {
        "text": [
            data_prompt.format(context, response) + EOS_TOKEN
            for context, response in zip(contexts, responses)
        ],
    }


#### Format the data for training

In [None]:
# Convert the question-answer DataFrame into a Hugging Face Dataset and add the
# formatted "text" column in one batched pass.
# NOTE(review): `filtered_data` is not defined in the visible cells — presumably
# it is produced by the fill-in exercise above, with 'Context' and 'Response'
# columns; confirm before running.
training_data = Dataset.from_pandas(filtered_data).map(formatting_prompt, batched=True)

#### Training setup to start fine tuning

- Trainer Initialization:
We are going to initialize SFTTrainer with the model and tokenizer, as well as the training dataset. 

- Training Arguments:
The TrainingArguments class is used to define key hyperparameters for the training process:

    - learning_rate=3e-4: Sets the learning rate for the optimizer.
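    - per_device_train_batch_size=8 with gradient_accumulation_steps=8: Defines the batch size per device and the number of batches accumulated before each optimizer step (8 × 8 = 64 examples per device per update).
    - num_train_epochs=40: Specifies the number of training epochs.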
    - fp16=not is_bfloat16_supported() and bf16=is_bfloat16_supported(): Enable mixed precision training to reduce memory usage, depending on hardware support.
    - optim="adamw_8bit": Uses the 8-bit AdamW optimizer for efficient memory usage.
    - weight_decay=0.01: Applies weight decay to prevent overfitting.
    - output_dir="output": Specifies the directory where the trained model and logs will be saved.

- Training Process:

    - Finally, we call the trainer.train() method to start the training process.
    - It uses the parameters defined above to fine-tune the model, adjusting weights and learning from the provided dataset.
    - The trainer also handles data packing and gradient accumulation, optimizing the training pipeline for better performance.

In [None]:
# The model was switched to inference mode with FastLanguageModel.for_inference()
# when the pre-trained answer was generated earlier in this notebook, and it is
# not reloaded afterwards. Put it back into training mode before fine-tuning.
FastLanguageModel.for_training(model)

trainer=SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=training_data,
    dataset_text_field="text",  # column produced by formatting_prompt
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=True,
    args=TrainingArguments(
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=8,  # 8 x 8 = 64 examples per device per optimizer step
        num_train_epochs=40,
        # Mixed precision: bf16 where the hardware supports it, fp16 otherwise.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=20,
        output_dir="output",
        seed=0,
    ),
)

trainer.train()

### Inference

In [None]:
# Question inserted into the first placeholder of `data_prompt` by the
# inference cell below.
text = "What are the top 5 attractions of Paris? Sort by popularity."

In [None]:
# Put the fine-tuned model into Unsloth's inference mode.
modelFinetuned = FastLanguageModel.for_inference(model)

# Fill the template with the question and an empty response slot for the model
# to complete, then tokenize it as a batch of one on the GPU.
prompt = data_prompt.format(text, "")
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = modelFinetuned.generate(**inputs, max_new_tokens = 5020, use_cache = True)
decoded = tokenizer.batch_decode(outputs)
# Everything after the last "### Response:" marker is the generated answer
# (the whole decoded string if the marker is absent).
answer = decoded[0].rpartition("### Response:")[2]
print("Answer of the question is:", answer)

### Expected response:

...

### Push a fine-tuned model and its tokenizer to the Hugging Face Hub

**Note:**
- Create a **.env** file in the local/ folder of your working directory in the Docker environment (/myapp/local).
- Copy the line **HF_TOKEN=\<your Hugging Face API token\>** into your .env file, with your Hugging Face API token inserted as the value.
- Run the cell below to load **HF_TOKEN** as an environment variable.

In [None]:
# import os
# from dotenv import load_dotenv
# load_dotenv()

In [None]:
# os.environ["HF_TOKEN"] = "hugging face token key, you can create from your HF account."
# model.push_to_hub("ImranzamanML/1B_finetuned_llama3.2", use_auth_token=os.getenv("HF_TOKEN"))
# tokenizer.push_to_hub("ImranzamanML/1B_finetuned_llama3.2", use_auth_token=os.getenv("HF_TOKEN"))

### Save fine-tuned model and its tokenizer locally on the machine.

In [None]:
# Model and tokenizer are written to the same directory.
save_dir = "model/Paris_Texas_1B_finetuned_llama3.2"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)