### Note: If an `ImportError` occurs, it is most likely caused by an incompatible `huggingface-hub` version. Pinning the version usually resolves it:
> pip install huggingface-hub==0.25.0


### Reference: https://medium.com/@hakeemsyd/how-to-fine-tune-your-llama-3-2-model-49a6f8c7621a

## Import

In [1]:
import torch
from transformers import pipeline
from transformers import AutoTokenizer, TextStreamer, pipeline, BitsAndBytesConfig, AutoModelForCausalLM,HfArgumentParser
from transformers import Trainer, TrainingArguments, TextStreamer, logging

from peft import LoraConfig,PeftModel,prepare_model_for_kbit_training,get_peft_model
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from trl import SFTTrainer, setup_chat_format
from datasets import Dataset, load_dataset
from huggingface_hub import login

import os
import re, json

# Restrict the visible GPUs *before* the first CUDA query below. CUDA reads
# CUDA_VISIBLE_DEVICES once, when it is initialised, and ignores later changes,
# so assigning it after `torch.cuda.is_available()` has no effect.
# The next cell assigns the same value again; keep the two in sync.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

# Use the first visible GPU when CUDA is usable, otherwise fall back to the CPU.
# (Set DEVICE = "cpu" by hand to force CPU execution.)
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Hugging Face access token; None when the variable is not exported.
# In a terminal: export HUGGING_FACE_TOKEN="YOUR_TOKEN"
HUGGING_FACE_TOKEN = os.environ.get('HUGGING_FACE_TOKEN')

# Turn off Weights & Biases logging.
os.environ["WANDB_DISABLED"] = "True"


In [2]:
# NOTE(review): CUDA only honours this variable if it is set before CUDA is first
# initialised. `torch.cuda.is_available()` has already been called in the import
# cell, so this assignment may be ignored — confirm with `torch.cuda.device_count()`.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"  # "0" for a single GPU, "0,1" for two GPUs

## Load Model

In [3]:
'''
Possible Models:
- meta-llama/Llama-3.2-1B-Instruct
- meta-llama/Llama-3.2-3B-Instruct
- meta-llama/Llama-3.2-11B-Vision-Instruct
'''
model_id = "meta-llama/Llama-3.2-1B-Instruct"

# Load the weights in 4-bit NF4 with double quantization; computation runs in bfloat16.
# (Quantization lowers the memory footprint; it does not make the model sparse.)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer and model both authenticate with the token read from the environment.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HUGGING_FACE_TOKEN)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    token=HUGGING_FACE_TOKEN,
)

# Padding to a fixed length needs a pad token. When the tokenizer ships without one,
# register '[PAD]' and grow the embedding matrix to cover the enlarged vocabulary.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token='[PAD]'))
    model.resize_token_embeddings(len(tokenizer))

`low_cpu_mem_usage` was None, now default to True since model is quantized.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [5]:
# Display the device the loaded model currently lives on.
model.device

device(type='cuda', index=0)

# Fine-tuning with PEFT

In [6]:
# Prepare the 4-bit base model for training *before* attaching the adapters.
# `prepare_model_for_kbit_training` freezes the base weights, turns on gradient
# checkpointing and makes the embedding outputs require grad. Without the last step,
# checkpointed blocks of a frozen, quantized model receive inputs that carry no
# gradient and the LoRA weights cannot be updated.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Rank-16 LoRA adapters on the attention query/key/value projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=['q_proj', 'k_proj', 'v_proj'],
    lora_dropout=0.1,
    task_type="CAUSAL_LM",  # wrap the model with the causal-LM specific PEFT class
)

# Gradient checkpointing is already enabled by the preparation step above,
# so no separate `model.gradient_checkpointing_enable()` call is needed.
model = get_peft_model(model, lora_config)

# Prepare Data

In [7]:
# Folder containing the exported `.txt` documents used for fine-tuning.
# Set the ROS2_DOCS_DIR environment variable to point somewhere else without editing
# the notebook; the default is the original machine-specific location.
directory_path = os.environ.get(
    "ROS2_DOCS_DIR", "/home/dongkyu/ros2-rag-project/exported_docs/ros2"
)

In [8]:
def preprocess_text(text):
    """
    Cleans and preprocesses text for LLaMA dataset.

    URLs are removed and every run of whitespace (spaces, tabs, newlines) is
    collapsed into a single space, so the result is always a single line.

    Args:
        text (str): The raw text to preprocess.

    Returns:
        str: Cleaned and normalized text, without leading or trailing whitespace.
    """
    # Remove URLs first, so the whitespace pass below also closes the gap each URL
    # leaves behind (doing it afterwards left two consecutive spaces in the text).
    text = re.sub(r'http[s]?://\S+', '', text)
    # Collapse all whitespace, including newlines, into single spaces. A separate
    # newline-collapsing pass would be dead code: no newline survives this step.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def load_txt_files_to_llama_format(directory, output_file):
    """
    Converts `.txt` files to LLaMA-compatible JSONL format with preprocessing.

    Every `.txt` file directly inside `directory` is read (in sorted file-name
    order), cleaned with `preprocess_text` and written as one `{"text": ...}`
    JSON object per line. Files whose cleaned text is empty are skipped, so the
    dataset never contains empty training samples. `output_file` is overwritten.

    Args:
        directory (str): Directory containing `.txt` files.
        output_file (str): Path to save the output JSONL file.

    Returns:
        None
    """
    with open(output_file, 'w', encoding='utf-8') as output_f:
        for file_name in sorted(os.listdir(directory)):
            if not file_name.endswith('.txt'):  # Only process `.txt` files
                continue
            file_path = os.path.join(directory, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                cleaned_text = preprocess_text(f.read())
            if not cleaned_text:
                # An empty document would tokenize to pure padding; leave it out.
                continue
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            output_f.write(json.dumps({"text": cleaned_text}, ensure_ascii=False) + '\n')

    print(f"Processed `.txt` files saved to {output_file} in JSONL format.")

directory = directory_path  # Source folder that is scanned for `.txt` files
output_file = "finetune.jsonl"  # JSONL file to create (overwritten if it already exists)
# Build the fine-tuning corpus: one JSON line per input document.
load_txt_files_to_llama_format(directory, output_file)


Processed `.txt` files saved to finetune.jsonl in JSONL format.


# Load Data

In [9]:
# Load the JSONL corpus written above. Reusing `output_file` keeps the writer and
# the reader pointed at the same path instead of repeating the file name.
dataset = load_dataset("json", data_files=output_file)

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
def tokenize_function(examples):
    """
    Tokenizes a batch of documents and attaches causal-LM training labels.

    Args:
        examples (dict): Batch from `Dataset.map(batched=True)`; its 'text'
            entry is the list of documents to tokenize.

    Returns:
        The tokenizer output (each sequence truncated/padded to 512 tokens)
        with an added `labels` entry: a copy of `input_ids` in which padded
        positions are replaced by -100.
    """
    encoded = tokenizer(examples['text'], truncation=True, max_length=512, padding="max_length")
    # Without a `labels` column the causal LM returns no loss and `Trainer` cannot
    # train. The model shifts the labels internally, so they mirror `input_ids`.
    # -100 is the index the loss ignores, which keeps padding out of the objective.
    encoded["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

# Train

In [None]:
# Hyper-parameters for the fine-tuning run; checkpoints go to ./results, logs to ./logs.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

# NOTE(review): the evaluation set is the training split itself, so any evaluation
# reports training-set metrics only — confirm whether a held-out split is intended.
train_split = tokenized_datasets['train']
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=train_split,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Print how many of the model's parameters are trainable after the LoRA wrapping.
model.print_trainable_parameters()