In [None]:
# import dependencies
import os
import torch
import pandas as pd
import datasets
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer
from trl import DataCollatorForCompletionOnlyLM

# Pretraining Data

Load the data and drop the rows that contain missing values.

In [None]:
path_dir = '/teamspace/studios/this_studio/Fine_tuning'


def read_dataFrame(path: str, sample: bool = True, k: int = 2000) -> pd.DataFrame:
    """Load `train.csv` from `path` and return its question/answer pairs.

    Rows containing a missing value in any column of the CSV are dropped, then
    only the `question` and `answer` columns are kept, with a fresh 0..n-1 index.

    Args:
        path: Directory that contains `train.csv`.
        sample: When True, return a random subset instead of every row.
        k: Number of rows to sample. Capped at the number of available rows,
            so asking for more rows than exist returns all of them (shuffled).

    Returns:
        A DataFrame with exactly the columns `question` and `answer`.

    Raises:
        FileNotFoundError: If `train.csv` does not exist in `path`.
        KeyError: If the CSV lacks a `question` or `answer` column.
    """
    # Build the location from the `path` argument, not from the notebook global,
    # so the function reads from the directory the caller actually asked for.
    dataset_path = os.path.join(path, 'train.csv')
    df = pd.read_csv(dataset_path, encoding='utf-8')
    df = df.dropna(axis=0)

    # Keep only the two columns used downstream; reset the index so row labels
    # are contiguous after the dropna above.
    df = df[['question', 'answer']].reset_index(drop=True)

    if sample:
        # Fixed random_state keeps the subset reproducible between runs.
        n_rows = min(k, len(df))
        df = df.sample(n=n_rows, random_state=42).reset_index(drop=True)
    return df

# Load the whole cleaned dataset; sample=False skips the random subsampling branch.
df = read_dataFrame(path_dir, sample = False)
# Last expression of the cell: display the resulting DataFrame.
df

In [None]:
# Convert the DataFrame into a column-oriented dict of plain lists,
# i.e. {'question': [...], 'answer': [...]}, so that values can be looked up
# by position regardless of what the DataFrame index labels are.
examples = df.to_dict(orient='list')

In [None]:
def formatting_prompts_func(example):
    """Render every question/answer pair as one prompt string.

    `example` must provide `'question'` and `'answer'` entries that can be
    indexed with the integers 0..len(example['question']) - 1. Each pair is
    formatted as "### Question: <q>" followed by a newline and
    " ### Answer: <a>".

    Returns:
        A list of formatted strings, in index order.
    """
    questions = example['question']
    answers = example['answer']
    # Look values up by integer position/label instead of iterating the
    # containers directly, so list-like and dict-like columns behave the same.
    return [
        f"### Question: {questions[idx]}\n ### Answer: {answers[idx]}"
        for idx in range(len(questions))
    ]

# Build the list of formatted prompt strings, one per question/answer pair.
finetuning_dataset_question_answer = formatting_prompts_func(examples)

Tokenize the dataset

In [None]:
# load the tokenizer
max_sequence_length = 1024 # gpt2

# GPT-2 is supported natively by transformers, so there is no need to opt in
# to executing code from the model repository (trust_remote_code).
tokenizer = AutoTokenizer.from_pretrained('openai-community/gpt2')

# Reuse the EOS token as the padding token so that sequences shorter than
# max_sequence_length can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Pad and truncate on the right-hand side of each sequence.
tokenizer.padding_side = 'right'
tokenizer.truncation_side = 'right'

# Upper bound on the sequence length the tokenizer will produce when truncating.
tokenizer.model_max_length = max_sequence_length

GPT-2 accepts at most 1024 tokens per input sequence, so we keep only the examples that are shorter than this limit.

In [None]:
# Keep only the examples that fit in the model's context window.
# The limit is expressed in tokens, so measure each example with the tokenizer;
# the character count of the string is not a reliable proxy for its token count.
filtred_dataset = []
for text in finetuning_dataset_question_answer:
    # truncation=False so the real (untruncated) length is measured.
    token_count = len(tokenizer(text, truncation=False)['input_ids'])
    if token_count < max_sequence_length:
        filtred_dataset.append(text)

# Number of examples that survived the filter.
len(filtred_dataset)

In [None]:
# Wrap the filtered prompts in a DataFrame with a single 'text' column.
filered_dataset_df = pd.DataFrame({'text': filtred_dataset})
filered_dataset_df

In [None]:
# Convert the pandas DataFrame into a `datasets.Dataset`.
finetuning_dataset_loaded = datasets.Dataset.from_pandas(filered_dataset_df)
# Last expression of the cell: display the dataset summary.
finetuning_dataset_loaded

In [None]:
# 80/20 train/test split; the fixed seed makes the shuffle reproducible.
dataset = finetuning_dataset_loaded.train_test_split(
    test_size=0.20,
    shuffle=True,
    seed=42,
)
train_dataset, test_dataset = dataset['train'], dataset['test']
train_dataset, test_dataset

In [None]:
# Persist the train/test splits to disk under <path_dir>/dataset.
dataset.save_to_disk(path_dir + '/dataset')

In [None]:
# load the dataset
dataset = datasets.load_from_disk(path_dir + '/dataset')

# Rebind the splits from the reloaded copy so that training uses the persisted
# data and later cells do not silently depend on the in-memory split objects.
train_dataset = dataset['train']
test_dataset = dataset['test']

dataset

## Define the HyperParameters and the quantization

In [None]:
# Maximum sequence length for gpt2 (passed to SFTConfig as max_seq_length below).
max_sequence_length = 1024 # max sequence length for gpt2

# Directory where the trainer writes its checkpoints.
output_dir = path_dir + '/fine_tuned_model'

## Hyperparameters for QLoRA (consumed by LoraConfig below)
lora_rank = 16
lora_alpha = 32 ## == lora_rank * 2
# Names of the modules that receive LoRA adapters.
target_modules = ['c_attn', 'c_fc']
lora_dropout = 0.20 # 20%

## Hyperparameters for training (consumed by SFTConfig below)

epochs = 5
# Used for both the per-device train and eval batch sizes.
batch_size = 16
gradient_accumulation_steps = 1

learning_rate = 3e-4 # we need to experiment with the LR
lr_scheduler_type = 'cosine'
# Fraction of the schedule used for learning-rate warmup.
warmup_ratio = 0.03
optimizer = 'paged_adamw_32bit'
weight_decay = 0.001

In [None]:
# The Quantization
# Toggle between 4-bit (True) and 8-bit (False) loading of the base model.
quant_4_bit = True

# bfloat16 compute needs an Ampere-or-newer NVIDIA GPU (compute capability >= 8.0);
# fall back to float16 on older hardware instead of hard-coding bfloat16.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

if quant_4_bit:
    # 4-bit NF4 quantization with double quantization enabled.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_quant_type="nf4",
    )
else:
    # The bnb_4bit_* options only concern 4-bit loading, so none are passed here.
    quant_config = BitsAndBytesConfig(load_in_8bit=True)

## Load the Base model

In [None]:
## Load the base model with the quantization config defined above.
# gpt2_xl_path = path_dir + '/gpt2-xl'
model = AutoModelForCausalLM.from_pretrained(
        'openai-community/gpt2',
        quantization_config = quant_config,
        device_map = 'auto',
        # local_files_only = True,
)

# Use the EOS token id as the pad token id for generation, mirroring the
# tokenizer setup (pad_token = eos_token).
model.generation_config.pad_token_id = tokenizer.eos_token_id
# Report the loaded model's memory footprint in MB (bytes / 1e6).
print(f"{(model.get_memory_footprint() / 1e6):.2f} MB")

# Footprints recorded by the author for GPT2 (124M parameters):

# 134.06 MB = using quant_4_bit
# 176.53 MB = using quant_8_bit
# 510.34 MB = without quantization


In [None]:
# Display the model's repr to inspect its module structure
# (e.g. to check the module names listed in `target_modules`).
model

In [None]:
# Marker that separates the prompt from the completion. It matches the
# " ### Answer:" text that formatting_prompts_func emits after the newline.
# NOTE(review): presumably the collator restricts the loss to the tokens that
# follow this marker and locates it by token ids; verify that the template
# tokenizes the same way in isolation as inside a full example.
response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [None]:
# Peek at the first formatted example to check the prompt layout.
finetuning_dataset_loaded[0]

In [None]:
# LoRA adapter configuration, built from the QLoRA hyperparameters defined earlier
# (lora_alpha, lora_dropout, lora_rank, target_modules).
lora_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)

# Choose the mixed-precision mode from the hardware rather than hard-coding it:
# bf16 requires an Ampere-or-newer NVIDIA GPU (compute capability >= 8.0, e.g. A100);
# older GPUs fall back to fp16.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

# Training configuration, built from the hyperparameters defined earlier.
train_config = SFTConfig(
    output_dir = output_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy='no',  # no evaluation is run during training
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim = optimizer,
    save_steps=50,       # checkpoint every 50 steps (save_strategy='steps')
    logging_steps=50,
    save_total_limit = 10,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16 = not use_bf16,  # exactly one of fp16 / bf16 is enabled
    bf16 = use_bf16,
    max_grad_norm=0.3,
    max_steps=-1,        # -1: the run length is driven by num_train_epochs
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    max_seq_length=max_sequence_length,
    lr_scheduler_type=lr_scheduler_type,
    dataset_text_field="text",  # column that holds the formatted prompts
    save_strategy='steps',
    report_to='tensorboard',
)

# Assemble the supervised fine-tuning trainer from the model, LoRA config,
# training config, tokenizer and completion-only collator defined above.
# Only the train split is passed; no eval_dataset is supplied here.
fine_tuning = SFTTrainer(
    model = model,
    train_dataset = train_dataset,
    peft_config = lora_config,
    processing_class = tokenizer,
    args = train_config,
    data_collator = collator,
)


In [None]:
# Run the fine-tuning loop with the trainer configured above.
fine_tuning.train()

In [None]:
# Save the fine-tuned model to disk under <path_dir>/gpt2_fine_tuned_model.
save_fine_tuned_model = path_dir + '/gpt2_fine_tuned_model'

# NOTE(review): with a peft_config on the trainer this presumably writes only the
# LoRA adapter weights rather than a merged model, and the tokenizer is not saved
# here. Confirm before relying on this directory for inference.
fine_tuning.model.save_pretrained(save_fine_tuned_model)