In [1]:
# import dependecies
import os
import torch
import pandas as pd
import datasets
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer
from trl import DataCollatorForCompletionOnlyLM

# Pretraining Data

load the data and clean the empty rows

In [2]:
path_dir = '/teamspace/studios/this_studio/Fine_tuning'

def read_dataFrame(path: str, sample = True, k = 2000):
    dataset_path = path_dir + '/train.csv'
    df = pd.read_csv(dataset_path, encoding= 'utf-8')
    df = df.dropna(axis=0)
    data = {
        'question': list(df['question']),
        'answer': list(df['answer']),
    }

    df = pd.DataFrame(data= data, columns=['question', 'answer'])
    if sample:
        df = df.sample(n = k, random_state=42).reset_index(drop = True)
    return df

df = read_dataFrame(path_dir, sample = False)
df

Unnamed: 0,question,answer
0,How can I output bold text in Bash? I have a B...,"Yes, you can format the output text in Bash to..."
1,How can I install Python 3 on an AWS EC2 insta...,"To install Python 3 on an AWS EC2 instance, yo..."
2,How can I format the elapsed time from seconds...,You can achieve the desired time format using ...
3,I am trying to create a matrix of random numbe...,Your current implementation is actually quite ...
4,I am learning Python and have noticed extensiv...,The use of 'self' in Python is quite different...
...,...,...
136103,Can you provide a Ruby program that reads an a...,Sure! Here's a Ruby program that accomplishes ...
136104,What is the code to develop a program that tak...,You can use the following code:\n\n```python\n...
136105,How can I update my code to verify if the vari...,You can use the `isinstance()` function in Pyt...
136106,Can you provide an application code that autom...,Sure! Here's a code snippet that demonstrates ...


In [3]:
# convert the dataFrame the dict()
examples = df.to_dict()

In [4]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['question'])):
        text = f"### Question: {example['question'][i]}\n ### Answer: {example['answer'][i]}"
        output_texts.append(text)
    return output_texts

finetuning_dataset_question_answer = formatting_prompts_func(examples)

Tokenize the dataset

In [5]:
# load the tokenizer
max_sequence_length = 1024 # gpt2

tokenizer = AutoTokenizer.from_pretrained(
    'openai-community/gpt2-large',
    trust_remote_code = True
)
## pad the sequence if it is < max_sequence_length
tokenizer.pad_token = tokenizer.eos_token

tokenizer.padding_side = 'right'

tokenizer.model_max_length = max_sequence_length

tokenizer.truncation_side = 'right'

gpt2 has only 1024 max_sequence_length that can feed to the transformer, so let's select fewer examples

In [6]:
filtred_dataset = []
for text in finetuning_dataset_question_answer:
    if len(text) < max_sequence_length:
        filtred_dataset.append(text)

len(filtred_dataset)

24899

In [7]:
filered_dataset_df = pd.DataFrame(filtred_dataset, columns=['text'])
filered_dataset_df

Unnamed: 0,text
0,### Question: How can I make my Python program...
1,### Question: How can I parse a comma-separate...
2,"### Question: In Python, is there an easy meth..."
3,"### Question: In Python, I know there is a lot..."
4,### Question: How can I move the y-axis ticks ...
...,...
24894,### Question: How can I write a SQL query to c...
24895,### Question: I want to modify my code so that...
24896,### Question: How can I sort an array in ascen...
24897,### Question: Can you provide a Ruby program t...


In [8]:
finetuning_dataset_loaded = datasets.Dataset.from_pandas(filered_dataset_df)
finetuning_dataset_loaded

Dataset({
    features: ['text'],
    num_rows: 24899
})

In [9]:
dataset = finetuning_dataset_loaded.train_test_split(test_size=0.20,shuffle=True, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']
train_dataset, test_dataset

(Dataset({
     features: ['text'],
     num_rows: 19919
 }),
 Dataset({
     features: ['text'],
     num_rows: 4980
 }))

In [10]:
# save the dataset
dataset.save_to_disk(path_dir + '/dataset')

Saving the dataset (0/1 shards):   0%|          | 0/19919 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4980 [00:00<?, ? examples/s]

In [11]:
# load the dataset
dataset = datasets.load_from_disk(path_dir + '/dataset')
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 19919
    })
    test: Dataset({
        features: ['text'],
        num_rows: 4980
    })
})

## Define the HyperParameters and the quantization

In [12]:
max_sequence_length = 1024 # max sequence length for gpt2

output_dir = path_dir + '/fine_tuned_model'

## define the hyperparameters for QLoRA
lora_rank = 16
lora_alpha = 32 ## == lora_rank * 2
target_modules = ['c_attn', 'c_fc']
lora_dropout = 0.20 # 20%
## define the hyperparameters for training

epochs = 5
batch_size = 8
gradient_accumulation_steps = 1

learning_rate = 3e-4 # we need to experiment with the LR
lr_scheduler_type = 'cosine'
warmup_ratio = 0.03
optimizer = 'paged_adamw_32bit'
weight_decay = 0.001

In [13]:
# The Quantization
quant_4_bit = True
if quant_4_bit:
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
)
else:
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_4bit_compute_dtype=torch.float16,
)

## Load the Base model

In [14]:
## set the base model
# gpt2_xl_path = path_dir + '/gpt2-xl'
model = AutoModelForCausalLM.from_pretrained(
        'openai-community/gpt2-large',
        quantization_config = quant_config,
        device_map = 'auto',
        # local_files_only = True,
)

model.generation_config.pad_token_id = tokenizer.eos_token_id
print(f"{(model.get_memory_footprint() / 1e6):.2f} MB")

# using GPT2 124M parameters

# 134.06 MB = using quant_4_bit
# 176.53 MB = using quant_8_bit
# 510.34 MB = without quantization


524.13 MB


In [15]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Linear4bit(in_features=1280, out_features=3840, bias=True)
          (c_proj): Linear4bit(in_features=1280, out_features=1280, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Linear4bit(in_features=1280, out_features=5120, bias=True)
          (c_proj): Linear4bit(in_features=5120, out_features=1280, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, element

In [16]:
response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [17]:
finetuning_dataset_loaded[0]

{'text': "### Question: How can I make my Python program pause or sleep for a duration of 50 milliseconds?\n ### Answer: In Python, we can use the `time.sleep()` function from the built-in `time` module to pause the execution of the program for a specified duration. The `time.sleep()` function accepts an argument in seconds. So, if you want to pause the program for 50 milliseconds, you would need to pass 0.05 (which is 50 milliseconds converted to seconds) as the argument to the `time.sleep()` function.\n\nHere's how you can do it:\n\n```python\nimport time\n\n# Your code here\n\n# Pause for 50 milliseconds\ntime.sleep(0.05)\n\n# Your code here\n```\n\nIn this code, the program will pause at the `time.sleep(0.05)` line for 50 milliseconds before moving on to the next line of code."}

In [18]:
lora_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)

train_config = SFTConfig(
    output_dir = output_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy='no',
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim = optimizer,
    save_steps=50,
    logging_steps=50,
    save_total_limit = 10,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16 = True, 
    bf16 = False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    max_seq_length=max_sequence_length,
    lr_scheduler_type=lr_scheduler_type,
    dataset_text_field="text",
    save_strategy='steps',
    report_to='tensorboard',
    )

fine_tuning = SFTTrainer(
    model = model,
    train_dataset = train_dataset,
    peft_config = lora_config,
    processing_class = tokenizer,
    args = train_config,
    data_collator = collator,
)


Map:   0%|          | 0/19919 [00:00<?, ? examples/s]

In [19]:
fine_tuning.train()

Step,Training Loss
50,1.7728
100,1.3753
150,1.19
200,1.1322
250,1.0822
300,1.0771
350,1.0338
400,1.059
450,1.0288
500,1.0165


TrainOutput(global_step=12450, training_loss=0.7196085133993003, metrics={'train_runtime': 7372.9822, 'train_samples_per_second': 13.508, 'train_steps_per_second': 1.689, 'total_flos': 9.292969795189248e+16, 'train_loss': 0.7196085133993003, 'epoch': 5.0})

In [23]:
%load_ext tensorboard
%tensorboard --logdir path_dir + '/fine_tuned_model/runs'

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


ERROR: Failed to launch TensorBoard (exited with 2).
Contents of stderr:
TensorFlow installation not found - running with reduced feature set.
usage: tensorboard [-h] [--helpfull] [--logdir PATH] [--logdir_spec PATH_SPEC]
                   [--host ADDR] [--bind_all] [--port PORT]
                   [--reuse_port BOOL] [--load_fast {false,auto,true}]
                   [--extra_data_server_flags EXTRA_DATA_SERVER_FLAGS]
                   [--grpc_creds_type {local,ssl,ssl_dev}]
                   [--grpc_data_provider PORT] [--purge_orphaned_data BOOL]
                   [--db URI] [--db_import] [--inspect] [--version_tb]
                   [--tag TAG] [--event_file PATH] [--path_prefix PATH]
                   [--window_title TEXT] [--max_reload_threads COUNT]
                   [--reload_interval SECONDS] [--reload_task TYPE]
                   [--reload_multifile BOOL]
                   [--reload_multifile_inactive_secs SECONDS]
                   [--generic_data TYPE]
            

In [20]:
# save the fine tuned modle to drive
save_fine_tuned_model = path_dir + '/gpt2_large_fine_tuned'

fine_tuning.model.save_pretrained(save_fine_tuned_model)