In [1]:
%%capture
! pip install -q bitsandbytes==0.42.0
! pip install -q peft==0.8.2
! pip install -q trl==0.7.10
! pip install -q accelerate==0.27.1
! pip install -q datasets==2.17.0
! pip install -q transformers==4.38.0
! pip install -q wandb

In [2]:
%%capture
import json 
import ast 
import torch
import wandb
import transformers

import pandas as pd
import bitsandbytes as bnb

from tqdm.notebook import tqdm
from datasets import load_dataset
from datasets import Dataset
from trl import SFTTrainer

from transformers import (
    AutoTokenizer ,
    LlamaTokenizer , 
    LlamaForCausalLM , 
    BitsAndBytesConfig ,
    AutoModelForCausalLM ,
)


from peft import (
    PeftModel , 
    LoraConfig , 
    get_peft_model , 
    prepare_model_for_kbit_training   
)

2024-05-14 19:08:40.716487: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-14 19:08:40.716607: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-14 19:08:40.868833: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from huggingface_hub import login


login('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') 

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
def format_path(path) : 

    path = path.replace('\n' , '')
    path = path.replace('\t' , '')
    path = path.replace(' ' , '')

    return path

In [5]:
with open('/kaggle/input/dkwsjbzkxc/question_answer_pairs (7).jsonl') as fil : question_answer_pairs = [
    ast.literal_eval(row)
    for row 
    in fil
]

In [6]:
wandb.login(key = '4b954fc495fa6b1370052fbe1a5b2c418b159994')

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
from transformers import AutoTokenizer , AutoModelForCausalLM

In [8]:
tokenizer = AutoTokenizer.from_pretrained(
    format_path(
        '''
meta-llama/Meta-Llama-3-8B'''
    ) , 
    padding = True , 
    truncation = True , 
    token ='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'
)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    format_path(
        '''
        meta-llama/Meta-Llama-3-8B'''
    ) ,
    token = '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' , 
    device_map = {'' : 0} ,
    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True ,
        bnb_4bit_compute_dtype = torch.float16 ,
        bnb_4bit_quant_type = 'nf4' ,
        bnb_4bit_use_double_quant = False 
    )
)

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [10]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [11]:
cls = bnb.nn.Linear4bit
modules = set()

In [12]:
for name , module in model.named_modules() :

    if isinstance(module , cls) :

        names = name.split('.')
        modules.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in modules : modules.remove('lm_head')

In [13]:
lora_config = LoraConfig(
    r = 64 ,
    lora_alpha = 32 ,
    target_modules = modules ,
    lora_dropout = 0.05 ,
    bias = 'none' ,
    task_type = 'SEQ_2_SEQ'
)

In [14]:
model = get_peft_model(model, lora_config)

In [15]:
args = transformers.TrainingArguments(
    max_steps = 1000 ,
    logging_steps = 1 ,
    warmup_steps = 0.03 ,
    learning_rate = 2e-4 ,
    output_dir = 'outputs' ,
    save_strategy = 'epoch' ,
    optim = 'paged_adamw_8bit' ,
    per_device_train_batch_size = 1 ,
    gradient_accumulation_steps = 4 , 
#     remove_unused_columns = False
)

In [16]:
collator = transformers.DataCollatorForLanguageModeling(tokenizer , mlm = False)

In [17]:
data = pd.DataFrame({
    'text' : [
        pair['question'] + pair['answer']
        for pair
        in question_answer_pairs
    ]
})

In [18]:
data['text'] = data['text'].astype(str)

In [19]:
data = Dataset.from_pandas(data)

In [20]:
trainer = SFTTrainer(
    args = args ,
    model = model ,
    data_collator = collator ,
    peft_config = lora_config ,
    train_dataset = data ,
    dataset_text_field = 'text' , 
    max_seq_length = 500 , 
)

Map:   0%|          | 0/891 [00:00<?, ? examples/s]

In [21]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mayushsinghal659[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240514_191047-22ukk4yq[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdark-capybara-50[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/ayushsinghal659/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/ayushsinghal659/huggingface/runs/22ukk4yq[0m


Step,Training Loss
1,2.8171
2,3.0999
3,3.1765
4,3.3898
5,3.0009
6,2.6334
7,2.4293
8,2.4752
9,2.941
10,2.159




TrainOutput(global_step=1000, training_loss=1.0953277322649955, metrics={'train_runtime': 5349.2923, 'train_samples_per_second': 0.748, 'train_steps_per_second': 0.187, 'total_flos': 8381024858136576.0, 'train_loss': 1.0953277322649955, 'epoch': 4.49})

In [22]:
trainer.model.save_pretrained('llama2_instructed_s')