In [1]:
# Show the GPU, driver and current memory usage visible to this kernel before loading the model.
!nvidia-smi

Fri Sep 20 01:22:29 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:E1:00.0 Off |                  Off |
| 58%   50C    P5              53W / 450W |      1MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# !pip3 install vllm

In [3]:
# !pip3 install --upgrade pip

In [4]:
# !pip3 install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip3 install "xformers<0.0.26" trl peft accelerate bitsandbytes

In [5]:
from unsloth import FastLanguageModel
import torch
import pandas as pd
import numpy as np
import random
import datasets
from trl import SFTTrainer
from transformers import TrainingArguments, Trainer
from transformers import EvalPrediction
import re
import json
import gc 
import argparse
import os
from unsloth.chat_templates import get_chat_template
import wandb
pd.set_option('display.max_colwidth', None)  # None means unlimited width

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [6]:
# --- Run configuration: everything a reader might tune lives in this cell. ---
MAX_SEQ_LENGTH = 1024 # Choose any! We auto support RoPE Scaling internally!
DTYPE = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
LOAD_IN_4BIT = False # Use 4bit quantization to reduce memory usage. Can be False.
DATA_PATH = "/cluster/home/pgoyal/main/test/COT/cot_train.csv"  # CSV read by prepare_data()

# Earlier gemma-2b configuration, kept for reference only (note the different variable names):
# MODEL_NAME = "unsloth/gemma-2b-it"
# output_dir = "/cluster/project/sachan/piyushi/merged_models_COT"
# checkpoint_dir = "/cluster/project/sachan/piyushi/final_predictions_COT/gemma_2b"

MODEL_NAME = "unsloth/Qwen2.5-7B-Instruct"
OUTPUT_PATH = "/cluster/project/sachan/piyushi/final_predictions_COT/qwen_7"  # Trainer output_dir (checkpoints)
merged_dir = "/cluster/project/sachan/piyushi/merged_models_COT/qwen_7"  # where merged checkpoints are written
SEED = 42

# Seed every RNG the notebook has imported, not just Python's `random`, so that a
# Restart & Run All reproduces the same split and the same training run.
# `random.seed` goes last because it returns None and so produces no cell output.
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [7]:
# Load the base model and its tokenizer through Unsloth, using the settings from the config cell.
# This binds the global `tokenizer` that the chat-template cell later re-wraps, and the global
# `model` that get_peft_model() later replaces with the LoRA-wrapped model.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = DTYPE,
    load_in_4bit = LOAD_IN_4BIT
)

==((====))==  Unsloth 2024.8: Fast Qwen2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.65 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
"""
1. No common IDs between train and validation.
2. See all IDs first [(id0-sample0, id1-sample0, id2-sample0,..., id1-sample1, id2-sample1, ...)]
"""

'\n1. No common IDs between train and validation.\n2. See all IDs first [(id0-sample0, id1-sample0, id2-sample0,..., id1-sample1, id2-sample1, ...)]\n'

In [9]:
# Function to arrange dataframe such that the first n rows correspond to first n responses, and so on.
# The idea is that the model sees all IDs first, instead of all responses from an ID
def arrange_df(df, id_col='id', ques_col='question',response_col='response'):
    """Interleave rows round-robin by ID.

    The result lists the first response of every ID (IDs in ascending order),
    then the second response of every ID that has one, and so on.

    Args:
        df: DataFrame holding one row per (id, question, response).
        id_col: Name of the column that groups rows.
        ques_col: Name of the question column.
        response_col: Name of the response column.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and exactly the columns
        'id', 'question', 'response' (these output names are fixed, whatever
        the input column names are). It has the same number of rows as `df`;
        an empty input yields an empty frame.

    Raises:
        AssertionError: if the row count changed, which happens when rows
            carry a missing ID, because groupby drops those.

    Side effects:
        Prints the first 10 arranged rows.
    """
    # A stable sort keeps the original relative order of responses within an ID.
    df = df.sort_values(by=[id_col], kind='stable')

    # Materialise each ID's questions/responses once, instead of calling
    # get_group() for every (round, id) pair inside the nested loop.
    per_id = {
        id_: (group[ques_col].tolist(), group[response_col].tolist())
        for id_, group in df.groupby(id_col)
    }
    max_responses = max((len(responses) for _, responses in per_id.values()), default=0)

    # Round i emits the i-th response of every ID that has one. IDs that have
    # run out are skipped rather than padded with None, so a response that is
    # genuinely missing in the data is never mistaken for padding and dropped.
    arranged_data = []
    for i in range(max_responses):
        for id_, (questions, responses) in per_id.items():
            if i < len(responses):
                arranged_data.append({'id': id_, 'question': questions[i], 'response': responses[i]})

    # Passing `columns` keeps the schema stable even when there are no rows.
    arranged_df = pd.DataFrame(arranged_data, columns=['id', 'question', 'response'])
    print(arranged_df.head(10))

    assert arranged_df.shape[0] == df.shape[0], "arrange_df changed the number of rows"
    return arranged_df

In [10]:
def split_data(df, ratio=0.8, seed=None):
    """
    Splits the data into train and validation sets ensuring no common IDs among them.

    The split is done on unique IDs, not on rows: `ratio` of the IDs (rounded
    down) go to train and the rest to validation, so the row-level proportions
    printed at the end can differ from `ratio`.

    Args:
        df: DataFrame with an 'id' column.
        ratio: Fraction of unique IDs assigned to the training set.
        seed: Optional seed for a private RNG. When None (the default) the
            module-level `random` state is used, as before.

    Returns:
        (df_train, df_val): copies of the rows belonging to each ID set.
    """
    # Take IDs in order of first appearance rather than via set(): set iteration
    # order depends on hashing (and, for strings, on the per-process hash seed),
    # which made the shuffle non-reproducible even with a fixed random seed.
    ids = df['id'].unique().tolist()
    if seed is None:
        random.shuffle(ids)
    else:
        random.Random(seed).shuffle(ids)

    ntrain = int(ratio*len(ids))
    train_ids = ids[:ntrain]
    val_ids = ids[ntrain:]

    df_train = df[df['id'].isin(train_ids)].copy()
    df_val = df[df['id'].isin(val_ids)].copy()

    # Guard the proportion report against an empty input frame.
    total = len(df)
    train_frac = df_train.shape[0]/total if total else 0.0
    val_frac = df_val.shape[0]/total if total else 0.0

    print("Train shape: ", df_train.shape)
    print("Val shape: ", df_val.shape)
    print("Data distribution: Train: {:.2f}, Val: {:.2f}".format(train_frac, val_frac))

    return df_train, df_val

In [11]:
def prepare_data(path=DATA_PATH, arrange_train=False, remove_train_duplicates=True, remove_val_duplicates=True, split_ratio=0.8):
    """Read the CSV at `path` and build a train/val DatasetDict split by ID.

    Args:
        path: CSV file to read. It needs 'id' and 'question' columns plus either
            'response' or 'answer' ('answer' is renamed to 'response').
        arrange_train: If True, reorder the training rows with arrange_df() so
            that one response per ID comes before any ID's second response.
        remove_train_duplicates: If True, keep only the first row per ID in train.
        remove_val_duplicates: If True, keep only the first row per ID in val.
        split_ratio: Fraction of unique IDs assigned to train (see split_data()).

    Returns:
        datasets.DatasetDict with "train" and "val" splits, each holding exactly
        the columns 'id', 'question', 'response'.

    Side effects:
        Prints shapes and previews of the intermediate frames.
    """
    df = pd.read_csv(path)
    df = df.rename(columns={'answer':'response'})

    # Get Train and Val DFs
    df_train, df_val = split_data(df, split_ratio)
    df_train = df_train.sort_values(by=['id'])
    df_val = df_val.sort_values(by=['id'])

    if remove_train_duplicates:
        print("Removing Train Duplicates")
        df_train = df_train.drop_duplicates(subset=['id'])
        df_train.reset_index(drop=True, inplace=True)
        print(df_train.shape)
        print(df_train.head())

    # Arrange DF Train
    if arrange_train:
        df_train = arrange_df(df_train)

    # Remove Duplicates for Validation DF
    if remove_val_duplicates:
        print("Removing Val Duplicates")
        df_val = df_val.drop_duplicates(subset=['id'])
        df_val.reset_index(drop=True, inplace=True)
        print(df_val.shape)
        print(df_val.head())

    # Convert to Dataset.
    # preserve_index=False: when a dedup flag is off, the frame still carries the
    # shuffled, non-range index left by sort_values, and from_pandas would
    # otherwise store it as an extra "__index_level_0__" column.
    columns = ['id','question','response']
    dataset_train = datasets.Dataset.from_pandas(df_train[columns].copy(), preserve_index=False)
    dataset_val = datasets.Dataset.from_pandas(df_val[columns].copy(), preserve_index=False)

    # Dataset Dict
    ds = datasets.DatasetDict({"train":dataset_train, "val":dataset_val})

    print(ds)
    return ds

In [12]:
def split_into_steps(text):
    """Split a solution string into its reasoning steps and final answer.

    The expected layout is one step per line, followed by a line of the form
    "#### <final answer>".

    Args:
        text: The full response string.

    Returns:
        (steps, final_ans):
            steps: the non-blank lines that do not start with '####', kept in
                order and unstripped.
            final_ans: the stripped text between the first and second '####'
                marker (or to the end of the string if there is only one).
                It is an empty string when the marker is absent, so callers
                can detect and filter malformed rows instead of hitting an
                IndexError.
    """
    # Keep every non-blank line except the answer line itself.
    steps = [
        line for line in text.split('\n')
        if line.strip() and not line.strip().startswith('####')
    ]

    # Element 1 of the split is whatever follows the first marker.
    parts = text.split("####")
    final_ans = parts[1].strip() if len(parts) > 1 else ""
    return steps, final_ans

In [13]:
# Define your ChatML template
# Re-wrap the global tokenizer so apply_chat_template() renders conversations as
# <|im_start|>role ... <|im_end|> turns.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    # Conversation dicts use the keys "from"/"value"; "human" marks user turns and "gemma" is
    # the label used for assistant turns. formatting_prompts_func must use these same labels.
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gemma"},  # ShareGPT style
    map_eos_token=True,  # Map <|im_end|> to the tokenizer's EOS token (the cell log reports EOS = <|im_end|>)
)

def formatting_prompts_func(examples):
    """Render a batch of examples as two-round ChatML training conversations.

    Round one asks for only the first solution step; round two asks for the
    remaining steps followed by the final answer.

    Args:
        examples: A batch mapping with parallel "question" and "response" lists
            (the form passed by Dataset.map with batched=True).

    Returns:
        {"text": [...]} with one chat-template-formatted string per example.
    """
    def _render(question, response):
        # Build and format the four-turn conversation for a single example.
        steps, final_ans = split_into_steps(response)
        first_step = steps[0]
        later_steps = ' '.join(steps[1:])

        # ChatML structure
        convo = [
            {"from": "human", "value": f"### Instruction:\nCalculate only the first step for the following Math Word Problem\n\n### Input:\n{question}"},
            {"from": "gemma", "value": f"### First Step:\n{first_step}"},
            {"from": "human", "value": "### Instruction:\nContinue generating the entire answer from the next step\n\n"},
            {"from": "gemma", "value": f"### Next steps:\n{later_steps}\n## Final Answer: {final_ans}"},
        ]
        # Single-turn alternative, kept for reference:
        # convo = [
        #     {"from": "human", "value": f"### Instruction:\nSolve the following Math Word Problem\n\n### Input:\n{input}"},
        #     {"from": "gemma", "value": f"### Answer:\n{output}"}
        # ]
        return tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)

    questions = examples["question"]
    responses = examples["response"]
    return {"text": [_render(question, response) for question, response in zip(questions, responses)]}

Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


In [14]:
# Build the train/val DatasetDict from DATA_PATH; 80% of the unique IDs go to train.
# The split uses the global `random` state seeded in the config cell, so re-running only
# this cell (without re-seeding) produces a different split.
ds = prepare_data(split_ratio=0.8)

Train shape:  (5978, 3)
Val shape:  (1495, 3)
Data distribution: Train: 0.80, Val: 0.20
Removing Train Duplicates
(5978, 3)
   id  \
0   1   
1   2   
2   3   
3   6   
4   7   

                                                                                                                                                                                                                                                                                question  \
0                                                                                                                            Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?   
1                                                                                                                                                                      Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitt

In [15]:
# Add a "text" column holding the chat-formatted conversation for every example in each split.
dataset_train = ds['train'].map(formatting_prompts_func, batched = True)
dataset_val = ds['val'].map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/5978 [00:00<?, ? examples/s]

Map:   0%|          | 0/1495 [00:00<?, ? examples/s]

In [16]:
# Sanity check: inspect the first formatted training example, including the rendered "text".
dataset_train[0]

{'id': 1,
 'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'response': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
 'text': '<|im_start|>user\n### Instruction:\nCalculate only the first step for the following Math Word Problem\n\n### Input:\nNatalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>\n<|im_start|>assistant\n### First Step:\nNatalia sold 48/2 = <<48/2=24>>24 clips in May.<|im_end|>\n<|im_start|>user\n### Instruction:\nContinue generating the entire answer from the next step\n\n<|im_end|>\n<|im_start|>assistant\n### Next steps:\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n## Final Answer: 72<|im_end|>\n'}

In [17]:
# Inspect the first raw validation example (ds['val'] is the unformatted split, so it has no "text").
ds['val'][0]

{'id': 4,
 'question': 'Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?',
 'response': 'Maila read 12 x 2 = <<12*2=24>>24 pages today.\nSo she was able to read a total of 12 + 24 = <<12+24=36>>36 pages since yesterday.\nThere are 120 - 36 = <<120-36=84>>84 pages left to be read.\nSince she wants to read half of the remaining pages tomorrow, then she should read 84/2 = <<84/2=42>>42 pages.\n#### 42'}

### LoRA

### Training

In [18]:
# Wrap the base model with LoRA adapters (rank 16, alpha 32) on the attention and MLP
# projection layers. This rebinds the global `model`; the training log later reports
# 40,370,176 trainable parameters for this configuration.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = SEED, # same seed as the rest of the notebook
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 28 layers with 0 QKV layers, 28 O layers and 28 MLP layers.


In [19]:
# !pip3 install wandb

In [20]:
## wandb variables
wandb.login(relogin=False, key='02f6b5d0ce8ce8ee2b69844245a2b3aae6af9582')
%env WANDB_PROJECT=gemma-sft-lora-socratic

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpgoyal[0m ([33meth-piyushi[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /cluster/home/pgoyal/.netrc


env: WANDB_PROJECT=gemma-sft-lora-socratic


In [21]:
# Training hyper-parameters for the LoRA fine-tune. Evaluation is disabled (all eval-related
# options are commented out), so this run only trains and checkpoints.
train_args = TrainingArguments(
    # eval_strategy = "epoch",
    save_strategy = "epoch",   # write a checkpoint at the end of every epoch
    save_total_limit = 1,      # cap on how many checkpoints are kept under output_dir
    # metric_for_best_model = "accuracy",
    logging_steps = 100,
    # save_steps = 50, #unused because of epochs
    per_device_train_batch_size = 4, #1 for gemma2_9b
    # per_device_eval_batch_size = 2,
    gradient_accumulation_steps = 2, #8 for gemma2_9b
    # 4 per device x 2 accumulation steps = total batch size 8 (as reported in the training log)
    warmup_steps = 100,
    num_train_epochs = 4,
    learning_rate = 2e-4,
    # Exactly one of fp16/bf16 is enabled, chosen by whether the GPU supports bfloat16.
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = SEED,
    output_dir = OUTPUT_PATH,  # the merge cell later scans this directory for checkpoint-* folders
    # report_to = "wandb",
    load_best_model_at_end=False
    )

In [22]:
# Supervised fine-tuning trainer over the pre-formatted "text" column.
# No eval_dataset is passed, so dataset_val is not used during training.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    # eval_dataset = dataset_val,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    # compute_metrics = compute_accuracy,
    # dataset_num_proc = 2,
    # NOTE(review): packing is presumably why the training log reports 1,417 examples
    # rather than the 5,978 training rows - confirm.
    packing = True, 
    args = train_args
    )

In [23]:
# Start the W&B run explicitly before training. The project name repeats the WANDB_PROJECT
# value set in the login cell; keep the two in sync.
wandb.init(settings=wandb.Settings(start_method='fork'), project='gemma-sft-lora-socratic')

In [24]:
#@title Show current memory stats
# Report the device name, its total memory and the peak memory reserved so far.
# Values are bytes divided by 1024**3 and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
print("GPU = {}. Max memory = {} GB.".format(gpu_stats.name, max_memory))
print("{} GB of memory reserved.".format(start_gpu_memory))

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.65 GB.
14.805 GB of memory reserved.


In [25]:
# Drop unreachable Python objects first, then ask PyTorch to release its cached CUDA
# blocks, so training starts with as much free GPU memory as possible.
gc.collect()
torch.cuda.empty_cache()
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

In [26]:
# Run the fine-tuning loop; the returned object is kept in trainer_stats for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,417 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 708
 "-____-"     Number of trainable parameters = 40,370,176


Step,Training Loss
100,0.417
200,0.1843
300,0.1541
400,0.134
500,0.1107
600,0.0914
700,0.0811


In [None]:
# Close the W&B run opened by wandb.init() above.
wandb.finish()

In [7]:
# Merge the LoRA adapter of every saved checkpoint into the base model and write the
# merged model under merged_dir/<checkpoint name>.
# NOTE(review): this cell's execution count (7) suggests it was run in a fresh kernel after
# the import/config cells. It rebinds and then deletes the globals `model` and `tokenizer`.
for checkpoint in sorted(os.listdir(OUTPUT_PATH)):
    # Only entries named "checkpoint*" are candidates.
    if not checkpoint.startswith("checkpoint"):
        continue

    checkpoint_path = os.path.join(OUTPUT_PATH, checkpoint)
    # Ignore stray files that merely share the prefix.
    if not os.path.isdir(checkpoint_path):
        continue

    print(f"Processing checkpoint: {checkpoint_path}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = checkpoint_path,
        max_seq_length = MAX_SEQ_LENGTH,
        dtype = DTYPE,
        load_in_4bit = LOAD_IN_4BIT
    )

    # Fold the LoRA weights into the full model and save it together with the tokenizer.
    merged_output_dir = os.path.join(merged_dir, checkpoint)
    model.save_pretrained_merged(merged_output_dir, tokenizer)
    print(f"Merged model saved at: {merged_output_dir}")

    # Release this checkpoint before loading the next one.
    del model
    del tokenizer
    gc.collect()
    torch.cuda.empty_cache()

print("All checkpoints processed and merged models saved.")

Processing checkpoint: /cluster/project/sachan/piyushi/final_predictions_COT/qwen_7/checkpoint-708
==((====))==  Unsloth 2024.8: Fast Qwen2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.65 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth 2024.8 patched 28 layers with 0 QKV layers, 28 O layers and 28 MLP layers.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 326.62 out of 501.32 RAM for saving.


 14%|█▍        | 4/28 [00:00<00:00, 38.27it/s]We will save to Disk and not RAM now.
100%|██████████| 28/28 [00:18<00:00,  1.51it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
Merged model saved at: /cluster/project/sachan/piyushi/merged_models_COT/qwen_7/checkpoint-708
All checkpoints processed and merged models saved.
