In [1]:
# NOTE(review): hard-coded PID left over from an earlier session; the recorded output
# below shows that no such process exists any more.
!kill -KILL 67472

/bin/bash: line 0: kill: (67472) - No such process


In [2]:
# Print the GPU model, driver version and current memory usage.
!nvidia-smi

Mon Jun  3 19:04:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:21:00.0 Off |                  Off |
| 30%   33C    P5              48W / 450W |      1MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [3]:
from unsloth import FastLanguageModel
import os
import torch
import pandas as pd 
import random 
import transformers
import datasets
from trl import SFTTrainer
from transformers import TrainingArguments

# Run configuration shared by the cells below.
MAX_SEQ_LENGTH = 1024 # Choose any! We auto support RoPE Scaling internally!
DTYPE = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
LOAD_IN_4BIT = False # Use 4bit quantization to reduce memory usage. Can be False.
MODEL_NAME = "unsloth/gemma-2b-it"
# prepare_data() uses this value as the default for its `path` argument; the default is
# bound when that function is defined, and DATA_PATH is reassigned later in the notebook.
DATA_PATH = "/cluster/work/lawecon/Work/mbraasch/data/df_cot_kd.csv"
SEED = 42 
# Seeds Python's `random` module only; SEED is also passed explicitly to the LoRA setup
# and to TrainingArguments further down.
random.seed(SEED) 

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


No ROCm runtime is found, using ROCM_HOME='/opt/rocm'
2024-06-03 19:04:53.722380: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
# Load the base model and its tokenizer through Unsloth, using the configuration
# constants defined in the setup cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME,
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = DTYPE,
    load_in_4bit = LOAD_IN_4BIT
)

==((====))==  Unsloth: Fast Gemma patching release 2024.5
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.65 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


### Data Preparation

In [5]:
# Function to arrange dataframe such that the first n rows correspond to first n responses, and so on. 
# The idea is that the model sees all IDs first, instead of all responses from an ID 
def arrange_df(df, id_col='id', ques_col='question',response_col='response'):
    """Interleave rows so that every ID appears once before any ID repeats.

    The result lists the first row of every ID (IDs in ascending order), then
    the second row of every ID that has one, and so on. Rows belonging to the
    same ID keep their relative order from ``df``.

    Args:
        df: DataFrame containing the columns named by ``id_col``, ``ques_col``
            and ``response_col``. It is not modified.
        id_col: Name of the column identifying a question.
        ques_col: Name of the column holding the question text.
        response_col: Name of the column holding the response text.

    Returns:
        A new DataFrame with exactly the columns ``id``, ``question`` and
        ``response`` (regardless of the input column names) and a fresh
        0..n-1 index. An empty input yields an empty result.

    Raises:
        AssertionError: If the output has fewer rows than ``df``, which
            happens when some response value is missing (NaN/None).
    """
    # Stable sort so that the rows of one ID keep their input order.
    by_id = df.sort_values(by=[id_col], kind='stable')

    # 0 for an ID's first row, 1 for its second row, ...
    round_no = by_id.groupby(id_col).cumcount().to_numpy()

    # `by_id` is already ordered by ID, so a stable argsort on the round number
    # yields (round, id) order without repeatedly looking up each group.
    interleaved = by_id.iloc[round_no.argsort(kind='stable')]

    arranged_df = pd.DataFrame({
        'id': interleaved[id_col].to_numpy(),
        'question': interleaved[ques_col].to_numpy(),
        'response': interleaved[response_col].to_numpy(),
    })
    arranged_df = arranged_df.dropna(subset=['response']).reset_index(drop=True)
    print(arranged_df.head(10))

    assert arranged_df.shape[0] == df.shape[0], (
        "arrange_df changed the number of rows; the input probably contains missing responses"
    )
    return arranged_df

In [6]:
# Function to get Train and Val DataFrames - No Common IDs 
def split_train_val(df, ratio=0.8, seed=None):
    """Split ``df`` into train and validation frames that share no IDs.

    The unique values of the ``id`` column are put into a deterministic order,
    shuffled with a seeded RNG, and the first ``int(ratio * n_ids)`` IDs go to
    the training frame; all rows of an ID land in the same frame.

    Args:
        df: DataFrame with an ``id`` column.
        ratio: Fraction of unique IDs (not rows) assigned to the training frame.
        seed: Seed for the shuffle. Defaults to the notebook-level ``SEED``.
            Note that this reseeds Python's global ``random`` module.

    Returns:
        Tuple ``(df_train, df_val)`` of independent copies.
    """
    if seed is None:
        seed = SEED

    # Iterating a set depends on hash order, which for strings varies between
    # interpreter runs; sort first so the same seed always gives the same split.
    unique_ids = df['id'].unique().tolist()
    try:
        ids = sorted(unique_ids)
    except TypeError:
        # Values that cannot be compared with each other: keep first-appearance order.
        ids = unique_ids

    random.seed(seed)
    random.shuffle(ids)

    ntrain = int(ratio*len(ids))
    train_ids = ids[:ntrain]
    val_ids = ids[ntrain:]

    df_train = df[df['id'].isin(train_ids)].copy()
    df_val = df[df['id'].isin(val_ids)].copy()

    print("Train shape: ", df_train.shape)
    print("Val shape: ", df_val.shape)
    # Guard against an empty input, which would otherwise divide by zero.
    n_rows = df_train.shape[0] + df_val.shape[0]
    train_share = df_train.shape[0]/n_rows if n_rows else float('nan')
    print("Train-Val distribution: ", train_share)

    return df_train, df_val 

In [7]:
def prepare_data(col_name, path=DATA_PATH, arrange_train=False, remove_train_duplicates=True, remove_val_duplicates=True, split_ratio=1.0):
    """Load a CSV of questions/responses and build train/val datasets with prompt text.

    Steps: read the CSV, rename ``col_name`` to ``response``, split by ID with
    ``split_train_val``, optionally keep only one row per ID, optionally
    interleave the training rows with ``arrange_df``, then convert both frames
    to ``datasets.Dataset`` objects and add a ``text`` column via
    ``formatting_prompts_func``.

    Args:
        col_name: Name of the CSV column to use as the response.
        path: CSV file to read. The default is the value ``DATA_PATH`` had when
            this function was defined; reassigning ``DATA_PATH`` afterwards
            does not change it.
        arrange_train: If True, reorder the training rows with ``arrange_df``.
        remove_train_duplicates: If True, keep only the first row per ID in the
            training frame (after sorting by ID) and print its shape and head.
        remove_val_duplicates: If True, keep only the first row per ID in the
            validation frame.
        split_ratio: Fraction of unique IDs assigned to training; with 1.0 the
            validation split is empty.

    Returns:
        ``datasets.DatasetDict`` with the keys ``"train"`` and ``"val"``.
    """
    df = pd.read_csv(path)
    if 'id' not in df.columns:
        # No explicit ID column: use the row number as the ID.
        df = df.reset_index(drop=False).rename(columns={'index': 'id'})

    df = df.rename(columns={col_name: 'response'})

    # Get Train and Val DFs
    df_train, df_val = split_train_val(df, split_ratio)
    df_train = df_train.sort_values(by=['id'])
    df_val = df_val.sort_values(by=['id'])

    if remove_train_duplicates:
        print("Removing Train Duplicates")
        df_train = df_train.drop_duplicates(subset=['id']).reset_index(drop=True)
        print(df_train.shape)
        print(df_train.head())

    # Arrange DF Train
    if arrange_train:
        df_train = arrange_df(df_train)

    # Remove Duplicates for Validation DF
    if remove_val_duplicates:
        df_val = df_val.drop_duplicates(subset=['id']).reset_index(drop=True)

    # Convert to Dataset. preserve_index=False keeps the pandas index out of the
    # dataset; otherwise a filtered/sorted frame whose index was not reset adds
    # an extra `__index_level_0__` column depending on the flags above.
    columns = ['id', 'question', 'response']
    dataset_train = datasets.Dataset.from_pandas(df_train[columns], preserve_index=False)
    dataset_val = datasets.Dataset.from_pandas(df_val[columns], preserve_index=False)

    # Add the formatted prompt text to both splits.
    dataset_train = dataset_train.map(formatting_prompts_func, batched = True)
    dataset_val = dataset_val.map(formatting_prompts_func, batched = True)

    return datasets.DatasetDict({"train":dataset_train, "val":dataset_val})

In [8]:
## Format Text 
# Prompt template with two positional `{}` slots: the first is filled with the question
# (under "### Input:"), the second with the response text (under "### Response:").
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Solve the following math word problem step-by-step.

### Input:
{}

### Response:
{}"""

# End-of-sequence string taken from the loaded tokenizer; appended to every formatted example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of question/response pairs into full training texts.

    Args:
        examples: Batch mapping with parallel ``"question"`` and ``"response"``
            sequences.

    Returns:
        Dict with a single key ``"text"`` holding one formatted string per pair.
    """
    pairs = zip(examples["question"], examples["response"])
    # Every sample ends with EOS_TOKEN; without it generation would go on forever.
    rendered = [
        alpaca_prompt.format(question, answer) + EOS_TOKEN
        for question, answer in pairs
    ]
    return {"text": rendered}


In [9]:
def prepare_data_gt(path, split_ratio = 0.0):
    """
    Loads data from disk, formats the prompt and casts into the correct dataset structure.

    Args:
        path: JSON file holding a list of records with the keys
            ``source_question``, ``target_steps`` and ``target_result``.
        split_ratio: Value handed to ``Dataset.train_test_split`` as its first
            argument, i.e. the size of the held-out split. A falsy value
            (the default 0.0) disables splitting.

    Returns:
        ``datasets.DatasetDict`` with a ``"train"`` split and, when
        ``split_ratio`` is truthy, a ``"val"`` split. Each row has one
        ``"text"`` field.
    """
    import json

    # Load data from disk (explicit encoding so the result does not depend on the locale)
    with open(path, "r", encoding="utf-8") as f:
        my_data = json.load(f)

    # Format prompt: the response slot holds the solution steps, a final-answer
    # line and the tokenizer's EOS token.
    train_data = [
        {
            "text": alpaca_prompt.format(
                element["source_question"],
                element["target_steps"] + f"\nFinal Answer: {element['target_result']}" + tokenizer.eos_token
            )
        }
        for element in my_data
    ]

    # Format into huggingface dataset
    ds = datasets.Dataset.from_pandas(pd.DataFrame(data=train_data))

    if split_ratio:
        # Seeded so the same rows end up in the validation split on every run.
        ds = ds.train_test_split(split_ratio, seed=SEED)
        ds["val"] = ds["test"]
        del ds["test"]
        print("Train:")
        print(ds["train"])
        print("Val:")
        print(ds["val"])
    else:
        ds = datasets.DatasetDict({"train":ds})
        print("Train:")
        print(ds["train"])

    return ds

In [10]:
# My data
OUTPUT_PATH = "/cluster/work/lawecon/Work/mbraasch/output/dl_my_format"
DATA_PATH = "/cluster/work/lawecon/Work/mbraasch/data/formatted/train_dl_my_format.json"
#DATA_PATH = "/cluster/work/lawecon/Work/mbraasch/data/formatted/train_filtered.json"
split_ratio = 1.0

# Keep each dataset under its own name. Both calls used to assign to `data`, so this
# first result was discarded immediately and `my_data` / `sh_data` (read by test())
# were never defined.
my_data = prepare_data_gt(path=DATA_PATH, split_ratio=1-split_ratio)

# Shivam data (read from prepare_data's default CSV path, not from DATA_PATH above)
sh_data = prepare_data(col_name='output_answer', arrange_train=False, remove_train_duplicates=False, remove_val_duplicates=True, split_ratio=split_ratio)

# Dataset handed to the trainer. This keeps the previous effective behaviour (the last
# assignment won). NOTE(review): OUTPUT_PATH is named after "my format" - confirm
# whether `my_data` was meant to be trained on instead.
data = sh_data

Train:
Dataset({
    features: ['text'],
    num_rows: 6228
})
Train shape:  (6228, 8)
Val shape:  (0, 8)
Train-Val distribution:  1.0


Map:   0%|          | 0/6228 [00:00<?, ? examples/s]

In [11]:
def test():
    print("My data:")
    print("##############################")
    print(my_data["train"][0]["text"])
    print("##############################\n")
    print("Shivam data:")
    print("##############################")
    print(sh_data["train"][0]["text"])
    print("##############################")

### LoRA

In [12]:
# Attach LoRA adapters to the attention and MLP projection layers listed in `target_modules`.
# Note: `model` is replaced by the result of wrapping itself, so re-running this cell
# without reloading the base model applies get_peft_model to an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = SEED,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.5 patched 18 layers with 18 QKV layers, 18 O layers and 18 MLP layers.


In [13]:
import wandb 
## wandb variables
wandb.login(relogin=True, key='edcec5761dfce8c0c60778393ae2f6ceba79df18')
%env WANDB_PROJECT=gemma-2b-it


ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /cluster/home/mbraasch/.netrc


env: WANDB_PROJECT=gemma-2b-it


In [14]:
# Training configuration: log and checkpoint once per epoch, report to W&B,
# no evaluation (the evaluation-related options are commented out).
train_args = TrainingArguments(
    #evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    save_strategy = "epoch",
    #save_steps = "epoch",
    #load_best_model_at_end=True,
    # metric_for_best_model = "loss",
    #logging_steps = 50,

    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 1,
    # NOTE(review): warmup_steps is set, but lr_scheduler_type below is "constant";
    # presumably the plain constant schedule applies no warmup - confirm whether
    # "constant_with_warmup" was intended.
    warmup_steps = 5,
    num_train_epochs = 2,
    learning_rate = 2e-4,
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
    # optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "constant",
    seed = SEED,
    output_dir = OUTPUT_PATH,
    report_to = "wandb",
    save_total_limit = None,
    # The full output directory path doubles as the run name.
    run_name=OUTPUT_PATH
    )

In [15]:
# Build the supervised fine-tuning trainer on the "text" column of the training split.
# `data` is whichever dataset was assigned last in the data-loading cell.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = data["train"],
    #eval_dataset = data["val"],
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    # dataset_num_proc = 2,
    # presumably packs several examples into each MAX_SEQ_LENGTH-token sequence, which
    # would explain why the training log reports fewer examples (1,481) than the
    # dataset has rows (6,228) - TODO confirm
    packing = True, 
    args = train_args
    )



In [16]:
# Start the W&B run explicitly.
# NOTE(review): the project name passed here differs from the WANDB_PROJECT value set
# in the login cell ("gemma-2b-it") - confirm which project the run should go to.
wandb.init(settings=wandb.Settings(start_method='fork'), project='gemma-2b-it-lora-cot-no-duplicates')

[34m[1mwandb[0m: Currently logged in as: [33mmarcelbraasch[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
#@title Show current memory stats
def _bytes_to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to three decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)

# Properties of GPU 0 and the peak memory reserved so far.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gib(gpu_stats.total_memory)

for report_line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(report_line)

GPU = NVIDIA GeForce RTX 4090. Max memory = 23.65 GB.
5.031 GB of memory reserved.


In [18]:
# Run fine-tuning and keep the object returned by train() in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,481 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
\        /    Total batch size = 8 | Total steps = 372
 "-____-"     Number of trainable parameters = 19,611,648


Step,Training Loss
186,0.6428
372,0.5125


In [19]:
# Close the W&B run opened by wandb.init() in the earlier cell.
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁██
train/global_step,▁██
train/grad_norm,▁█
train/learning_rate,▁▁
train/loss,█▁

0,1
total_flos,4.596555201065779e+16
train/epoch,2.0
train/global_step,372.0
train/grad_norm,1.2801
train/learning_rate,0.0002
train/loss,0.5125
train_loss,0.57764
train_runtime,412.6644
train_samples_per_second,7.178
train_steps_per_second,0.901
