Fine-tuning of gemma-2b-it

In [65]:
# ALL THE NECESSARY IMPORTS

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from tqdm import tqdm

from dataclasses import dataclass, field
from typing import Optional
from sklearn.model_selection import train_test_split


from datasets import load_dataset
from functools import partial
from peft import LoraConfig, TaskType, get_peft_model, get_peft_config

# Filepath to embeddings
fname = "/mnt/mimic/data/HAIM/mimic_extras/embeddings.csv"

Setting up the model

Different versions, with huggingface LoRA-class or custom Adapter-module.

In [46]:
# LoRA parameter efficient fine-tuning
# Parameters are freezed and small submodules with low-rank matrices ar inserted at the target layers.
# initialization of model
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="auto", quantization_config=quantization_config,attn_implementation="sdpa")
lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
    lora_alpha=16,
    lora_dropout=0.1
)

model = get_peft_model(model, lora_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Model-structure and trainable parameters (this can be tuned by hyperparameters)
model.print_trainable_parameters()
model

trainable params: 9,805,824 || all params: 2,515,978,240 || trainable%: 0.3897420034920493


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GemmaForCausalLM(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 2048, padding_idx=0)
        (layers): ModuleList(
          (0-17): 18 x GemmaDecoderLayer(
            (self_attn): GemmaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_la

In [2]:
# Adapter NN for parameter efficient fine-tuning
# Adapters (bottleneck feed-forward networks) are added as modules to the layers of the model
# adapting attention projections and MLP projections while freezing original model parameters

class Adapter(nn.Module):
    def __init__(self, size = 6, model_dim = 2048):
        super().__init__()
        self.adapter_block = nn.Sequential(
            nn.Linear(model_dim, size),
            nn.ReLU(),
            nn.Linear(size, model_dim)
        )

    def forward(self, x):

        output = self.adapter_block(x)
        adapter_out = output + x

        return adapter_out


class Adaptered(nn.Module):
    def __init__(self, orig_layer):
        super().__init__()
        self.orig_layer = orig_layer
        self.adapter = Adapter()

    def forward(self, *x):
        orig_out = self.orig_layer(*x)
        output = (self.adapter.forward(orig_out[0].unsqueeze(0))[0],)

        return output



class model_with_adapter(nn.Module):

    def __init__(self):
        super().__init__()
        self.quantization_config = BitsAndBytesConfig(load_in_4bit=True)
        self.model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="auto", quantization_config=self.quantization_config,attn_implementation="sdpa")
        # Freeze the original model parameters
        for params in self.model.parameters():
            params.requires_grad = False
        # Embed adapter layers into the transformer blocks 
        for i, gemma_layer in enumerate(self.model.model.layers):
            gemma_layer.self_attn.q_proj = Adaptered(gemma_layer.self_attn.q_proj)
            gemma_layer.self_attn.k_proj = Adaptered(gemma_layer.self_attn.k_proj)
            gemma_layer.self_attn.v_proj = Adaptered(gemma_layer.self_attn.v_proj)
            gemma_layer.self_attn.o_proj = Adaptered(gemma_layer.self_attn.o_proj)
    
            gemma_layer.mlp.gate_proj = Adaptered(gemma_layer.mlp.gate_proj)
            gemma_layer.mlp.up_proj = Adaptered(gemma_layer.mlp.up_proj)
            gemma_layer.mlp.down_proj = Adaptered(gemma_layer.mlp.down_proj)

    def get_model(self):

        return self.model



In [3]:
# Custom get_parameters function
def get_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    trainable_percentage = (trainable_params / total_params) * 100

    trainable_params_str = "{:,}".format(trainable_params)
    total_params_str = "{:,}".format(total_params)

    print(f"trainable params: {trainable_params_str} || all params: {total_params_str} || trainable%: {trainable_percentage}")

In [4]:
# Initialization of adapter-model.
# 
model = model_with_adapter().to('cuda')

# Model-structure and trainable parameters (this can be tuned by hyperparameters)
get_parameters(model)
model

# OBS A lot less params, not sure why.. (maybe cause of degeneration? or mistake, need to look into)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 3,355,380 || all params: 1,518,623,476 || trainable%: 0.2209487771674616


model_with_adapter(
  (model): GemmaForCausalLM(
    (model): GemmaModel(
      (embed_tokens): Embedding(256000, 2048, padding_idx=0)
      (layers): ModuleList(
        (0-17): 18 x GemmaDecoderLayer(
          (self_attn): GemmaSdpaAttention(
            (q_proj): Adaptered(
              (orig_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
              (adapter): Adapter(
                (adapter_block): Sequential(
                  (0): Linear(in_features=2048, out_features=6, bias=True)
                  (1): ReLU()
                  (2): Linear(in_features=6, out_features=2048, bias=True)
                )
              )
            )
            (k_proj): Adaptered(
              (orig_layer): Linear4bit(in_features=2048, out_features=256, bias=False)
              (adapter): Adapter(
                (adapter_block): Sequential(
                  (0): Linear(in_features=2048, out_features=6, bias=True)
                  (1): ReLU()
                  (2):

Projection module

In [None]:
embedding_size = 1024
projection_size = 6

class ProjectionNN(nn.Module):
    def __init__(self):
        super(ProjectionNN, self).__init__()

        # Architecture
        self.fc1 = nn.Linear(embedding_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 2048 * projection_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = x.view(-1,6,2048)
        return x

Fetching and preprocessing of data

In [66]:
df = pd.read_csv(fname)
df_death_small48 = df[((df['img_length_of_stay'] < 48) & (df['death_status'] == 1))]
df_alive_big48 = df[((df['img_length_of_stay'] >= 48) & (df['death_status'] == 0))]
df_death_big48 = df[((df['img_length_of_stay'] >= 48) & (df['death_status'] == 1))]

df_death_small48['y'] = 1
df_alive_big48['y'] = 0
df_death_big48['y'] = 0
df = pd.concat([df_death_small48, df_alive_big48, df_death_big48], axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_death_small48['y'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_alive_big48['y'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_death_big48['y'] = 0


In [67]:
vd_cols = df.filter(regex='^vd_')
y_col = df[['y']]
haim_col = df[['haim_id']]
df = pd.concat([haim_col, vd_cols, y_col], axis=1)
print(df.head())

     haim_id      vd_0      vd_1      vd_2  ...   vd_1021   vd_1022   vd_1023  y
256     6557  0.005299  0.082119  0.274407  ...  0.000203  0.024739  0.005796  1
259     6557  0.000000  0.079306  0.381579  ...  0.000000  0.022821  0.011584  1
267     6558  0.005299  0.082119  0.274407  ...  0.000203  0.024739  0.005796  1
270     6558  0.000000  0.079306  0.381579  ...  0.000000  0.022821  0.011584  1
319     6581  0.002288  0.078941  0.088397  ...  0.000000  0.013369  0.168811  1

[5 rows x 1026 columns]


In [68]:
def data_split(df, pkl_list):
    train_id, test_id = train_test_split(pkl_list, test_size=0.3)
    
    train_idx = df[df['haim_id'].isin(train_id)]['haim_id'].tolist()
    test_idx = df[df['haim_id'].isin(test_id)]['haim_id'].tolist()

    x_train = df[df['haim_id'].isin(train_idx)].drop(['haim_id','y'],axis=1).values
    x_test = df[df['haim_id'].isin(test_idx)].drop(['haim_id','y'],axis=1).values

    y_train = df[df['haim_id'].isin(train_idx)]['y'].values
    y_test = df[df['haim_id'].isin(test_idx)]['y'].values

    return x_train, x_test, y_train, y_test

In [78]:
input_embeddings = torch.tensor(df.iloc[:, 1:1025].values, dtype=torch.float32)
labels = df['y'].apply(lambda x: 'yes' if x == 0 else 'no').tolist()
list_of_lists = df.iloc[:, 1:1025].values.tolist()

In [9]:
# Prompt function to be fed into training loop

def formatting_func(example, emb, label):
    text = f"### INSTRUCTION: {'Use this input to create the correct label.'}\n### INPUT: {emb}\n### LABEL: {label}"
    return text

In [74]:
class CustomDataset(Dataset):
    def __init__(self, embedding, labels): #removing formatting function
        self.labels = labels
        self.embedding = embedding
        #self.formatting_func = formatting_func

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        label = self.labels[idx]
        emb = self.embedding[idx]
        sample = {"Emb": emb, "Class": label} #"Func": self.formatting_func
        return sample

In [63]:
numerical_embeddings = torch.tensor([
    [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1],
    [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
    [0.2, 0.4, 0.6, 0.8, 1.0, 0.8, 0.6, 0.4, 0.2, 0.0],
    [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0]
])

# Corresponding labels
labels = ['yes', 'no', 'maybe', 'yes', 'no']

print("Numerical Embeddings:")
print(numerical_embeddings)
print("\nCorresponding Labels:")
print(labels)

CD = CustomDataset(numerical_embeddings, labels)

Numerical Embeddings:
tensor([[0.1000, 0.2000, 0.3000, 0.4000, 0.5000, 0.6000, 0.7000, 0.8000, 0.9000,
         1.0000],
        [1.0000, 0.9000, 0.8000, 0.7000, 0.6000, 0.5000, 0.4000, 0.3000, 0.2000,
         0.1000],
        [0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000,
         0.5000],
        [0.2000, 0.4000, 0.6000, 0.8000, 1.0000, 0.8000, 0.6000, 0.4000, 0.2000,
         0.0000],
        [0.9000, 0.8000, 0.7000, 0.6000, 0.5000, 0.4000, 0.3000, 0.2000, 0.1000,
         0.0000]])

Corresponding Labels:
['yes', 'no', 'maybe', 'yes', 'no']


In [110]:
def collate_batch(batch):
     
    emb_list, classes = [], []
    for thing in batch:
        #print(batch)
        emb_list.append(thing['Emb'])
        classes.append(tokenizer(thing['Class'], return_tensors="pt"))
    text = torch.tensor(emb_list)
    classes = torch.cat([item['input_ids'] for item in classes], dim=0)
    return text, classes

In [111]:
CD = CustomDataset(list_of_lists, labels)

DL_DS = DataLoader(CD, batch_size=2, collate_fn=collate_batch)

In [112]:
first_five_batches = [next(iter(DL_DS)) for _ in range(5)]
print(first_five_batches)

[(tensor([[5.2990e-03, 8.2119e-02, 2.7441e-01,  ..., 2.0308e-04, 2.4739e-02,
         5.7963e-03],
        [0.0000e+00, 7.9306e-02, 3.8158e-01,  ..., 0.0000e+00, 2.2821e-02,
         1.1584e-02]]), tensor([[  2, 956],
        [  2, 956]])), (tensor([[5.2990e-03, 8.2119e-02, 2.7441e-01,  ..., 2.0308e-04, 2.4739e-02,
         5.7963e-03],
        [0.0000e+00, 7.9306e-02, 3.8158e-01,  ..., 0.0000e+00, 2.2821e-02,
         1.1584e-02]]), tensor([[  2, 956],
        [  2, 956]])), (tensor([[5.2990e-03, 8.2119e-02, 2.7441e-01,  ..., 2.0308e-04, 2.4739e-02,
         5.7963e-03],
        [0.0000e+00, 7.9306e-02, 3.8158e-01,  ..., 0.0000e+00, 2.2821e-02,
         1.1584e-02]]), tensor([[  2, 956],
        [  2, 956]])), (tensor([[5.2990e-03, 8.2119e-02, 2.7441e-01,  ..., 2.0308e-04, 2.4739e-02,
         5.7963e-03],
        [0.0000e+00, 7.9306e-02, 3.8158e-01,  ..., 0.0000e+00, 2.2821e-02,
         1.1584e-02]]), tensor([[  2, 956],
        [  2, 956]])), (tensor([[5.2990e-03, 8.2119e-02, 2.744

Projection module

In [None]:
embedding_size = 1024
projection_size = 6

class ProjectionNN(nn.Module):
    def __init__(self):
        super(ProjectionNN, self).__init__()

        # Architecture
        self.fc1 = nn.Linear(embedding_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 2048 * projection_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = x.view(-1,6,2048)
        return x

Training functions

In [None]:
def train_epoch(model, gemma, optimizer, loss_fn, train_loader, device):
    # Train:
    model.train()
    train_loss_batches, train_acc_batches = [], []
    num_batches = len(train_loader)
    for batch_index, (x, y) in enumerate(train_loader, 1):
        inputs, labels = x.to(device), y.to(device)
        optimizer.zero_grad()
        embs = model.forward(inputs)
        
        

        loss = loss_fn(z, labels.float())
        loss.backward()
        optimizer.step()
        train_loss_batches.append(loss.item())

        hard_preds = output_to_label(z)
        acc_batch_avg = (hard_preds == labels).float().mean().item()
        train_acc_batches.append(acc_batch_avg)

    return model, train_loss_batches, train_acc_batches

def validate(model, loss_fn, val_loader, device):
    val_loss_cum = 0
    val_acc_cum = 0
    model.eval()
    with torch.no_grad():
        for batch_index, (x, y) in enumerate(val_loader, 1):
            inputs, labels = x.to(device), y.to(device)
            z = model.forward(inputs)

            batch_loss = loss_fn(z, labels.float())
            val_loss_cum += batch_loss.item()
            hard_preds = output_to_label(z)
            acc_batch_avg = (hard_preds == labels).float().mean().item()
            val_acc_cum += acc_batch_avg
    return val_loss_cum/len(val_loader), val_acc_cum/len(val_loader)

In [None]:
def training_loop(model, optimizer, loss_fn, train_loader, val_loader, num_epochs, print_every):
    print("Starting training")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    train_losses, train_accs, val_losses, val_accs = [], [], [], []

    for epoch in range(1, num_epochs+1):
        model, train_loss, train_acc = train_epoch(model,
                                                   optimizer,
                                                   loss_fn,
                                                   train_loader,
                                                   val_loader,
                                                   device,
                                                   print_every)
        val_loss, val_acc = validate(model, loss_fn, val_loader, device)
        print(f"Epoch {epoch}/{num_epochs}: "
              f"Train loss: {sum(train_loss)/len(train_loss):.3f}, "
              f"Train acc.: {sum(train_acc)/len(train_acc):.3f}, "
              f"Val. loss: {val_loss:.3f}, "
              f"Val. acc.: {val_acc:.3f}")
        train_losses.extend(train_loss)
        train_accs.extend(train_acc)
        val_losses.append(val_loss)
        val_accs.append(val_acc)
    return model, train_losses, train_accs, val_losses, val_accs

In [49]:
input_text = "Hello, can you tell me what the input_embs are saying."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
#print(input_ids)
#print(next(model.children()))
tmp = model(**input_ids, output_hidden_states=True).hidden_states[-1]

outputs = model.generate(**input_ids, inputs_embeds=tmp, max_length=50)
print(tokenizer.decode(outputs[0]))

<bos>Hello, can you tell me what the input_embs are saying.，,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [37]:
# conerting hidden to tokens (not working great)

print(tmp)

token_ids = torch.argmax(tmp, dim=-1)

decoded_tokens = tokenizer.convert_ids_to_tokens(token_ids[0].tolist())
converted_token_ids = tokenizer.convert_tokens_to_ids(decoded_tokens)
print(converted_token_ids)
tokenizer.decode(converted_token_ids)

tensor([[[ 0.3562, -0.8369, -0.2644,  ..., -0.4573, -0.5049, -0.4429],
         [-1.0010, -1.4258, -1.4482,  ..., -0.8257, -0.2101, -1.2383],
         [-0.7925,  0.6221, -1.1826,  ..., -0.4849,  0.8032, -1.1758],
         ...,
         [-0.2217, -0.6143,  0.0714,  ...,  2.7500, -1.4404, -1.5752],
         [ 0.4551, -1.8252,  0.0773,  ...,  1.9189, -0.7290, -1.5742],
         [ 0.7378,  0.9209,  0.1779,  ...,  0.3516, -0.0546, -0.5752]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<MulBackward0>)
[924, 1959, 1959, 1959, 1959, 1959, 1959, 1959, 1959, 1959, 1645, 981, 1790, 1959, 1959, 1959]


'ieslesslesslesslesslesslesslesslesslessuserнаinesslesslessless'