# Notebook for testing mutlimodal capability of Gemma

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Filepath to embeddings
fname = "/mnt/mimic/data/HAIM/mimic_extras/embeddings.csv"

# YES-TOKEN: 3553
# NO-TOKEN: 1294

In [2]:
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="auto", quantization_config=quantization_config)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
embedding_size = 1024
projection_size = 6

class ProjectionNN(nn.Module):
    def __init__(self):
        super(ProjectionNN, self).__init__()

        # Architecture
        self.fc1 = nn.Linear(embedding_size, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 2048 * projection_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = x.view(-1,6,2048)
        return x


### Load and pre-process data

In [4]:
df = pd.read_csv(fname)
condition_death_small48 = (df['img_length_of_stay'] < 48) & (df['death_status'] == 1)
condition_alive_big48 = (df['img_length_of_stay'] >= 48) & (df['death_status'] == 0)
condition_death_big48 = (df['img_length_of_stay'] >= 48) & (df['death_status'] == 1)

# Use .loc to avoid SettingWithCopyWarning
df.loc[condition_death_small48, 'y'] = 1
df.loc[condition_alive_big48, 'y'] = 0
df.loc[condition_death_big48, 'y'] = 0

In [5]:
vd_cols = df.filter(regex='^vd_')
y_col = df[['y']]
haim_col = df[['haim_id']]
df = pd.concat([haim_col, vd_cols, y_col], axis=1)

pkl_list = df['haim_id'].unique().tolist()

print(df.head())

   haim_id      vd_0      vd_1      vd_2      vd_3      vd_4      vd_5  \
0     6514  0.000000  0.102385  0.188977  0.007367  0.219433  0.000106   
1     6514  0.000399  0.063669  0.297278  0.007873  0.288133  0.000000   
2     6515  0.000000  0.073280  0.390735  0.007879  0.094356  0.006252   
3     6515  0.000000  0.003337  0.084882  0.008524  0.030514  0.000936   
4     6515  0.000121  0.098648  0.514754  0.001866  0.211975  0.011927   

       vd_6      vd_7      vd_8  ...   vd_1015   vd_1016   vd_1017   vd_1018  \
0  0.074859  0.017974  0.138016  ...  0.010239  0.000589  0.000743  0.102930   
1  0.099269  0.004799  0.215243  ...  0.000000  0.013072  0.000000  0.078393   
2  0.113489  0.021230  0.324026  ...  0.173980  0.009676  0.095614  0.052150   
3  0.242137  0.027981  0.025548  ...  0.071969  0.000301  0.142212  0.017643   
4  0.081207  0.010555  0.364878  ...  0.204686  0.013269  0.134133  0.044195   

    vd_1019   vd_1020   vd_1021   vd_1022   vd_1023    y  
0  0.008906  0.

### Setup functions for training

*Data splitter*

In [6]:
def data_split(df, pkl_list):
    train_id, test_id = train_test_split(pkl_list, test_size=0.3)
    
    train_idx = df[df['haim_id'].isin(train_id)]['haim_id'].tolist()
    test_idx = df[df['haim_id'].isin(test_id)]['haim_id'].tolist()

    x_train = df[df['haim_id'].isin(train_idx)].drop(['haim_id','y'],axis=1).values
    x_test = df[df['haim_id'].isin(test_idx)].drop(['haim_id','y'],axis=1).values

    y_train = df[df['haim_id'].isin(train_idx)]['y'].values
    y_test = df[df['haim_id'].isin(test_idx)]['y'].values

    return x_train, x_test, y_train, y_test

*Train/Val funcs (needs to be updated)*

In [None]:
def custom_output(emb, gemma):
    outputs = gemma(inputs_embeds=emb)
    noyes = [1294,3553]
    logits = outputs['logits']
    logits = logits[:,1,noyes]
    return logits

def output_to_label(logits):
    probs = torch.softmax(logits, dim=-1)
    predicted_token_id = torch.argmax(probs, dim=-1)
    return predicted_token_id

    
def train_epoch(model, gemma, optimizer, loss_fn, train_loader, device, word_embs):
    # Train:
    model.train()
    train_loss_batches, train_acc_batches = [], []
    num_batches = len(train_loader)
    for batch_index, (x, y) in enumerate(train_loader, 1):
        inputs, labels = x.to(device), y.to(device)
        optimizer.zero_grad()

        emb = model.forward(inputs)
        concatted = torch.cat((word_embs,emb), dim=1).to(torch.float16)
        logits = custom_output(concatted, gemma)
        
        loss = loss_fn(logits, labels.float())
        loss.backward()
        optimizer.step()
        train_loss_batches.append(loss.item())

        hard_preds = output_to_label(logits)
        acc_batch_avg = (hard_preds == labels).float().mean().item()
        train_acc_batches.append(acc_batch_avg)

    return model, train_loss_batches, train_acc_batches

def validate(model, gemma, loss_fn, val_loader, device, word_embs):
    val_loss_cum = 0
    val_acc_cum = 0
    model.eval()
    with torch.no_grad():
        for batch_index, (x, y) in enumerate(val_loader, 1):
            inputs, labels = x.to(device), y.to(device)
            emb = model.forward(inputs)
            concatted = torch.cat((word_embs,emb), dim=1).to(torch.float16)
            logits = custom_output(concatted, gemma)

            batch_loss = loss_fn(logits, labels.float())
            val_loss_cum += batch_loss.item()
            hard_preds = output_to_label(logits)
            acc_batch_avg = (hard_preds == labels).float().mean().item()
            val_acc_cum += acc_batch_avg
    return val_loss_cum/len(val_loader), val_acc_cum/len(val_loader)

*Training framework*

In [None]:
def training_loop(model, gemma, optimizer, loss_fn, train_loader, val_loader, num_epochs, word_embs):
    print("Starting training")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    train_losses, train_accs, val_losses, val_accs = [], [], [], []

    for epoch in range(1, num_epochs+1):
        model, train_loss, train_acc = train_epoch(model,
                                                   gemma,
                                                   optimizer,
                                                   loss_fn,
                                                   train_loader,
                                                   val_loader,
                                                   device,
                                                   word_embs)
        val_loss, val_acc = validate(model, loss_fn, val_loader, device)
        print(f"Epoch {epoch}/{num_epochs}: "
              f"Train loss: {sum(train_loss)/len(train_loss):.3f}, "
              f"Train acc.: {sum(train_acc)/len(train_acc):.3f}, "
              f"Val. loss: {val_loss:.3f}, "
              f"Val. acc.: {val_acc:.3f}")
        train_losses.extend(train_loss)
        train_accs.extend(train_acc)
        val_losses.append(val_loss)
        val_accs.append(val_acc)
    return model, train_losses, train_accs, val_losses, val_accs

# Testing out gemma instruct on text generation

In [19]:
input_text = "Hello"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
print(input_ids.input_ids)
#print(model)
tmp = model.get_input_embeddings().weight[input_ids.input_ids]
tmp.to(device='cuda')

conc = torch.cat((tmp,projected), dim=1).to(torch.float16)

outputs = model(inputs_embeds=conc)
noyes = [1294,3553]
logits = outputs['logits']
print(logits)
logits = logits[:,1,noyes]

probs = torch.softmax(logits, dim=-1)
print(probs)
predicted_token_id = torch.argmax(probs, dim=-1)

if predicted_token_id.item() == 0:
    predicted_token_id[0] = 1294
else:
    predicted_token_id[0] = 3553

decoded_token = tokenizer.decode(predicted_token_id[0])
print(decoded_token)



tensor([[   2, 4521]], device='cuda:0')
tensor([[[   3.1172,    7.4336,   44.9062,  ...,   -5.1055,   -1.9609,
             3.5332],
         [ -30.8125,  -10.2188,  -24.0625,  ...,  -26.6250,  -28.3281,
           -30.1250],
         [-322.5000, -117.3125, -173.8750,  ..., -216.5000, -243.1250,
          -322.0000],
         ...,
         [-378.2500, -141.1250, -184.1250,  ..., -234.2500, -265.5000,
          -378.0000],
         [  -4.4102,  -14.7109,  -51.8125,  ...,   -4.0430,   -0.5557,
            -3.7812],
         [ -24.9375,   -8.6328,  -48.2812,  ...,  -18.4219,  -13.7891,
           -24.3125]]], device='cuda:0', grad_fn=<ToCopyBackward0>)
tensor([[0.0716, 0.9284]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Yes


In [7]:
train, _, _, _ = data_split(df, pkl_list)

In [8]:
proj = ProjectionNN()

tmp12 = torch.tensor(train[0]).float()

projected = proj(tmp12).to(device='cuda').to(torch.float16)
print(projected.size())

torch.Size([1, 6, 2048])


In [21]:
input_text = "Is smoking good for you"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
test = model.get_input_embeddings().weight[input_ids.input_ids]
#print(input_ids)

outputs = model.generate(inputs_embeds=test, max_length = 150)

notoken = torch.tensor(1294).to(device='cuda')
yestoken = torch.tensor(3553).to(device='cuda')

print(tokenizer.decode(outputs[0]))

Yes
?

Sure, here's a summary of the information I found about the topic:

**Disclaimer:** I am not a medical professional and cannot provide medical advice. Please consult with a healthcare provider for any health concerns or questions.

**Smoking can have both positive and negative effects on your health.**

**Positive effects:**

* **Reduced risk of heart disease:** Smoking can lower blood pressure, cholesterol levels, and other risk factors for heart disease.
* **Reduced risk of stroke:** Smoking can reduce the risk of stroke by up to 50%.
* **Lower risk of cancer:** Smoking can reduce the risk of certain types of cancer, such as lung cancer, mouth cancer, and throat cancer
