# Notebook for testing the multimodal capabilities of Gemma

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split

# Path to the embeddings CSV that is loaded with pd.read_csv further down.
# NOTE(review): this is an absolute, machine-specific path; adjust it for your environment.
fname = "/mnt/mimic/data/HAIM/mimic_extras/embeddings.csv"

In [None]:
# Quantization settings: ask for the weights to be loaded in 4-bit precision.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Tokenizer and causal LM both come from the same "google/gemma-2b-it" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    quantization_config=quantization_config,
    device_map="auto",  # delegate device placement to the library
)

In [None]:
# Default layer sizes used by ProjectionNN when no explicit sizes are passed.
# NOTE(review): the df.head() output printed in this notebook shows feature
# columns vd_0 ... vd_1023 (1024 values per row), while embedding_size is 100.
# Confirm which input this network is meant to receive.
embedding_size = 100
projection_size = 6

class ProjectionNN(nn.Module):
    """Two-layer feed-forward projection: Linear -> ReLU -> Linear.

    Args:
        input_dim: Size of the last dimension of the input. When None, the
            module-level ``embedding_size`` is read at construction time.
        hidden_dim: Width of the hidden layer (128 by default).
        output_dim: Size of the last dimension of the output. When None, the
            module-level ``projection_size`` is read at construction time.

    Calling ``ProjectionNN()`` with no arguments builds exactly the same
    layers as before the sizes became configurable.
    """

    def __init__(self, input_dim=None, hidden_dim=128, output_dim=None):
        super().__init__()

        # Resolve defaults lazily so that changing the module-level constants
        # before instantiation still takes effect.
        if input_dim is None:
            input_dim = embedding_size
        if output_dim is None:
            output_dim = projection_size

        # Architecture (attribute names kept so existing state dicts still load)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply fc1, ReLU and fc2 in sequence and return the result."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


### Load and pre-process data

In [5]:
df = pd.read_csv(fname)

# Boolean row masks built from 'img_length_of_stay' (threshold 48) and 'death_status'.
condition_death_small48 = df['img_length_of_stay'].lt(48) & df['death_status'].eq(1)
condition_alive_big48 = df['img_length_of_stay'].ge(48) & df['death_status'].eq(0)
condition_death_big48 = df['img_length_of_stay'].ge(48) & df['death_status'].eq(1)

# Label column 'y': 1 for death with stay < 48, 0 for any row with stay >= 48
# matched by the two remaining masks. Assigning through .loc avoids
# SettingWithCopyWarning.
# NOTE(review): rows matched by none of the masks (e.g. death_status == 0 with
# img_length_of_stay < 48) are never assigned and keep NaN in 'y'. Confirm
# whether they should be dropped before training.
df.loc[condition_death_small48, 'y'] = 1
df.loc[condition_alive_big48 | condition_death_big48, 'y'] = 0

In [6]:
# Reduce the frame to the identifier, the vd_* feature columns and the label.
vd_cols = df.filter(regex='^vd_')  # every column whose name starts with "vd_"
y_col = df[['y']]
haim_col = df[['haim_id']]
# Rebinds df to the reduced frame; column order becomes haim_id, vd_*, y.
df = pd.concat([haim_col, vd_cols, y_col], axis=1)

# List of the distinct haim_id values present in the reduced frame.
pkl_list = df['haim_id'].unique().tolist()

print(df.head())

   haim_id      vd_0      vd_1      vd_2      vd_3      vd_4      vd_5  \
0     6514  0.000000  0.102385  0.188977  0.007367  0.219433  0.000106   
1     6514  0.000399  0.063669  0.297278  0.007873  0.288133  0.000000   
2     6515  0.000000  0.073280  0.390735  0.007879  0.094356  0.006252   
3     6515  0.000000  0.003337  0.084882  0.008524  0.030514  0.000936   
4     6515  0.000121  0.098648  0.514754  0.001866  0.211975  0.011927   

       vd_6      vd_7      vd_8  ...   vd_1015   vd_1016   vd_1017   vd_1018  \
0  0.074859  0.017974  0.138016  ...  0.010239  0.000589  0.000743  0.102930   
1  0.099269  0.004799  0.215243  ...  0.000000  0.013072  0.000000  0.078393   
2  0.113489  0.021230  0.324026  ...  0.173980  0.009676  0.095614  0.052150   
3  0.242137  0.027981  0.025548  ...  0.071969  0.000301  0.142212  0.017643   
4  0.081207  0.010555  0.364878  ...  0.204686  0.013269  0.134133  0.044195   

    vd_1019   vd_1020   vd_1021   vd_1022   vd_1023    y  
0  0.008906  0.

### Set up train/test split and loss functions

In [None]:
def data_split(df, test_size=0.3, random_state=None):
    """Split ``df`` into train/test features and labels, grouped by ``haim_id``.

    The distinct ``haim_id`` values of ``df`` are partitioned with
    ``train_test_split``; every row then follows its ``haim_id``, so rows
    sharing an id never end up on both sides of the split.

    Args:
        df: DataFrame containing a ``haim_id`` column, a ``y`` label column and
            the feature columns.
        test_size: Passed to ``train_test_split`` (0.3 by default, as before).
        random_state: Passed to ``train_test_split``; set it to an int for a
            reproducible split. ``None`` keeps the previous unseeded behavior.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``: the feature frames have
        ``haim_id`` and ``y`` dropped, the label objects are the ``y`` column
        restricted to the corresponding rows.
    """
    # Derive the ids from the frame that was actually passed in, rather than
    # from a module-level list that may describe a different frame.
    unique_ids = df['haim_id'].unique().tolist()
    train_id, test_id = train_test_split(
        unique_ids, test_size=test_size, random_state=random_state
    )

    # Row masks are computed once and reused for both features and labels.
    in_train = df['haim_id'].isin(train_id)
    in_test = df['haim_id'].isin(test_id)

    x_train = df[in_train].drop(['haim_id', 'y'], axis=1)
    x_test = df[in_test].drop(['haim_id', 'y'], axis=1)

    y_train = df[in_train]['y']
    y_test = df[in_test]['y']

    return x_train, x_test, y_train, y_test

# Testing out Gemma instruct on text generation

In [None]:
# Prompt used for a quick text-generation smoke test.
input_text = "Write me a poem about Machine Learning."
# Tokenize and move the encoded prompt to the device the model lives on.
# The model is loaded with device_map="auto", so use model.device instead of
# hard-coding "cuda".
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

# max_length bounds the total sequence length (prompt tokens + generated tokens).
outputs = model.generate(**input_ids, max_length=40)
print(tokenizer.decode(outputs[0]))

In [None]:
# Inspect the tokenizer output: first the whole returned object, then only its
# 'input_ids' entry.
print(input_ids)
print(input_ids['input_ids'])