# Clinical Health Prediction

## Install Required Libraries

In [1]:
!pip install transformers huggingface_hub sentencepiece protobuf==3.20.3

Defaulting to user installation because normal site-packages is not writeable


## Import Required Libraries

In [1]:
import datetime
import os
import random
import time

import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

In [2]:
# Release any cached GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

# The notebook moves the model and inputs to CUDA, so stop here if no GPU is visible.
assert torch.cuda.is_available()

# Device handle used by later cells, plus a short report of what was found.
device = torch.device("cuda")
n_gpu = torch.cuda.device_count()
device_name = torch.cuda.get_device_name()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

Found device: NVIDIA A100-SXM4-80GB MIG 3g.40gb, n_gpu: 1


In [None]:
# Read variables from a local .env file into the process environment,
# then authenticate with the Hugging Face Hub using the HF_TOKEN variable.
load_dotenv()
login(token=os.environ.get("HF_TOKEN"))

## Define Constants

In [4]:
# Hub identifier; the tokenizer is loaded from this id.
model_id = "meta-llama/Llama-2-7b-chat-hf"
# Alternative model: 'BioMistral/BioMistral-7B'
# Local directory the model weights are loaded from.
model_path = "models/Llama-2-7b-chat-hf"

In [5]:
# Run configuration: task code, CSV column names, input file, and result-file naming parts.
code = "diabetes"
text_column = "Text"
label_column = "Text_label"
prompt = "prompt2"
data_path = "data/test_diabetes_filtered.csv"
output = "MIMIC_inference_small_"
chkpt = "/checkpoint-2000"

# chkpt[1:] drops the leading "/" of the checkpoint suffix.
# `output` already ends with "_", so the resulting name contains a double underscore.
save_name = f"{output}_{code}_{prompt}_yes_no_{chkpt[1:]}"

In [6]:
# Human-readable condition name for each supported `code`; inserted into the prompt text.
CODE_TEXT_BY_CODE = {
    "oud": "Opioid Use Disorder",
    "sud": "Substance Use Disorder",
    "diabetes": "Diabetes",
}

if code not in CODE_TEXT_BY_CODE:
    # Fail here instead of printing and leaving `code_text` undefined,
    # which would only surface later as a NameError while building prompts.
    raise ValueError(
        f"Error in the code: unsupported code {code!r}; expected one of {sorted(CODE_TEXT_BY_CODE)}"
    )

code_text = CODE_TEXT_BY_CODE[code]

## Define Utility Functions

In [7]:
def create_test_prompt(examples):
    """Build the inference prompt for a single patient record.

    The prompt consists of an instruction section, an optional input section
    holding the patient's text, and a response section that ends with the
    opening ``<Diagnosis>`` tag. The ground-truth label is not part of the
    returned prompt; it is only sanity-checked.

    Reads the module-level names ``code_text``, ``text_column`` and
    ``label_column``.

    Args:
        examples: Mapping-like record (e.g. a DataFrame row) indexable by
            ``text_column`` and ``label_column``.

    Returns:
        str: The prompt sections joined by blank lines.
    """
    # Static strings for the prompt template
    INTRO_BLURB = "Given a patient's past medical history, predict whether the patient will have a future diagnosis of " + code_text + ". Return 'Yes' or 'No' after the XML tag <Diagnosis>."
    INSTRUCTION_KEY = "### Instruction:"
    INPUT_KEY = "### Input:"
    RESPONSE_KEY = "### Response:"

    # The label is not used in a test prompt; just report unexpected values.
    if examples[label_column] not in ("High", "Low"):
        print("There is some error with the label")

    instruction = f"{INSTRUCTION_KEY}\n{INTRO_BLURB}"

    # A missing CSV cell is read as float NaN, which is truthy; without this check
    # the literal text "nan" would be embedded as the patient history.
    text = examples[text_column]
    text_is_missing = text is None or (isinstance(text, float) and text != text)
    input_context = None if (text_is_missing or not text) else f"{INPUT_KEY}\n{text}"

    response_ground_truth = f"{RESPONSE_KEY}\n<Diagnosis>"

    # Keep only the sections that are present and join them with blank lines
    parts_ground_truth = [part for part in (instruction, input_context, response_ground_truth) if part]
    return "\n\n".join(parts_ground_truth)

In [8]:
def seed_everything(seed=42):
    """Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random generators.

    Also disables cuDNN benchmark mode and enables cuDNN deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


seed_everything()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/apathak2/.cache/huggingface/token
Login successful


In [None]:
# Load the causal-LM class: the inference loop reads `.logits` from the model output,
# which the bare `AutoModel` backbone does not return.
# Weights come from the local `model_path`; the tokenizer is fetched by `model_id`.
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [32]:
# Move the model to the CUDA device selected earlier; as the last expression of the cell,
# the returned module is also displayed.
model.to(device)

In [26]:
# Load the evaluation set from `data_path`; later cells read the `PatientId` column
# and the columns named by `text_column` / `label_column`.
data = pd.read_csv(data_path)

In [27]:
# Preview the first rows to check the columns.
# NOTE(review): this output shows patient-level records — clear it before sharing if the data is restricted.
data.head()

Unnamed: 0,PatientId,Text,Label,Text_label
0,10000117,A patient had 2 total visits to the hospital. ...,0.0,Low
1,10001217,A patient had 2 total visits to the hospital. ...,0.0,Low
2,10002428,A patient had 7 total visits to the hospital. ...,0.0,Low
3,10002769,A patient had 2 total visits to the hospital. ...,0.0,Low
4,10003299,A patient had 4 total visits to the hospital. ...,1.0,High


In [28]:
# One prompt string per row of `data`, in row order.
examples_list = [create_test_prompt(record) for _, record in data.iterrows()]

In [None]:
# Upper bound on prompt length in tokens; passed as `max_length` to the tokenizer in the inference loop.
# NOTE(review): confirm 8192 does not exceed the context window of the model loaded from `model_path`.
max_length = 8192
# Number of prompts sent through the model per forward pass.
batch_size = 1  # Adjust based on your GPU memory

In [None]:
# Set the padding token (EOS is reused here; alternatively use
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Decoded token strings that count towards each class when they appear among the top-k next tokens.
low_tokens = {"No", "N", "no", "NO"}
high_tokens = {"Yes", "yes", "YES", "Y"}
top_k = 20

# Start the timer
start_time = time.time()

# Ceiling division so a final partial batch is counted in the progress message.
n_batches = (len(examples_list) + batch_size - 1) // batch_size

# One score per prompt: softmax weight of the summed "Yes"-like probability mass
# against the summed "No"-like mass.
pred = []
for i in range(0, len(examples_list), batch_size):
    print(f"Batch {i//batch_size+1}/{n_batches}")
    batch_examples = examples_list[i:i+batch_size]
    with torch.no_grad():
        # padding=True lets prompts of different lengths share one tensor;
        # truncation=True cuts prompts longer than max_length
        # ("max_length" is a padding strategy, not a valid truncation strategy).
        model_input = tokenizer(
            batch_examples,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(device)

        output_ = model(**model_input)

        # 1. step: pick, for every prompt, the logits at its last real (non-padding) token.
        # With right padding that position is (number of attended tokens - 1), not -1.
        last_token_idx = model_input["attention_mask"].sum(dim=1) - 1
        row_idx = torch.arange(last_token_idx.shape[0], device=last_token_idx.device)
        next_token_logits = output_.logits[row_idx, last_token_idx, :]

        # 2. step to convert the logits to probabilities
        next_token_probs = torch.softmax(next_token_logits, -1)

        # 3. step to get the top-k candidates per prompt (shape: batch x top_k)
        topk_next_tokens = torch.topk(next_token_probs, top_k)

        for j in range(len(batch_examples)):
            low_sum = 0
            high_sum = 0

            for idx, prob in zip(topk_next_tokens.indices[j], topk_next_tokens.values[j]):
                token_text = tokenizer.decode(idx)
                if token_text in low_tokens:
                    low_sum += prob.item()
                elif token_text in high_tokens:
                    high_sum += prob.item()

            arr = [high_sum, low_sum]
            # TODO (from the original author): instead of softmax, normalize it
            low_high_probs = np.exp(arr) / np.sum(np.exp(arr), axis=0)
            pred.append(low_high_probs[0])

        # Clear cache and free memory
        del model_input, output_, next_token_logits, next_token_probs, topk_next_tokens
        torch.cuda.empty_cache()

# Calculate and print the elapsed time
elapsed_time = time.time() - start_time
print(f"Elapsed time: {datetime.timedelta(seconds=elapsed_time)}")

In [None]:
# Ground-truth labels for the metrics, taken from the numeric `Label` column of the loaded data.
labels = data["Label"].values

# Every row must have received exactly one prediction before the columns are aligned.
assert len(pred) == len(data), f"expected {len(data)} predictions, got {len(pred)}"

# Store the results into a new df
result_df = pd.DataFrame(
    {
        "PatientID": data["PatientId"].values,
        "Predictions": pred,
        "Label": labels,
    }
)

# Show results
print(result_df)

In [None]:
# Write the predictions table to a CSV named after the run configuration (row index omitted).
result_df.to_csv(f"Predictions_{save_name}.csv", index=False)

In [None]:
# Calculate and show metrics
# NOTE(review): `roc_auc_score` and `average_precision_score` are not imported in the import cell
# (they look like sklearn.metrics functions — confirm and add the import at the top).
# `labels` must hold the ground-truth labels in the same order as `pred`.
roc = roc_auc_score(labels, pred)
pr_auc = average_precision_score(labels, pred)

print("ROC Area under curve: ", roc)
print("Avg Precision Score: ", pr_auc)

In [None]:
# Get the uncertainty estimates
## Change the output to a probability score instead of yes/no
## Change the output to a probability score instead of high/low or something similar
## Run the same model multiple times, note down the results, and compare them

# Research what has been done and what can be done further based on the uncertainty