<a href="https://colab.research.google.com/github/nnilayy/MedGPT/blob/main/llama_8_1b_instruct.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## API KEYS

In [None]:
import os
import wandb
from huggingface_hub import HfApi
from kaggle_secrets import UserSecretsClient

# Read both credentials from the Kaggle secrets client before using either.
user_secrets = UserSecretsClient()
hugging_face_token = user_secrets.get_secret("HUGGING_FACE_API_KEY")
wandb_api_token = user_secrets.get_secret("WANDB_API_KEY")

# Hugging Face Hub client authenticated with the token read above.
api = HfApi(token=hugging_face_token)

# Weights & Biases login; kept as the final expression so the cell echoes its result.
wandb.login(key=wandb_api_token)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## LOADING MODEL AND TOKENIZERS

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# The Hub token is forwarded to both downloads.
# NOTE: the pad token is intentionally left unset in this cell
# (the `tokenizer.pad_token = tokenizer.eos_token` assignment is disabled here).
checkpoint = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, token=hugging_face_token)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    token=hugging_face_token,
    torch_dtype=torch.float16,  # half-precision weights
    device_map="auto",          # let the library place the weights on available devices
)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [None]:
from transformers import TextStreamer
# Streams decoded text as it is generated, without echoing the prompt or special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [None]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import DataLoader
from tqdm import tqdm

# Load the labelled PubMedQA configuration; its "train" split is used as the evaluation set.
dataset = load_dataset("pubmed_qa", "pqa_labeled")
test_set = dataset["train"]

# Load model and tokenizer
checkpoint = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, token=hugging_face_token)
model = AutoModelForCausalLM.from_pretrained(checkpoint,
                                             device_map="auto",
                                             torch_dtype=torch.float16,
                                             token=hugging_face_token)
# Reuse EOS as the padding token so prompts of different lengths can be batched.
tokenizer.pad_token = tokenizer.eos_token
# A decoder-only model continues from the last position of each row, so padded
# batches passed to generate() must be padded on the left, not the right.
tokenizer.padding_side = "left"
# Inputs are moved to the device that holds the model's first parameter.
device = next(model.parameters()).device

def collate_fn(batch):
    """Build tokenized prompts and gold labels for one DataLoader batch.

    Each row's ``question`` is wrapped in a "Question: ...\\nAnswer:" prompt and
    the prompts are tokenized together (padded, truncated, PyTorch tensors) with
    the module-level ``tokenizer``. The rows' ``final_decision`` values are
    returned alongside, in the same order.
    """
    prompts = []
    for row in batch:
        prompts.append(f"Question: {row['question']}\nAnswer:")
    encoded = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)

    labels = []
    for row in batch:
        labels.append(row['final_decision'])

    return {'inputs': encoded, 'true_answers': labels}

import re

batch_size = 8  # Adjust based on your GPU memory
dataloader = DataLoader(test_set, batch_size=batch_size, collate_fn=collate_fn)


def answer_matches(true_answer, completion):
    """Return True if *true_answer* occurs in *completion* as a whole word.

    The comparison is case-insensitive. Whole-word matching keeps a short label
    such as "no" from matching inside unrelated words like "know" or "not".
    """
    pattern = rf"(?<!\w){re.escape(true_answer)}(?!\w)"
    return re.search(pattern, completion, flags=re.IGNORECASE) is not None


# Evaluation loop
results = []
total_batches = len(dataloader)

model.eval()
for batch in tqdm(dataloader, total=total_batches, desc="Evaluating"):
    inputs = {k: v.to(device) for k, v in batch['inputs'].items()}
    prompt_length = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(**inputs,
                                 max_length=1024,
                                 pad_token_id=tokenizer.pad_token_id)

    # generate() returns prompt + continuation for every row. Score only the
    # continuation, otherwise words in the question itself count as the answer.
    generated_answers = tokenizer.batch_decode(outputs[:, prompt_length:],
                                               skip_special_tokens=True)

    for true_answer, generated_answer in zip(batch['true_answers'], generated_answers):
        results.append({
            "true_answer": true_answer,
            "generated_answer": generated_answer
        })

# Calculate metrics
correct = sum(1 for r in results if answer_matches(r["true_answer"], r["generated_answer"]))
# Guard against an empty evaluation set instead of dividing by zero.
accuracy = correct / len(results) if results else 0.0
print(f"Accuracy: {accuracy:.2f}")

## DATASET HANDLING

In [None]:
# from datasets import load_dataset
class DataHelper:
    """Helper for cleaning, chat-formatting and tokenizing a Hugging Face dataset.

    Args:
        tokenizer: Tokenizer used for chat templating and tokenization. Its
            ``pad_token`` is overwritten with its ``eos_token`` on construction.
        user_query_column: Name of the dataset column that holds the user query.
        columns_to_tokenize: Name of the column passed to the tokenizer in
            ``tokenization_function``.
    """

    def __init__(self, tokenizer, user_query_column, columns_to_tokenize):
        self.tokenizer = tokenizer
        # Reuse EOS for padding; note this mutates the tokenizer object passed in.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.system_instruction = "You are a Helpful AI Assistant."
        self.user_instruction = "Please answer the following Question: "
        self.user_query = None

        self.null_values_list = None

        self.user_query_column = user_query_column
        self.columns_to_tokenize = columns_to_tokenize

    # SYSTEM & USER PROMPT
    def set_system_instruction(self, system_instruction):
        """Store and return the system prompt used in the chat template."""
        self.system_instruction = system_instruction
        return self.system_instruction

    def set_user_instruction(self, user_instruction):
        """Store and return the text that is prepended to every user query."""
        self.user_instruction = user_instruction
        return self.user_instruction

    def set_user_query(self, user_query):
        """Store and return the user query used by the next template conversion."""
        self.user_query = user_query
        return self.user_query

    # HANDLING EMPTY ROWS
    def set_null_values_list(self, null_values_list):
        """Store and return the values that mark a cell as empty."""
        self.null_values_list = null_values_list
        return self.null_values_list

    def _empty_row_mask(self, pandas_df):
        """Return a boolean Series that is True for rows containing an empty cell.

        A cell is empty when its value is in ``null_values_list``. When no list
        has been configured, pandas' own missing-value check is used instead of
        failing inside ``isin``.
        """
        if self.null_values_list is None:
            return pandas_df.isna().any(axis=1)
        return pandas_df.apply(lambda col: col.isin(self.null_values_list)).any(axis=1)

    def find_empty_rows(self, hugging_face_dataset):
        """Return, as a pandas DataFrame, the rows that contain an empty cell."""
        pandas_df = hugging_face_dataset.to_pandas()
        return pandas_df[self._empty_row_mask(pandas_df)]

    def drop_empty_rows(self, hugging_face_dataset):
        """Return a new ``datasets.Dataset`` without the rows that contain an empty cell."""
        # Imported here because the surrounding file never imports Dataset itself.
        from datasets import Dataset

        pandas_df = hugging_face_dataset.to_pandas()
        kept_rows = pandas_df[~self._empty_row_mask(pandas_df)].reset_index(drop=True)
        return Dataset.from_pandas(kept_rows)

    # FORMATTING DATASET CODE
    def convert_input_to_chat_template(self):
        """Render the current system/user prompt through the tokenizer's chat template.

        ``user_query`` must have been set to a string beforehand, because it is
        concatenated onto ``user_instruction``.

        Returns:
            Whatever ``apply_chat_template`` returns for ``tokenize=False`` with
            the generation prompt appended.
        """
        message = [
            {"role": "system", "content": self.system_instruction},
            {"role": "user", "content": self.user_instruction + self.user_query}
        ]
        # No return_tensors here: with tokenize=False nothing is tokenized, so
        # there are no tensors to request.
        return self.tokenizer.apply_chat_template(message,
                                                  tokenize=False,
                                                  add_generation_prompt=True)

    def create_chat_template_dataset(self, example):
        """Add a ``training_input`` field holding the chat-formatted prompt for one row."""
        self.user_query = example[self.user_query_column]
        example['training_input'] = self.convert_input_to_chat_template()
        return example

    def format_dataset(self, dataset):
        """Map ``create_chat_template_dataset`` over *dataset*.

        Returns ``None`` when *dataset* is ``None``. The check is an identity
        test so an empty dataset is still mapped instead of being turned into
        ``None``.
        """
        if dataset is None:
            return None
        return dataset.map(self.create_chat_template_dataset)

    # TOKENIZATION CODE
    def tokenization_function(self, example):
        """Tokenize the configured column of *example* (padded, truncated to 1024 tokens)."""
        return self.tokenizer(example[self.columns_to_tokenize],
                              padding=True,
                              max_length=1024,
                              truncation=True,
                              return_tensors="pt"
                              )

    def tokenize_dataset(self, dataset, batch_size=128, num_proc=8):
        """Map ``tokenization_function`` over *dataset* in batches.

        Args:
            dataset: Dataset to tokenize, or ``None``.
            batch_size: Rows per ``map`` batch (default 128, the previous fixed value).
            num_proc: Worker processes used by ``map`` (default 8, the previous fixed value).

        Returns:
            The mapped dataset, or ``None`` when *dataset* is ``None``.
        """
        if dataset is None:
            return None
        return dataset.map(self.tokenization_function,
                           batched=True,
                           batch_size=batch_size,
                           num_proc=num_proc)

    def clean_up_dataset(self, base_dataset, tokenized_dataset):
        """Drop the base dataset's columns and expose the remaining ones as tensors.

        Columns present in *tokenized_dataset* but not in *base_dataset* are
        kept and set to PyTorch format. They are listed in their original order
        so the result is deterministic.
        """
        base_dataset_columns = list(base_dataset.features.keys())
        base_column_set = set(base_dataset_columns)
        final_columns = [column for column in tokenized_dataset.features.keys()
                         if column not in base_column_set]

        final_dataset = tokenized_dataset.remove_columns(base_dataset_columns)
        # set_format works in place, so the same object is returned afterwards.
        final_dataset.set_format(type='pt', columns=final_columns, output_all_columns=True)
        return final_dataset
#

In [None]:
import pandas as pd

# Dataset is not imported anywhere else in this file, so bring it in for this cell.
from datasets import Dataset

# Example DataFrame
# Values treated as "empty" in this demo.
null_value_list = [None, "", pd.NA, float('nan')]

data = {
    'A': ["", "2", None, "4", "", "1"],
    'B': ["3", None, "7", "8", pd.NA, "2"],
    'C': ["9", "10", "11", None, pd.NA, "3"]
}
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)
# Last expression of the cell so the notebook displays the dataset summary.
dataset

Dataset({
    features: ['A', 'B', 'C'],
    num_rows: 6
})

In [None]:
# Demo of the helper on the toy dataset: empty-row handling, then prompt templating.
datahelper = DataHelper(
    tokenizer=tokenizer,
    user_query_column="input",
    columns_to_tokenize="training_input",
)

# Empty-row handling; both results are discarded here.
datahelper.set_null_values_list([None, "", float('nan'), pd.NA])
datahelper.find_empty_rows(dataset)
datahelper.drop_empty_rows(dataset)

# Prompt pieces, then render them through the chat template (the cell's displayed value).
datahelper.set_system_instruction("You are a helpful AI Assistant")
datahelper.set_user_instruction("Please Answer the following Questions:\n")
datahelper.set_user_query("What is 2*2*x derivate")
datahelper.convert_input_to_chat_template()

Dataset({
    features: ['A', 'B', 'C'],
    num_rows: 1
})

In [None]:
from datasets import load_dataset
# Load the "medalpaca/medical_meadow_medqa" dataset by its Hub identifier.
dataset = load_dataset("medalpaca/medical_meadow_medqa")
# Last expression of the cell so the notebook displays the loaded object.
dataset

In [None]:
# EXTRACTING DATASET
train_dataset = dataset['train']
test_dataset = None
validation_dataset = None

datasets = [train_dataset, test_dataset, validation_dataset]
datasets

In [None]:
# INSTANTIATING DATAHELPER CLASS
datahelper = DataHelper(tokenizer=tokenizer,
                        user_query_column = "input",
                        columns_to_tokenize="training_input"
                       )

datahelper.set_system_instruction("You are a helpful AI Assistant")
datahelper.set_user_instruction("Please Answer the following Questions:\n")
datahelper.set_user_query("What is 2*2*x derivate")
datahelper.convert_input_to_chat_template()

In [None]:
# FORMATTING DATASET
formatted_datasets = [datahelper.format_dataset(dataset) for dataset in datasets]
tokenized_datasets = [datahelper.tokenize_dataset(dataset) for dataset in formatted_datasets]

train_dataset, test_dataset, validation_dataset = formatted_datasets

In [None]:
# tokenized_datasets
# datahelper.clean_up_dataset(base_dataset=dataset['train'], tokenized_dataset=tokenized_dataset)
# train_dataset = dataset['train'].map(preprocessing, batched=True, batch_size=32)
# train_dataset = train_dataset.remove_columns(['input', 'instruction', 'output', 'final_text'])
# train_dataset.set_format(type='pt', columns=['input_ids', 'attention_mask'], output_all_columns=True)

In [None]:
# Batch the dataset once; previously .batch() ran twice (once only to count batches).
generation_batches = train_dataset.batch(batch_size=8)
# Follow the model's own device rather than assuming "cuda:0".
generation_device = model.device

for batch in tqdm(generation_batches, total=len(generation_batches),
                  desc="Generating responses", unit="batch"):
    # Only tensor-valued columns can be passed to generate(); everything else is dropped.
    batch = {k: v.to(generation_device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
    if not batch:
        # Without this check generate() would be called with no prompt at all.
        raise ValueError(
            "Batch contains no tensors: tokenize the dataset and set its format to "
            "PyTorch tensors before running generation."
        )

    with torch.no_grad():
        output = model.generate(**batch, max_length=1024)
    # NOTE(review): `results` is the list created by the earlier evaluation cell,
    # so raw token tensors end up next to its dict entries — confirm this is intended.
    results.append(output)

## Trainer Code

In [None]:
import torch
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=28,
    save_total_limit=1,
    # No eval_dataset is passed to the Trainer below, so evaluation is disabled.
    # Switch back to "steps" once an eval_dataset is supplied.
    eval_strategy="no",
    # Checkpoints are written once per epoch; a save_steps value would be ignored
    # with this strategy, so it is not set.
    save_strategy="epoch",
    fp16=torch.cuda.is_available()  # Use mixed precision if GPUs support it
)

# Initialize Trainer
# NOTE(review): `data_collator` and `compute_metrics` are not defined in the visible
# part of this file — confirm they are created in an earlier cell.
# NOTE(review): `dataset['train']` is the raw split, not a tokenized one — verify
# that the collator handles this.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    # eval_dataset=dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

## Evaluate Code

In [None]:
from tqdm import tqdm
import torch
import logging

class Evaluate:
    """Generate sampled responses from a causal language model.

    Args:
        tokenizer: Tokenizer used to encode prompts and decode generations.
        model: Model exposing ``generate``, ``eval``, ``device`` and
            ``generation_config``.

    Constructing an instance has two side effects: the "transformers" logger is
    set to ERROR, and the model's generation config gets the tokenizer's pad and
    EOS token ids.
    """

    def __init__(self, tokenizer, model):

        logging.getLogger("transformers").setLevel(logging.ERROR)

        self.tokenizer = tokenizer
        self.model = model
        self.streamer = None
        self.model.generation_config.pad_token_id = self.tokenizer.pad_token_id
        self.model.generation_config.eos_token_id = self.tokenizer.eos_token_id

    def _generate_response(self, question, return_prompt, streamer=None):
        """Tokenize *question*, sample one continuation and decode it to text."""
        # Place the inputs on the model's own device instead of assuming "cuda".
        inputs = self.tokenizer(question, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=1000,
                return_dict_in_generate=True,
                temperature=0.5,
                do_sample=True,
                top_k=50,
                num_beams=1,
                early_stopping=False,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
                streamer=streamer,
            )

        sequence = output.sequences[0]
        if not return_prompt:
            # The generated sequence starts with the prompt tokens; cut them off.
            sequence = sequence[inputs.input_ids.shape[1]:]
        return self.tokenizer.decode(sequence, skip_special_tokens=True)

    def single_question_evaluate(self, question, return_prompt=False, stream_response=False):
        """Generate a response for a single prompt string.

        Args:
            question: Prompt text passed to the tokenizer.
            return_prompt: When True the decoded text includes the prompt.
            stream_response: When True the streamer registered through
                ``load_streamer`` (if any) is handed to ``generate``.

        Returns:
            The decoded response string.
        """
        self.model.eval()
        active_streamer = self.streamer if stream_response else None
        return self._generate_response(question, return_prompt, streamer=active_streamer)

    def load_streamer(self, streamer):
        """Store and return the streamer used when ``stream_response=True``."""
        self.streamer = streamer
        return self.streamer

    def batch_evaluate(self, dataset, batch_size, return_prompt=False):
        """Placeholder for batched evaluation; not implemented, returns None."""
        pass

    def qbq_evaluate(self, dataset, return_prompt=False):
        """Generate a response for every entry of ``dataset['training_input']``, one at a time.

        Args:
            dataset: Object whose ``'training_input'`` entry is a sequence of prompt strings.
            return_prompt: When True each decoded text includes its prompt.

        Returns:
            List of decoded responses in dataset order.
        """
        self.model.eval()
        # Read the column once; indexing it inside the loop re-fetched it every iteration.
        questions = dataset['training_input']
        return [
            self._generate_response(question, return_prompt)
            for question in tqdm(questions, desc="Generating responses", unit="question")
        ]
