In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, StoppingCriteria, StoppingCriteriaList, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from dotenv import load_dotenv
from datasets import load_dataset
import torch

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

quantization_config = BitsAndBytesConfig(
    load_in_8bit=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda:0",
    quantization_config=quantization_config
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.37s/it]


In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [4]:
llama_special_tokens = {
    "additional_special_tokens": ["<|begin_of_text|>", "<|start_header_id|>" , "<|end_header_id|>", "<|eot_id|>", ]
}

tokenizer.add_special_tokens(llama_special_tokens)
tokenizer.special_tokens_map

{'bos_token': '<|begin_of_text|>',
 'eos_token': '<|eot_id|>',
 'additional_special_tokens': ['<|begin_of_text|>',
  '<|start_header_id|>',
  '<|end_header_id|>',
  '<|eot_id|>']}

In [5]:
# class StopOnTokens(StoppingCriteria):
#     def __init__(self, stop_token_ids):
#         self.stop_token_ids = stop_token_ids
    
#     def __call__(self, input_ids, scores):
#         for stop_ids in self.stop_token_ids:
#             if torch.equal(input_ids[0, -len(stop_ids):], torch.tensor(stop_ids)):
#                 return True
#         return False

# stop_sequences = []
# stop_token_ids = [tokenizer(stop_sequence, add_special_tokens=False, return_tensors="pt")["input_ids"].to(device) for stop_sequence in stop_sequences ] # add_special_tokens to False so as not to have any such added and for accurate matching
# stopping_criteria = StoppingCriteriaList([StopOnTokens(stop_token_ids)])

In [6]:
model.eval()

prompt_template = lambda question: f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a medical expert specializing in Parkinson's disease. Please provide detailed information and answer questions solely related to Parkinson's disease. If a question pertains to a different topic, kindly respond with, 'I'm sorry, but I can only provide information about Parkinson's disease.Provide a brief and direct answer to the following question without restating it the question.<|eot_id|><|start_header_id|>user<|end_header_id|>{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

prompt = prompt_template("What is parkinson disease?")

inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(input_ids=inputs["input_ids"].to("cuda:0"), max_new_tokens=200)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [7]:
tokenizer.batch_decode(output)[0]

"<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a medical expert specializing in Parkinson's disease. Please provide detailed information and answer questions solely related to Parkinson's disease. If a question pertains to a different topic, kindly respond with, 'I'm sorry, but I can only provide information about Parkinson's disease.Provide a brief and direct answer to the following question without restating it the question.<|eot_id|><|start_header_id|>user<|end_header_id|>What is parkinson disease?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nParkinson's disease is a progressive neurological disorder characterized by the degeneration of dopamine-producing neurons in a specific part of the brain, leading to a range of motor symptoms, including tremors, rigidity, bradykinesia (slow movement), and postural instability. It is a chronic and incurable condition that affects approximately 1% of people over the age of 60 worldwide.<|eot_id|

In [8]:
dataset = load_dataset("json", data_files={"train": "data/parkinson_disease_qa_train_data.json", "test": "data/parkinson_disease_qa_test_data.json"})
dataset

DatasetDict({
    train: Dataset({
        features: ['example'],
        num_rows: 40
    })
    test: Dataset({
        features: ['example'],
        num_rows: 20
    })
})

In [9]:
dataset["train"]["example"][0]

"<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a medical expert specializing in Parkinson's disease. Please provide detailed information and answer questions solely related to Parkinson's disease. If a question pertains to a different topic, kindly respond with, 'I'm sorry, but I can only provide information about Parkinson's disease.Provide a brief and direct answer to the following question without restating it the question.<|eot_id|><|start_header_id|>user<|end_header_id|>What are the early signs of Parkinson's disease?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEarly signs of Parkinson's may include mild tremors, rigidity, and subtle changes in posture or gait. Specific indicators such as resting tremor, softer voice, or smaller handwriting are common."

In [10]:
model.train()

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [11]:
# LoRA Config
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 2,097,152 || all params: 8,032,358,400 || trainable%: 0.0261


In [12]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [13]:
def tokenize_function(single_example):

    text = single_example["example"]
    tokenizer.truncation_side = "left"
    output = tokenizer(
        text,
        return_tensors="np",
        max_length=512,
        truncation=True,
        padding=True
    )

    return output

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['example', 'input_ids', 'attention_mask'],
        num_rows: 40
    })
    test: Dataset({
        features: ['example', 'input_ids', 'attention_mask'],
        num_rows: 20
    })
})

In [14]:
tokenized_dataset["train"]["example"]

["<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a medical expert specializing in Parkinson's disease. Please provide detailed information and answer questions solely related to Parkinson's disease. If a question pertains to a different topic, kindly respond with, 'I'm sorry, but I can only provide information about Parkinson's disease.Provide a brief and direct answer to the following question without restating it the question.<|eot_id|><|start_header_id|>user<|end_header_id|>What are the early signs of Parkinson's disease?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEarly signs of Parkinson's may include mild tremors, rigidity, and subtle changes in posture or gait. Specific indicators such as resting tremor, softer voice, or smaller handwriting are common.",
 "<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a medical expert specializing in Parkinson's disease. Please provide detailed information and answer questions solely related

In [15]:
train_dataset = tokenized_dataset["train"].shuffle(seed=42)
test_dataset = tokenized_dataset["test"].shuffle(seed=42)

In [16]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['example', 'input_ids', 'attention_mask'],
        num_rows: 40
    })
    test: Dataset({
        features: ['example', 'input_ids', 'attention_mask'],
        num_rows: 20
    })
})

In [17]:
# hyperparameters

lr = 2e-4
num_epoch = 12
batch = 5

training_args = TrainingArguments(
    output_dir = "trainer",
    learning_rate = lr,
    per_device_train_batch_size = batch,
    per_device_eval_batch_size = batch,
    num_train_epochs = num_epoch,
    weight_decay = 0.01,
    logging_strategy = "epoch",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = "epoch",
    gradient_accumulation_steps = 4,
    warmup_steps = 2,
    fp16 = True,
    optim="paged_adamw_8bit",
)



In [18]:
for name, param in model.named_parameters():
    print(f"Parameter '{name}' is stored on: {param.device}")


Parameter 'base_model.model.model.embed_tokens.weight' is stored on: cuda:0
Parameter 'base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight' is stored on: cuda:0
Parameter 'base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight' is stored on: cuda:0
Parameter 'base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight' is stored on: cuda:0
Parameter 'base_model.model.model.layers.0.self_attn.k_proj.weight' is stored on: cuda:0
Parameter 'base_model.model.model.layers.0.self_attn.v_proj.weight' is stored on: cuda:0
Parameter 'base_model.model.model.layers.0.self_attn.o_proj.weight' is stored on: cuda:0
Parameter 'base_model.model.model.layers.0.mlp.gate_proj.weight' is stored on: cuda:0
Parameter 'base_model.model.model.layers.0.mlp.up_proj.weight' is stored on: cuda:0
Parameter 'base_model.model.model.layers.0.mlp.down_proj.weight' is stored on: cuda:0
Parameter 'base_model.model.model.layers.0.input_layernorm.weight' is stored on: cuda:0
Param

In [19]:
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=training_args,
    data_collator = data_collator
)

model.config.use_cache = False # silence the warning
trainer.train()
model.config.use_cache = True # renable warning

  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss
1,3.6968,3.289178
2,3.7054,3.232896
3,3.6326,3.154819
4,3.4962,3.012617
5,3.343,2.907411
6,3.1774,2.825839
7,3.0436,2.745687
8,2.9304,2.68336
9,2.8437,2.622124
10,2.7655,2.594224


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [20]:
model.push_to_hub("pd-detect-agent")

adapter_model.safetensors: 100%|██████████| 8.40M/8.40M [00:00<00:00, 11.4MB/s]


CommitInfo(commit_url='https://huggingface.co/manojpi/pd-detect-agent/commit/56bc37ebdd6c8c876709c02336edf0f54d7a5276', commit_message='Upload model', commit_description='', oid='56bc37ebdd6c8c876709c02336edf0f54d7a5276', pr_url=None, repo_url=RepoUrl('https://huggingface.co/manojpi/pd-detect-agent', endpoint='https://huggingface.co', repo_type='model', repo_id='manojpi/pd-detect-agent'), pr_revision=None, pr_num=None)

In [21]:
tokenizer.push_to_hub("pd-detect-agent")

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/manojpi/pd-detect-agent/commit/56bc37ebdd6c8c876709c02336edf0f54d7a5276', commit_message='Upload tokenizer', commit_description='', oid='56bc37ebdd6c8c876709c02336edf0f54d7a5276', pr_url=None, repo_url=RepoUrl('https://huggingface.co/manojpi/pd-detect-agent', endpoint='https://huggingface.co', repo_type='model', repo_id='manojpi/pd-detect-agent'), pr_revision=None, pr_num=None)

In [22]:
model.save_pretrained("model_state")
tokenizer.save_pretrained("model_state")

('model_state/tokenizer_config.json',
 'model_state/special_tokens_map.json',
 'model_state/tokenizer.json')

In [27]:
model.eval()

prompt_template = lambda question: f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a medical expert specializing in Parkinson's disease. Please provide detailed information and answer questions solely related to Parkinson's disease. If a question pertains to a different topic, kindly respond with, 'I'm sorry, but I can only provide information about Parkinson's disease.Provide a brief and direct answer to the following question without restating it the question.<|eot_id|><|start_header_id|>user<|end_header_id|>{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

prompt = prompt_template("What is Cancer?")

inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(input_ids=inputs["input_ids"].to("cuda:0"), max_new_tokens=200)
tokenizer.batch_decode(output)[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>You are a medical expert specializing in Parkinson's disease. Please provide detailed information and answer questions solely related to Parkinson's disease. If a question pertains to a different topic, kindly respond with, 'I'm sorry, but I can only provide information about Parkinson's disease.Provide a brief and direct answer to the following question without restating it the question.<|eot_id|><|start_header_id|>user<|end_header_id|>What is Cancer?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm sorry, but I can only provide information about Parkinson's disease.<|eot_id|>"