In [1]:
! pip install numpy
! pip install pandas
! pip install transformers
! pip install datasets
! pip install torch
! pip install tqdm
! pip install peft
! pip install bitsandbytes
! pip install einops



In [2]:
from peft import peft_model, LoraConfig, get_peft_model

import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
import torch
import matplotlib.pyplot as plt
import re
import time

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig

device=(torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'))

In [3]:
pip install -U transformers



In [4]:
tokenizer=AutoTokenizer.from_pretrained('microsoft/phi-1_5', trust_remote_code=True)
tokenizer.pad_token=tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
bnb=BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16
)

In [6]:

model=AutoModelForCausalLM.from_pretrained(
    'microsoft/phi-1_5',
    trust_remote_code=True,
    device_map='auto',
    quantization_config=bnb
)

In [7]:
lora_config=LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=['q_proj','k_proj','v_proj','dense', 'fc1','fc2'],
    lora_dropout=0.05,
    bias="none",
    task_type='CAUSAL_LM'
)

model=get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 28,311,552 || all params: 1,446,582,272 || trainable%: 1.9571


In [8]:
data=load_dataset('BI55/MedText', split='train')

df=data.to_pandas()
df

Unnamed: 0,Prompt,Completion
0,A 50-year-old male presents with a history of ...,This patient's history of recurrent kidney sto...
1,"A 7-year-old boy presents with a fever, headac...","This child's symptoms of a red, bulging tympan..."
2,A 35-year-old woman presents with a persistent...,While the symptoms might initially suggest ast...
3,A 50-year-old male presents with severe abdomi...,The patient's symptoms suggest an incarcerated...
4,A newborn baby presents with eye redness and a...,The infant's symptoms suggest neonatal conjunc...
...,...,...
1407,A 55-year-old male with a history of chronic o...,While this patient's symptoms could be due to ...
1408,Can diet and lifestyle changes help manage vit...,While there is no specific diet or lifestyle m...
1409,A 50-year-old female presents with right shoul...,This patient's shoulder and arm pain following...
1410,A 60-year-old female with high cholesterol lev...,In addition to a diet low in saturated fats an...


In [9]:
df['text'] = df.apply(lambda x: f"prompt: {x['Prompt']} completition: {x['Completion']}", axis=1)

In [10]:
def tok(x):
  tokens=tokenizer(x['text'], padding=True, truncation=True, max_length=512)
  return tokens

In [11]:
data=Dataset.from_pandas(df)

tok_data=data.map(tok, batched=True, remove_columns=data.column_names)

Map:   0%|          | 0/1412 [00:00<?, ? examples/s]

In [12]:
training_args=TrainingArguments(
    output_dir='phi_1_5-finemeds',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    lr_scheduler_type='cosine',
    save_strategy='epoch',
    logging_steps=100,
    max_steps=10,
    num_train_epochs=10
)


In [13]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2048)
        (layers): ModuleList(
          (0-23): 24 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4b

In [14]:

trainer = Trainer(
    model=model,
    train_dataset=tok_data,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [15]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss


TrainOutput(global_step=10, training_loss=2.1033609390258787, metrics={'train_runtime': 26.5473, 'train_samples_per_second': 1.507, 'train_steps_per_second': 0.377, 'total_flos': 75351257579520.0, 'train_loss': 2.1033609390258787, 'epoch': 0.028328611898016998})

In [17]:
model.save_pretrained('megumind/phi-1_5-meds-qlora')