In [1]:
!pip install -U datasets
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForCausalLM,
    PreTrainedTokenizer,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
import torch
import os
import copy
from torch.optim import AdamW
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training

In [3]:
import pandas as pd
import datasets
from typing import Dict, Sequence
import tqdm

In [4]:
def fmt_prompt(prompt):
    """Wrap ``prompt`` in the instruction/response template used for translation.

    Args:
        prompt: The phrase or word to translate; it is formatted into the
            template exactly as an f-string placeholder would format it.

    Returns:
        The full prompt text, ending right where the model's answer begins.
    """
    instruction_part = "### Instructions:\n Can you please translate this phrase or word to french? \n "
    response_part = "\n\n### Response:\n Yes of course! Here is a french translation of that phrase: \n"
    return "".join((instruction_part, format(prompt), response_part))

In [5]:
def _tokenize(
        strings: Sequence[str],
        tokenizer: PreTrainedTokenizer
) -> Dict:
    """Tokenize each string on its own (no padding, truncated to the model max length).

    Args:
        strings: Texts to tokenize, one tokenizer call per text.
        tokenizer: Callable tokenizer exposing ``pad_token_id`` and
            ``model_max_length``; each call must return an object whose
            ``input_ids`` is a tensor with a leading batch dimension of 1.

    Returns:
        A dict with:
          * ``input_ids``: list of 1-D token-id tensors, one per input string.
          * ``labels``: independent copies of ``input_ids`` so that masking the
            labels in place can never modify the inputs.
          * ``input_ids_lens`` / ``labels_lens``: number of non-pad tokens per
            example (two separate lists holding the same values).
    """
    pad_token_id = tokenizer.pad_token_id

    input_ids = []
    input_ids_lens = []
    for example in strings:
        tokenized = tokenizer(
            example,
            return_tensors='pt',
            padding=False,
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        ids = tokenized.input_ids[0]
        input_ids.append(ids)
        if pad_token_id is None:
            # No pad token configured: nothing can be padding, so every token counts.
            input_ids_lens.append(ids.numel())
        else:
            input_ids_lens.append(int(ids.ne(pad_token_id).sum().item()))

    return dict(
        input_ids=input_ids,
        # clone() / list() break the aliasing between the paired entries.
        labels=[ids.clone() for ids in input_ids],
        input_ids_lens=input_ids_lens,
        labels_lens=list(input_ids_lens),
    )

In [6]:
def preprocess(
        samples: Sequence[str],
        tokenizer: PreTrainedTokenizer
    ) -> Dict:
    """Build tokenized training examples with the prompt part masked out of the loss.

    Args:
        samples: Batch that is indexed with the keys ``"input"`` (phrases to
            translate) and ``"output"`` (reference translations).
        tokenizer: Tokenizer used for both the prompt alone and prompt+target;
            its ``eos_token`` is appended to every target.

    Returns:
        A dict with ``input_ids`` (prompt + target token ids) and ``labels``
        (a deep copy of ``input_ids`` whose prompt positions are set to -100).
    """
    prompts = [fmt_prompt(phrase) for phrase in samples["input"]]
    completions = [f"{translation}{tokenizer.eos_token}" for translation in samples["output"]]
    # prompt + completion -> "<instructions> <phrase> <response header> <translation><eos>"
    full_texts = [prompt + completion for prompt, completion in zip(prompts, completions)]

    # Tokenize the full examples first, then the prompts alone; the prompt
    # lengths tell us how many leading label positions to mask.
    full_tokenized = _tokenize(full_texts, tokenizer)
    prompt_tokenized = _tokenize(prompts, tokenizer)

    input_ids = full_tokenized["input_ids"]
    labels = copy.deepcopy(input_ids)
    for label, prompt_length in zip(labels, prompt_tokenized["input_ids_lens"]):
        # -100 is ignored by PyTorch's cross-entropy loss.
        label[:prompt_length] = -100
    return dict(input_ids=input_ids, labels=labels)


In [7]:
class MyDataSet(Dataset):
    """Map-style dataset of tokenized prompt/translation pairs for fine-tuning."""

    def __init__(self, tokenizer: PreTrainedTokenizer, paths: str, limit: int):
        """Load JSON records from ``paths`` and tokenize them with ``preprocess``.

        Args:
            tokenizer: Tokenizer forwarded to ``preprocess``.
            paths: Passed as ``data_files`` to ``datasets.load_dataset("json", ...)``.
            limit: When truthy, only rows ``[0:limit]`` of the train split are
                loaded; otherwise the whole train split is used.
        """
        super().__init__()

        if limit:
            split_spec = f"train[0:{limit}]"
        else:
            split_spec = "train"
        raw_records = datasets.load_dataset("json", data_files=paths, split=split_spec)

        def _tokenize_batch(samples):
            return preprocess(samples, tokenizer)

        tokenized = raw_records.map(_tokenize_batch, batched=True, batch_size=300)

        self.tokenizer = tokenizer
        self.input_ids = tokenized["input_ids"]
        self.labels = tokenized["labels"]
        self.size = len(tokenized)

    def __len__(self) -> int:
        """Return the number of stored examples."""
        return len(self.input_ids)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Return example ``idx`` as tensors under ``input_ids`` and ``labels``."""
        example_ids = torch.tensor(self.input_ids[idx])
        example_labels = torch.tensor(self.labels[idx])
        return {"input_ids": example_ids, "labels": example_labels}


In [8]:
from dataclasses import dataclass

@dataclass
class MyDataCollator(object):
    """Pad a list of examples into one right-padded batch.

    ``input_ids`` are padded with the tokenizer's pad token id, ``labels`` with
    -100, and ``attention_mask`` is True wherever ``input_ids`` differs from the
    pad token id.
    """

    # Only ``pad_token_id`` is read from the tokenizer.
    tokenizer: PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Collate ``instances`` (dicts with ``input_ids`` and ``labels`` tensors)."""
        id_sequences = [instance["input_ids"] for instance in instances]
        label_sequences = [instance["labels"] for instance in instances]

        pad_sequence = torch.nn.utils.rnn.pad_sequence
        padded_ids = pad_sequence(
            id_sequences, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        padded_labels = pad_sequence(label_sequences, batch_first=True, padding_value=-100)

        return {
            "input_ids": padded_ids,
            "labels": padded_labels,
            "attention_mask": padded_ids.ne(self.tokenizer.pad_token_id),
        }

In [9]:
# Mount Google Drive, then point `path` at the translation data stored there.
from google.colab import drive
drive.mount('/content/drive')
path = "drive/MyDrive/phi-2/en-2-fr-translation.jsonl/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
#### put everything together ######
# Number of leading rows of the data file to load (passed as MyDataSet's `limit`).
MAX_SAMPLES = 3000

tokenizer = AutoTokenizer.from_pretrained(
        'microsoft/phi-2',
        model_max_length=2048,
        padding_side="right",
        use_fast=False,
        pad_token="<|pad|>",
        trust_remote_code=True,
    )

# Register any missing special tokens BEFORE building the dataset: MyDataSet
# tokenizes eagerly in its constructor, and preprocess() appends
# tokenizer.eos_token to every target, so the tokens must already be defined.
special_tokens_dict = dict()
if tokenizer.pad_token is None:
    special_tokens_dict["pad_token"] = "<|pad|>"
if tokenizer.eos_token is None:
    special_tokens_dict["eos_token"] = "<|eos|>"
if tokenizer.unk_token is None:
    special_tokens_dict["unk_token"] = "<|unk|>"

if special_tokens_dict:
    tokenizer.add_special_tokens(special_tokens_dict)

dataset = MyDataSet(tokenizer, [path], MAX_SAMPLES)
collator = MyDataCollator(tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Number of examples loaded, as recorded by MyDataSet.__init__ in `self.size`.
dataset.size

3000

In [12]:
# 80/20 train/validation split; the seeded generator makes the split repeatable.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
generator = torch.Generator()
generator.manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, lengths=[train_size, val_size], generator=generator
)
print(f'train and validation dataset sizes: {len(train_dataset), len(val_dataset)}\n')

train and validation dataset sizes: (2400, 600)



In [13]:
# Batch the two splits with the padding collator; only the training loader shuffles.
batch_size = 4
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collator,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    collate_fn=collator,
)

In [14]:
# Quantization settings: weights stored in 4-bit NF4, computation done in float16,
# double quantization disabled.
# NOTE(review): in the cells visible here `bnb_config` is never passed to
# from_pretrained, so the model is loaded unquantized - confirm whether that is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

PackageNotFoundError: No package metadata was found for bitsandbytes

In [26]:
# init model
# The notebook never runs `import transformers`, so the original
# `transformers.AutoConfig` / `transformers.AutoModelForCausalLM` references fail
# with NameError on a fresh kernel. Use the already-imported
# AutoModelForCausalLM and import AutoConfig here.
from transformers import AutoConfig

model_name = "microsoft/phi-2"
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
# Resize the model's token embeddings to the tokenizer's current vocabulary size
# (the tokenizer was created with an extra "<|pad|>" token).
model.resize_token_embeddings(len(tokenizer))

Embedding(50296, 2560)

In [16]:
# Optimize every model parameter with AdamW and report the total parameter count.
optimizer = AdamW(params=model.parameters(), lr=5e-5)
num_params = sum(p.numel() for p in model.parameters())
print(f"model params: {num_params}\n")

model params: 2775054456



In [17]:
from transformers import get_scheduler
import uuid

# One optimizer step per training batch, so total steps = batches per epoch * epochs.
epochs = 1
train_steps = epochs * len(train_dataloader)
# Cosine learning-rate schedule over the whole run, without warmup.
lr_scheduler = get_scheduler(
    name='cosine',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=train_steps,
)

In [18]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(50296, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2560,),

In [19]:
def should_run_eval(total_steps, freq, current_step):
    """Return True when ``current_step`` lands on an evaluation boundary.

    Evaluation is spread over ``total_steps`` so that it runs about ``freq``
    times: every ``total_steps // freq`` steps.

    Args:
        total_steps: Number of steps the evaluations are spread across.
        freq: Desired number of evaluations within ``total_steps``.
        current_step: 1-based index of the step that just finished.

    Returns:
        ``True`` if evaluation should run after ``current_step``. Returns
        ``False`` when ``freq`` or ``total_steps`` is not positive.
    """
    if freq <= 0 or total_steps <= 0:
        return False
    # total_steps // freq is 0 when there are fewer steps than requested
    # evaluations; clamp to 1 so the modulo below cannot divide by zero.
    interval = max(1, total_steps // freq)
    return current_step % interval == 0

def eval(model, val_data, wandb):
    """Compute the mean loss of ``model`` over ``val_data`` and log it.

    The model is switched to eval mode and left there; callers that continue
    training must call ``model.train()`` afterwards.

    Args:
        model: Model called with ``input_ids``, ``labels`` and
            ``attention_mask``; its output must expose ``.loss``. Batches are
            moved to ``model.device``.
        val_data: Iterable of batches (dicts holding those three tensors).
        wandb: Object with a ``log(dict)`` method; receives ``{"val_loss": float}``.

    Returns:
        The mean validation loss as a 0-dim float tensor.

    Raises:
        ValueError: If ``val_data`` yields no batches (the mean is undefined).
    """
    print("evaluating model...\n")
    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for batch in val_data:
            batch = {
                key: batch[key].to(model.device)
                for key in ("input_ids", "labels", "attention_mask")
            }
            outputs = model(**batch)

            # record loss
            total_loss += outputs.loss.float()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("val_data yielded no batches; cannot compute a validation loss")

    val_loss = total_loss / num_batches
    # Log a plain Python float rather than a (possibly on-GPU) tensor.
    wandb.log(
        {
            "val_loss": val_loss.item()
        }
    )

    return val_loss

def save_model(model, outpath: str, current_epoch: int, current_step: int):
    """Save ``model`` under ``<outpath>/model`` via ``model.save_pretrained``.

    ``current_epoch`` and ``current_step`` are only used in the progress
    message; the target directory is the same on every call.
    """
    print(f"saving model at epoch: {current_epoch}, step: {current_step}")
    model_dir = outpath + "/model"
    model.save_pretrained(model_dir)

def run_stats(pbar, wandb, epoch, step, loss):
    """Log the current loss and epoch to ``wandb`` and refresh the progress-bar text.

    Args:
        pbar: Progress bar exposing ``set_description(str)``.
        wandb: Object exposing ``log(dict)``.
        epoch: Epoch value to log and display.
        step: Step value to display.
        loss: Loss as a number; shown with four decimal places.
    """
    metrics = {"current_loss": loss, "current_epoch": epoch}
    wandb.log(metrics)
    pbar.set_description(f"Epoch {epoch} :: Step {step} :: Loss {loss:.4f}")

In [20]:
# from model import train_model
from tqdm.auto import tqdm

def train_model(model, epochs, train_dataloader, val_dataloader, train_steps, optimizer, lr_scheduler, wandb, evals_per_epoch=5):
    """Run the fine-tuning loop with periodic validation and checkpointing.

    For every batch: forward pass, backward pass, stats logging, optimizer and
    scheduler step, gradient reset. About ``evals_per_epoch`` times per epoch
    the model is evaluated on ``val_dataloader`` and saved under
    ``drive/MyDrive/phi-2/outputs/<run_id>/model``.

    Args:
        model: Model to train; batches are moved to ``model.device``.
        epochs: Number of passes over ``train_dataloader``.
        train_dataloader: Yields dicts with ``input_ids``, ``labels`` and
            ``attention_mask`` tensors.
        val_dataloader: Validation batches in the same format.
        train_steps: Total number of steps, used as the progress-bar length.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Scheduler stepped once per batch.
        wandb: Logger object with a ``log(dict)`` method.
        evals_per_epoch: How many evaluations/checkpoints to run per epoch
            (default 5, the value that was previously hard-coded).
    """
    # Local import so this function does not rely on another cell importing uuid.
    import uuid

    run_id = str(uuid.uuid4())
    print(f"model id :: {run_id}")
    output_dir = f"drive/MyDrive/phi-2/outputs/{run_id}"

    model.train()
    # The context manager closes the progress bar even if training raises.
    with tqdm(range(train_steps)) as pbar:
        for epoch in range(epochs):
            current_epoch = epoch + 1
            for step, batch in enumerate(train_dataloader):
                current_step = step + 1

                batch = {
                    key: batch[key].to(model.device)
                    for key in ("input_ids", "labels", "attention_mask")
                }

                # forward
                outputs = model(**batch)
                loss = outputs.loss

                # backward
                loss.backward()

                # log results with the same 1-based counters used for eval/checkpoint messages
                run_stats(pbar, wandb, current_epoch, current_step, loss.detach().item())

                # update weights
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

                # evaluate and save model
                if should_run_eval(len(train_dataloader), evals_per_epoch, current_step):
                    eval(model, val_dataloader, wandb)

                    save_model(model, output_dir, current_epoch, current_step)
                    # eval() leaves the model in eval mode; switch back before the next batch.
                    model.train()
                pbar.update(1)

In [21]:
# Install/upgrade the Weights & Biases client and import it; the `wandb` module
# is later passed to train_model as its logger.
!pip install -U wandb
import wandb



In [22]:
wandb.init(
    # set the wandb project where this run will be logged
    project="phi-2-fine-tune",

    # track hyperparameters and run metadata
    config={
        "model_name": model_name,
        "run_id": "test_run",
        # use the actual training setting rather than a duplicated literal,
        # so the logged value cannot drift from what train_model receives
        "epochs": epochs,
        "batch_size": batch_size
    }
)

[34m[1mwandb[0m: Currently logged in as: [33mlrav35[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [23]:
# Launch fine-tuning with the model, data loaders, optimizer and scheduler
# built in the cells above; `wandb` is used as the metrics logger.
print("beginning model training...\n")
train_model(model, epochs, train_dataloader, val_dataloader, train_steps, optimizer, lr_scheduler, wandb)

print("complete")

beginning model training...



  0%|          | 0/600 [00:00<?, ?it/s]

model id :: 98186e5b-0eae-4d1c-a863-98ef9d29a763


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacty of 15.77 GiB of which 2.38 MiB is free. Process 55737 has 15.77 GiB memory in use. Of the allocated memory 14.55 GiB is allocated by PyTorch, and 866.39 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [27]:
# Empty VRAM
import gc

del model
# Collect unreachable Python objects first so their tensors are released ...
unreachable_count = gc.collect()
# ... then hand PyTorch's cached-but-unused CUDA blocks back to the driver;
# gc.collect() alone leaves them reserved by the caching allocator.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
# NOTE(review): `optimizer` was built from model.parameters() and still holds
# references to the weights; delete or recreate it as well if the memory must
# actually be freed - confirm against the cells that follow.
unreachable_count

90