To fine-tune LLaMA, I followed this tutorial: https://www.mlexpert.io/machine-learning/tutorials/alpaca-fine-tuning

In [None]:
%pip install pandas
%pip install gensim
%pip install huggingface
%pip install sentencepiece
%pip install datasets
%pip install nltk
%pip install transformers
%pip isntall regex
%pip install torch
%pip install peft
%pip install accelerate
%pip install bitsandbytes


In [None]:

import pandas as pd
import  datasets
import regex as re
import nltk
from transformers import LlamaTokenizer, LlamaForCausalLM
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
)
import torch
import accelerate
import platform


Extract the train, test, and validation datasets

In [None]:
# Load the "tab_fact" dataset using its "tab_fact" configuration.
dataset = datasets.load_dataset(path="tab_fact", name="tab_fact")

In [None]:
# Convert the three splits to pandas DataFrames, in train / test / validation order.
train_df, test_df, validation_df = (
    dataset[split_name].to_pandas() for split_name in ("train", "test", "validation")
)

Downloading LLAMA-7B

In [None]:
# Identifier of the checkpoint shared by the model and its tokenizer.
BASE_MODEL = "baffo32/decapoda-research-llama-7B-hf"

# Tokenizer: pad on the left and use token id 0 for padding.
# NOTE(review): id 0 is presumably the tokenizer's <unk> token, chosen so that padding
# differs from EOS — confirm against the tokenizer's vocabulary.
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
tokenizer.padding_side = "left"
tokenizer.pad_token_id = 0

# Model: 8-bit weights, float16 dtype, with the root module ("") mapped to device 0.
model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map={"": 0},
    torch_dtype=torch.float16,
    load_in_8bit=True,
)

Create and tokenize the prompts used to train the model


In [None]:
def creating_prompt(statement, label, csv):
  """Build the instruction prompt for one table/statement pair.

  Args:
    statement: The claim to be checked against the table.
    label: Class id; 0 is rendered as "REFUTED", any other value as "ENTAILED".
    csv: Text of the table the statement refers to.

  Returns:
    A prompt string made of an instruction, an input section holding the table and
    the statement, and a response section holding the textual label.
  """
  label = "REFUTED" if label == 0 else "ENTAILED"

  # The instruction names the two answers with the same words used for the labels above,
  # and the input line ends right after the statement (no trailing quote/comma).
  return f"""###Instruction: Having a csv, respond whether the following statement is entailed or refuted
   ###Input:
   the csv is:{csv}, and the statement is {statement}
   ###Response:
   {label}"""


def tokenize(prompt, add_eos_token=True, max_length=255):
    """Tokenize a prompt into causal-LM training features.

    Uses the notebook-level ``tokenizer``. No padding is applied and plain Python
    lists are returned (``return_tensors=None``).

    Args:
        prompt: Text to tokenize.
        add_eos_token: When true, append the tokenizer's EOS id if the sequence does
            not already end with it and there is still room below ``max_length``.
        max_length: Maximum number of tokens kept, EOS included. Defaults to 255.

    Returns:
        The tokenizer output, holding "input_ids" and "attention_mask", plus a
        "labels" entry that is a copy of "input_ids".

    NOTE(review): prompts longer than ``max_length`` are truncated by the tokenizer;
    presumably the tail (which holds the response section) is what gets dropped —
    confirm that the table text leaves room for the answer.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None,
    )

    input_ids = result["input_ids"]
    # Guard the [-1] lookup so an empty tokenization does not raise IndexError.
    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
    if add_eos_token and not ends_with_eos and len(input_ids) < max_length:
        input_ids.append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    # Causal-LM objective: the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()

    return result

def tokenized_prompt(data):
  """Turn one dataset row into tokenized features.

  Reads the "statement", "label" and "table_text" entries of `data`, renders them
  with `creating_prompt` and returns what `tokenize` produces for that prompt.
  """
  statement, label, table = (data[key] for key in ("statement", "label", "table_text"))
  return tokenize(creating_prompt(statement, label, table))
# Apply `tokenized_prompt` to every row (axis=1) of the test frame, then of the train frame.
test_llm, train_llm = (
    frame.apply(tokenized_prompt, axis=1) for frame in (test_df, train_df)
)

Define training variables

In [None]:
# LoRA adapter hyper-parameters.
LORA_R, LORA_ALPHA, LORA_DROPOUT = 8, 16, 0.05
LORA_TARGET_MODULES = ["q_proj", "v_proj"]

# Batching: the number of accumulation steps is the batch size divided by the micro-batch size.
BATCH_SIZE, MICRO_BATCH_SIZE = 8, 4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE

# Optimisation schedule and output location.
LEARNING_RATE = 3e-4
TRAIN_STEPS = 50
OUTPUT_DIR = "models"

Configure LoRA

In [None]:
# LoRA settings taken from the LORA_* constants; no bias training, causal-LM task.
config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=LORA_TARGET_MODULES,
)

# Prepare the 8-bit model for training, wrap it with the LoRA adapters and report how many
# parameters remain trainable.
# NOTE(review): recent PEFT releases appear to expose this helper as
# `prepare_model_for_kbit_training` — confirm against the installed PEFT version.
model = get_peft_model(prepare_model_for_int8_training(model), config)
model.print_trainable_parameters()

Import training arguments from HF

In [None]:
import transformers

# Training configuration handed to the Trainer.
# Evaluation and checkpointing both happen every 10 steps, only one checkpoint is kept,
# and the best checkpoint is reloaded when training ends.
# NOTE(review): warmup_steps (100) is larger than max_steps (TRAIN_STEPS), so the
# warm-up would outlast the whole run — confirm this is intended.
training_arguments = transformers.TrainingArguments(
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=100,
    max_steps=TRAIN_STEPS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=10,
    optim="adamw_torch",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=10,
    save_steps=10,
    output_dir=OUTPUT_DIR,
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to="tensorboard",
)

Train the model

In [None]:
# Collator: pads every batch (to a multiple of 8) and returns PyTorch tensors.
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

# NOTE(review): evaluation (and therefore best-checkpoint selection) runs on the test
# split; `validation_df` is available and is probably the better choice — confirm.
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_llm,
    eval_dataset=test_llm,
    args=training_arguments,
    data_collator=data_collator,
)

# Disable the generation cache on the model config for the training run.
model.config.use_cache = False

# `model.state_dict` is deliberately left untouched: save_pretrained on a PEFT model
# already extracts the adapter weights, and feeding it a pre-filtered state dict makes it
# filter twice, which can leave an empty adapter file.

# torch.compile needs PyTorch 2.x and is skipped on Windows.
if hasattr(torch, "compile") and platform.system() != "Windows":
    model = torch.compile(model)

trainer.train()
model.save_pretrained(OUTPUT_DIR)

In [None]:
# Write the model to OUTPUT_DIR through its save_pretrained method.
model.save_pretrained(save_directory=OUTPUT_DIR)

Test the model

In [None]:
# Build the prompt for one example, then cut everything after the response marker so the
# gold label is not part of the text the model is asked to complete.
response_marker = "###Response:"
prompt = creating_prompt(train_df.loc[0,"statement"], train_df.loc[0,"label"], train_df.loc[0,"table_text"])
prompt = prompt[: prompt.rindex(response_marker) + len(response_marker)]

# The inputs must live on the same device as the model weights.
inputs = tokenizer(prompt, return_tensors="pt").to(next(model.parameters()).device)

# Switch off dropout and gradient tracking for inference, and bound the number of
# *generated* tokens: a cap on total length can be shorter than the prompt itself.
model.eval()
with torch.no_grad():
    generate_ids = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=10,
    )
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
