# AKI Seminar2 Demo
## Finetuning of LLaMA

by Syon Kadkade


Table of contents

> [Install packages](#install)   
> [Import libraries](#imports)   
> [Lorem Ipsum]()


--------------
<a id="install"></a>
### Install packages[Emoji]

**Description**:   
lorem ipsum

In [28]:
!pip install accelerate --quiet
!pip install bitsandbytes --quiet
!pip install datasets --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
!pip install sentencepiece



---------------
<a id="imports"></a>
### Import libraries [Emoji]
**Description**:   
Load all necessary libraries.

In [26]:
import numpy as np
import pandas as pd
import torch
import transformers
from accelerate import Accelerator
from transformers import DataCollatorWithPadding, LlamaForCausalLM, LlamaTokenizer

In [15]:
import warnings

# Install a blanket "ignore" filter so that no warnings are shown by later cells.
warnings.filterwarnings(action="ignore")

-----------------
### Load LLaMA-7B-Model[Emoji]

**Description**:  
lorem ipsum dolor sit amet

**Resources**:
- [Tutorial](https://www.youtube.com/watch?v=t68IV5t5UOA)
- [Hugging Face: Transformer Tutorial](https://huggingface.co/learn/nlp-course/chapter2/2?fw=pt)
- [Hugging Face: LLaMA-7B-Model](https://huggingface.co/docs/transformers/main/model_doc/llama)
- [Hugging Face: LLaMA weights](https://huggingface.co/luodian/llama-7b-hf)
- [Hugging Face: 7B Weights](https://huggingface.co/huggyllama/llama-7b)

**Note**: I use a model repository that already contains the weights and load them into the actual LLaMA model. Normally you have to request the weights from Meta AI by filling out a form. I have filled it out several times but have not received a response.

In [12]:
# Checkpoint identifier shared by the tokenizer and model loading cells.
# Alternative checkpoint that can be swapped in: 'huggyllama/llama-7b'
MODEL_NAME = "Enoch/llama-7b-hf"

In [13]:
# Build the tokenizer from the same checkpoint that the model is loaded from.
tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME)

# Padding configuration: pad on the left, using token id 0 as the pad token.
tokenizer.padding_side = "left"
tokenizer.pad_token_id = 0

tokenizer_config.json:   0%|          | 0.00/218 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

In [None]:
# Instantiate the causal language model with the pretrained weights of MODEL_NAME.
# NOTE(review): no dtype/device options are passed, so library defaults apply —
# confirm the runtime has enough memory for the downloaded shards.
model = LlamaForCausalLM.from_pretrained(
    MODEL_NAME,
)

config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

--------------
### Example Usage

In [22]:
# Two sample sentences of different lengths.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Tokenize without padding: padding every sentence up front is inefficient,
# so it is deferred until the examples are grouped into batches.
inputs = tokenizer(
    raw_inputs,
    truncation=True,
)
print(inputs)

{'input_ids': [[1, 306, 29915, 345, 1063, 10534, 363, 263, 379, 688, 3460, 23360, 3236, 590, 3353, 2834, 29889], [1, 306, 26277, 445, 577, 1568, 29991]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}


In [19]:
# Show only the token ids produced for each of the raw sentences.
inputs["input_ids"]

[[1,
  306,
  29915,
  345,
  1063,
  10534,
  363,
  263,
  379,
  688,
  3460,
  23360,
  3236,
  590,
  3353,
  2834,
  29889],
 [1, 306, 26277, 445, 577, 1568, 29991]]

In [20]:
# Map the token ids of the first sentence back to text to check the round trip.
first_sentence_ids = inputs["input_ids"][0]
decoded_string = tokenizer.decode(first_sentence_ids)
print(decoded_string)

<s>I've been waiting for a HuggingFace course my whole life.


--------------
### Create own Dataset

In [27]:
from json import dumps, loads

# Small 2x2 placeholder frame: each dict key is a row label, each list holds
# that row's values for "col 1" and "col 2".
df = pd.DataFrame.from_dict(
    {
        "row 1": ["a", "b"],
        "row 2": ["c", "d"],
    },
    orient="index",
    columns=["col 1", "col 2"],
)

In [None]:
from datasets import load_dataset

# Train/test splits are read from two local JSON files; field="data" selects
# the "data" entry of each file as the records to load.
data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
# A bare expression in the middle of a cell is discarded by the notebook;
# display() is needed to actually show this object.
display(squad_it_dataset)

# A single JSON Lines file loaded directly as the "train" split.
issues_dataset = load_dataset("json", data_files="datasets-issues.jsonl", split="train")
issues_dataset

-------------
### Data Preprocessing

In [None]:
def tokenize_function(data):
  """
  Tokenize sentence pairs with the notebook-level ``tokenizer``.

  Args:
    data: Mapping that provides the first sentences under "sentence1" and
      the second sentences under "sentence2".

  Returns:
    The result of calling ``tokenizer`` on the two entries with
    ``padding='max_length'`` and ``truncation=True``.
  """
  # NOTE(review): padding to max_length here happens before batching; the
  # notebook also builds a DataCollatorWithPadding for dynamic padding —
  # confirm which of the two padding strategies is intended.
  return tokenizer(
    data["sentence1"],
    data["sentence2"],
    padding='max_length',
    truncation=True,
  )

In [None]:
# Use the Dataset.map() function to tokenize the entire dataset with tokenize_function.
# NOTE(review): `raw_datasets` is not defined in any visible cell, so this raises
# NameError on a fresh kernel. tokenize_function reads "sentence1"/"sentence2" and
# compute_metrics loads the ("glue", "mrpc") metric, so presumably a GLUE MRPC
# dataset was intended — TODO confirm and add the loading cell.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

In [23]:


# Dynamic padding: the Hugging Face collator pads the examples when they are
# assembled into a batch, instead of padding the whole dataset up front.
data_collator = DataCollatorWithPadding(tokenizer)

----------------
### Prepare Trainer for finetuning
Do it via the Trainer API or with your own training pipeline.

In [None]:
from transformers import TrainingArguments

# Only the output directory is set; every other training option keeps its default.
training_args = TrainingArguments(output_dir="test-trainer")

In [None]:
def compute_metrics(eval_preds):
    """Score evaluation predictions with the ("glue", "mrpc") metric.

    Args:
        eval_preds: A pair ``(logits, labels)``.

    Returns:
        The value returned by the metric's ``compute`` for the argmax
        predictions against ``labels``.
    """
    # NOTE(review): `evaluate` is not imported in any visible cell — confirm
    # it is installed and imported before this function is called.
    scorer = evaluate.load("glue", "mrpc")
    raw_scores, gold = eval_preds
    # The predicted class is the index of the largest score on the last axis.
    return scorer.compute(
        predictions=np.argmax(raw_scores, axis=-1),
        references=gold,
    )

In [None]:
from transformers import Trainer

# Wire together the model, training arguments, tokenized splits, collator,
# tokenizer and metric function into a single Trainer.
train_split = tokenized_datasets["train"]
validation_split = tokenized_datasets["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

----------------
### Finetune Model

In [None]:
# Start fine-tuning with the Trainer configured in the previous section.
trainer.train()

----------------
### Save Model and Tokenizer

In [None]:
# The example encoding `inputs` holds unpadded Python lists of different
# lengths, which the model cannot consume. Re-encode the sample sentences as a
# single padded tensor batch and move it to the device the model lives on.
model_inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
).to(model.device)

# Plain forward pass for inspection only, so gradient tracking is not needed.
with torch.no_grad():
    outputs = model(**model_inputs)

In [None]:
# Persist the tokenizer to the ./content/ directory.
tokenizer.save_pretrained("./content/")

In [None]:
# Persist the model to the same ./content/ directory as the tokenizer.
model.save_pretrained("./content/")