## Step #0: Install and Import Necessary Libraries

In [None]:
# Install the fine-tuning stack. No versions are pinned, so the versions that
# get resolved can differ from one run to the next.
!pip install transformers
!pip install datasets
!pip install accelerate
# trl plus transformers/accelerate again, and peft built from its GitHub repository
# (-U upgrades already-installed packages, -qqq silences pip output).
!pip install trl transformers accelerate git+https://github.com/huggingface/peft.git -Uqqq
!pip install bitsandbytes einops wandb -Uqqq
# Metric packages for the evaluation cell.
!pip install evaluate
!pip install rouge_score
!pip install rouge

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)


In [None]:
import pandas as pd
import json
import re
import torch
import glob
import numpy as np
from trl import SFTTrainer, SFTConfig
from datasets import Dataset, load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, GenerationConfig, pipeline
from peft import get_peft_model, PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training
import evaluate

In [None]:
# you will need to create a Hugging Face account if you do not have one,
# and then generate a write token to enter in the widget below.
# (The training arguments later in this notebook set push_to_hub=True, hence the write scope.)
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Step #1: Initialize TinyLlama Model

In [None]:
# Checkpoint to load; this name is reused when the tokenizer is created.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
#model_name = "kvwong/tinyllama" # alternative checkpoint: uncomment this line and comment out the one above to use it; keep the line above to train on a new TinyLlama instance

# Quantization settings handed to the model loader: 4-bit weights, "nf4"
# quantization type, double quantization enabled, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_use_double_quant=True,
  bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the causal LM with the quantization config above and let the library
# place it on the available device(s).
# NOTE(review): trust_remote_code=True permits executing code shipped with the
# checkpoint repository -- confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  quantization_config=bnb_config,
  device_map="auto",
  trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# creating tokenizer and defining the pad token
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
  # Batching needs a pad token; reuse EOS when the checkpoint does not define
  # one. Checkpoints that already ship a pad token are left untouched.
  tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

## Step #2: Define Custom Dataset


In [None]:
# import the json lines train and test dataset splits
def load_jsonl_dataset(path):
  """Read a JSON Lines file into a `Dataset`.

  Args:
    path: Path to a file holding one JSON object per line.

  Returns:
    A `Dataset` built from a pandas DataFrame with one row per JSON object.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # Decode explicitly as UTF-8 (the summaries contain non-ASCII characters)
  # and iterate the file object so records are split on newlines only:
  # str.splitlines() also breaks on characters such as U+2028, which may
  # legally appear inside a JSON string. Blank lines are skipped.
  with open(path, encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]
  return Dataset.from_pandas(pd.DataFrame(records))

train_dataset = load_jsonl_dataset('liscu_train.jsonl')
print(train_dataset)

test_dataset = load_jsonl_dataset('liscu_test.jsonl')
print(test_dataset)

Dataset({
    features: ['book_title', 'source', 'character_name', 'summary', 'description', 'masked_description'],
    num_rows: 7600
})
Dataset({
    features: ['book_title', 'source', 'character_name', 'summary', 'description', 'masked_description'],
    num_rows: 957
})


In [None]:
# Shuffle both splits with a fixed seed. The training split keeps every row;
# the test split keeps only the first 100 rows of its shuffled order.
train_dataset = train_dataset.shuffle(seed=42)
test_dataset = test_dataset.shuffle(seed=42).select(range(100))

system_prompt = "You are a chatbot that must output a character description for the given name using the text summary. Analyze/infer information about the character. Do not be conversational; just provide the description."
def transform_data(example, isTest):
  """Render one dataset row as a chat-formatted string.

  Args:
    example: Row mapping with 'character_name' and 'summary' keys, plus
      'description' when `isTest` is false.
    isTest: When true, the assistant turn is left empty and a generation
      prompt is appended so the model can continue from it. When false, the
      gold description fills the assistant turn and nothing is appended.

  Returns:
    A dict with a single 'text' key holding the rendered conversation.
  """
  character_name = example['character_name']
  summary = example['summary']
  # Test rows must not carry the reference answer inside the prompt.
  description = "" if isTest else example['description']

  messages = [
    {
      "role": "system",
      "content": system_prompt,
    },
    {"role": "user", "content": f"Character name: \'{character_name}\'. Text summary: \'{summary}\'. Output a description for the character."},
    {"role": "assistant", "content": description}
  ]

  # Render straight to text instead of tokenizing and decoding again: the
  # round trip can perturb whitespace around special tokens and triggers
  # sequence-length warnings for long summaries.
  # The generation prompt is added for test rows only; a training row already
  # ends with the complete assistant answer and should not be followed by a
  # dangling assistant header.
  # NOTE(review): test prompts still contain the empty assistant turn before
  # the generation prompt. That shape is kept on purpose because code that
  # parses generations may rely on the assistant marker appearing twice.
  text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=isTest)
  return {'text': text}

# apply the transformation: training rows keep their description in the
# assistant turn (isTest=False), test rows get an empty one (isTest=True)
transformed_train = train_dataset.map(transform_data, fn_kwargs={"isTest": False})
transformed_test = test_dataset.map(transform_data, fn_kwargs={"isTest": True})
# show one rendered training example as a sanity check
print(transformed_train[0]['text'])

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2265 > 2048). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

<|system|>
You are a chatbot that must output a character description for the given name using the text summary. Analyze/infer information about the character. Do not be conversational; just provide the description.</s> 
<|user|>
Character name: 'Alceste'. Text summary: 'The Misanthrope is a pint-sized play, so we're going to make this a nano-sized summary. There's this guy named Alceste, and he hates people because he thinks they're too insincere. His BFF Philinte tries to keep him out of trouble, but Alceste is a wily fella. By the way, Alceste likes this lady called Célimène, who is his polar opposite, and Philinte likes a lady named Éliante who likes Alceste and is what he would be like if he were a lady and not crazy. Oh, and Alceste has some kind of lawsuit against him. Got it all? Good. Alceste meets Oronte, who wants to be his friend. Instead, Alceste insults his poetry. (Way to make nice, Alceste.) After Oronte storms away, Alceste tries (unsuccessfully) to get Célimène to pro

## Step #3: Train/Finetune TinyLlama Model

In [None]:
# Prepare the quantized model for k-bit training before attaching adapters.
# Note this rebinds `model`, so the cell is not meant to be re-run on its own.
model = prepare_model_for_kbit_training(model)

# setting arguments for low-rank adaptation
lora_alpha = 32
lora_dropout = 0.05
lora_rank = 32  # LoRA rank

peft_config = LoraConfig(
  lora_alpha=lora_alpha,
  lora_dropout=lora_dropout,
  r=lora_rank,
  bias="none",
  task_type="CAUSAL_LM")

# Wrap the prepared model with the LoRA configuration above.
peft_model = get_peft_model(model, peft_config)

In [None]:
# setting training arguments
output_dir = "tommyadams/tinyllama" # where you want to save your model
per_device_train_batch_size = 3   # batch size
gradient_accumulation_steps = 4
optim = "adamw_8bit"
save_strategy = "steps"  # checkpoint on a step interval, every `save_steps` steps
save_steps = 10
logging_steps = 10
learning_rate = 3e-5  # learning rate
max_grad_norm = 0.3
max_steps = 200     # number of training steps
warmup_ratio = 0.03
lr_scheduler_type = "cosine"

# Every tunable defined above is forwarded explicitly, so editing a value in
# this cell is guaranteed to reach the trainer (save_strategy was previously
# defined but never passed on).
training_arguments = SFTConfig(
  output_dir=output_dir,
  per_device_train_batch_size=per_device_train_batch_size,
  gradient_accumulation_steps=gradient_accumulation_steps,
  optim=optim,
  save_strategy=save_strategy,
  save_steps=save_steps,
  logging_steps=logging_steps,
  learning_rate=learning_rate,
  max_grad_norm=max_grad_norm,
  max_steps=max_steps,
  warmup_ratio=warmup_ratio,
  lr_scheduler_type=lr_scheduler_type,
  push_to_hub=True,   # requires the Hub login performed earlier in the notebook
  report_to='none'
)

In [None]:
# Build the supervised fine-tuning trainer over the rendered 'text' column of
# the training split, using the LoRA-wrapped model and the arguments above.
# Passing max_seq_length / dataset_text_field here (rather than through
# SFTConfig) is what produces the deprecation message in this cell's output.
# NOTE(review): the dataset map step logged sequences longer than 2048 tokens,
# so max_seq_length=500 presumably truncates many examples before their
# assistant answer -- confirm this is intended.
trainer = SFTTrainer(
  model=peft_model,
  train_dataset=transformed_train,
  peft_config=peft_config,
  max_seq_length=500,
  dataset_text_field='text',
  tokenizer=tokenizer,
  args=training_arguments
)
# Turn the model's cache setting off for training.
peft_model.config.use_cache = False


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run fine-tuning; the run length is bounded by max_steps from the training arguments.
trainer.train()



Step,Training Loss
10,2.5457
20,2.5072
30,2.4712
40,2.4167
50,2.3965
60,2.3665
70,2.2815
80,2.2311
90,2.1749
100,2.1316




TrainOutput(global_step=200, training_loss=2.2056548976898194, metrics={'train_runtime': 1117.1317, 'train_samples_per_second': 2.148, 'train_steps_per_second': 0.179, 'total_flos': 7480929484800000.0, 'train_loss': 2.2056548976898194, 'epoch': 0.31570639305445936})

## Step #4: Test the Fine-Tuned TinyLlama Model

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
def generate_description(query, model):
  """Generate a character description for an already-rendered chat prompt.

  Args:
    query: Prompt text, ending where the model should start writing.
    model: Causal language model exposing `generate`.

  Returns:
    The newly generated text only (prompt excluded), with special tokens
    removed and surrounding whitespace stripped.
  """
  encoding = tokenizer(query, return_tensors="pt").to(device)
  generation_config = GenerationConfig(max_new_tokens=250, pad_token_id = tokenizer.eos_token_id,repetition_penalty=1.3, eos_token_id = tokenizer.eos_token_id)
  # Pass the attention mask along with the ids: pad and EOS share an id here,
  # so the mask cannot be reliably inferred from the ids alone.
  outputs = model.generate(
    input_ids=encoding["input_ids"],
    attention_mask=encoding.get("attention_mask"),
    generation_config=generation_config,
  )
  # A causal LM returns the prompt followed by the continuation. Dropping the
  # prompt by token count is exact, whereas searching the decoded text for a
  # role marker breaks whenever the marker is absent or appears in the summary.
  prompt_length = encoding["input_ids"].shape[-1]
  completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
  return completion.strip()

In [None]:
# generating character descriptions with fine-tuned model
# Leave training mode before generating: while the model is in training mode,
# gradient checkpointing forces the key/value cache off (making every decoding
# step re-process the whole prompt) and LoRA dropout stays active, which makes
# the outputs noisier.
model.eval()

response_to_desc = {}
post_finetuning = []
print(transformed_test[0]['text'])
for idx, item in enumerate(transformed_test, start=1):
  input_text = item['text']  # 'text' is the field that stores the input query
  response = generate_description(input_text, model)
  if idx % 10 == 0: # only print every 10 generated descriptions
    print(response)

  post_finetuning.append(response)
  # NOTE(review): this mapping is keyed by the generated text, so two identical
  # generations overwrite each other's reference. It is still populated because
  # later cells may look references up through it; `post_finetuning` preserves
  # the row order of `transformed_test` and is the reliable way to pair them.
  response_to_desc[response] = item['description']

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


KeyboardInterrupt: 

In [None]:
# calculate ROUGE and BLEU scores for the generated descriptions
if not post_finetuning:
  raise ValueError("post_finetuning is empty: run the generation cell before computing metrics")

# Pair predictions with the human-written descriptions by position. The
# generation loop walks `transformed_test` in order, so the i-th prediction
# belongs to the i-th row; slicing also covers a run that was interrupted
# early. (Looking references up by generated text mis-pairs rows whenever two
# generations are identical.)
reference_texts = list(transformed_test['description'])[:len(post_finetuning)]
references = [[reference] for reference in reference_texts] # Human-written descriptions

# compute ROUGE-n and ROUGE-L scores
rouge = evaluate.load('rouge')
rouge_results = rouge.compute(predictions=post_finetuning, references=references)
print(rouge_results)
print("ROUGE-1 score after finetuning:", rouge_results['rouge1'])
print("ROUGE-2 score after finetuning:", rouge_results['rouge2'])
print("ROUGE-L score after finetuning:", rouge_results['rougeL'])

# compute BLEU score
# `evaluate.load` replaces the deprecated `datasets.load_metric`. It takes raw
# strings and tokenizes internally; `str.split` keeps the whitespace
# tokenization this notebook used before.
bleu = evaluate.load('bleu')
bleu_results = bleu.compute(predictions=post_finetuning, references=references, tokenizer=str.split)
print("BLEU score after finetuning:", bleu_results['bleu'])

In [None]:
# disconnect from runtime to save on compute units
# (google.colab is only importable inside Colab, so this cell fails elsewhere)
from google.colab import runtime
runtime.unassign()