# Instruct finetune

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import setup_chat_format

model = AutoModelForCausalLM.from_pretrained("pauhidalgoo/cucafera")
tokenizer = AutoTokenizer.from_pretrained("pauhidalgoo/cucafera")

In [3]:
tokenizer.add_special_tokens({"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "additional_special_tokens": ["<im_start>","<im_end>","<fim_prefix>","<fim_middle>","<fim_end>", "<mask>"]})

1

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import load_dataset
from datasets import load_dataset, DatasetDict
from collections import Counter
from datasets import concatenate_datasets
from trl import SFTConfig, SFTTrainer
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
import re
import torch

Insturctions

In [6]:
model, tokenizer = setup_chat_format(model, tokenizer)

## Processing datasets

In [14]:
patufet_it = load_dataset("baiges/patufet-IT", split="train")
alpacat = load_dataset("baiges/alpaCat", split="train")
patufet_qa = load_dataset("baiges/patufet-QA", split="train")
patufet_escrits = load_dataset("pauhidalgoo/patufet-escrits", split="train")
patufet_human = load_dataset("baiges/patufet-human-interactions", split="train")
patufet_summaries= load_dataset("baiges/patufet-summaries", split="train")

In [15]:
def detect_repetitions(text, ngram_size=5, repetition_threshold=15):
    words = re.findall(r'\b\w+\b', text)

    ngrams = [' '.join(words[i:i+ngram_size]) for i in range(len(words) - ngram_size + 1)]

    ngram_counts = Counter(ngrams)

    for ngram, count in ngram_counts.items():
        if count >= repetition_threshold:
            return True

    return False

def filter_repetitive_examples(example):
    return not detect_repetitions(example['output'])
patufet_it = patufet_it.filter(filter_repetitive_examples)

In [16]:
def format_example(example):
    if example['input'] == None:
        example['input'] = ''
    prompt = f"{example['instruction']}\n{example['input']}"
    completion = example['output']
    return {"completion": completion, "prompt": prompt}

patufet_it_formatted = patufet_it.map(format_example, remove_columns=['instruction', 'input', 'output'])


In [17]:
def format_example(example):
    prompt = f"{example['question']}"
    completion = example['answer']
    return {"completion": completion, "prompt": prompt}

patufet_qa_formatted = patufet_qa.map(format_example, remove_columns=['question','answer', 'field','topic','subtopic','chapter','subunit'])


In [18]:
patufet_qa_formatted = patufet_qa_formatted.shuffle(seed=42).select(range(100000))

In [19]:
rename_mapping = {
    'prompt': 'prompt',
    'output': 'completion',
}

def rename_columns(example):
    return {rename_mapping.get(k, k): v for k, v in example.items()}

patufet_human = patufet_human.map(rename_columns)

In [20]:
rename_mapping = {
    'prompt': 'prompt',
    'summary': 'completion',
}

def rename_columns(example):
    return {rename_mapping.get(k, k): v for k, v in example.items()}

patufet_summaries = patufet_summaries.map(rename_columns)

In [21]:
dataset_instructions = concatenate_datasets([patufet_it_formatted, patufet_qa_formatted, patufet_escrits, patufet_summaries, patufet_human])
dataset_instructions = dataset_instructions.shuffle()
dataset_instructions = dataset_instructions.remove_columns(["summary", "output"])

## Training

In [25]:
sft_config = SFTConfig(
    max_seq_length=2048,
    output_dir="/tmp",
    logging_steps=300,
    num_train_epochs=8,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=300,
    logging_dir="./logs",
    fp16=True,
)
trainer = SFTTrainer(
    model,
    tokenizer= tokenizer,
    train_dataset=dataset_instructions,
    args=sft_config,
    packing = True,
)
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
300,2.0508
600,1.9046
900,1.8693
1200,1.8269
1500,1.7424
1800,1.7371
2100,1.7285
2400,1.69
2700,1.617
3000,1.621


TrainOutput(global_step=9232, training_loss=1.5024383055688604, metrics={'train_runtime': 6352.8305, 'train_samples_per_second': 17.427, 'train_steps_per_second': 1.453, 'total_flos': 2.64860256864043e+17, 'train_loss': 1.5024383055688604, 'epoch': 8.0})

In [26]:
trainer.save_model("./model")
model = trainer.model


Saving the model

In [27]:
import os

copy_dir = './instruct_model'

if not os.path.exists(copy_dir):
    os.makedirs(copy_dir)

model.save_pretrained(copy_dir)
tokenizer.save_pretrained(copy_dir)


('./instruct_model/tokenizer_config.json',
 './instruct_model/special_tokens_map.json',
 './instruct_model/tokenizer.json')

In [34]:
from transformers import AutoModel, AutoTokenizer

copied_model = AutoModel.from_pretrained(copy_dir)
copied_tokenizer = AutoTokenizer.from_pretrained(copy_dir)


Uploading to huggingface

In [36]:
from huggingface_hub import HfApi, HfFolder, Repository, notebook_login

notebook_login()

repo_name = "pauhidalgoo/cucafera-instruct"
copied_model.push_to_hub(repo_name)
copied_tokenizer.push_to_hub(repo_name)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/980M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pauhidalgoo/cucafera-instruct/commit/dbeaff2aa94554cf0fad21cfac782a8b34cea9ea', commit_message='Upload tokenizer', commit_description='', oid='dbeaff2aa94554cf0fad21cfac782a8b34cea9ea', pr_url=None, pr_revision=None, pr_num=None)

# Chat finetunning

In [28]:
tokenizer.pad_token = "<pad>"

In [29]:
patufet_conversa = load_dataset("pauhidalgoo/patufet-conversa", split="train")

Downloading readme:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/33.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/29044 [00:00<?, ? examples/s]

Processing dataset

In [30]:
import re
import random
from multiprocessing import cpu_count

def apply_chat_template(example, tokenizer):
    messages = example["messages"]
    # We add an empty system message if there is none
    if messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)

    return example

column_names = list(patufet_conversa.features)
patufet_conversa_process = patufet_conversa.map(apply_chat_template,
                                num_proc=cpu_count()//2,
                                fn_kwargs={"tokenizer": tokenizer},
                                remove_columns=column_names,
                                desc="Applying chat template",)



Applying chat template (num_proc=127):   0%|          | 0/29044 [00:00<?, ? examples/s]

We train the model on completions only (just the assistant responses)

In [32]:
response_template = "<|im_start|>assistant"
instruction_template = '<|im_start|>user'
collator = DataCollatorForCompletionOnlyLM(instruction_template = instruction_template, response_template=response_template, tokenizer=tokenizer)


In [32]:
def formatting_prompts_func(example):
    messages = example["messages"]
    # We add an empty system message if there is none
    if messages[0]["role"] != "system":
        messages.insert(0, {"role": "system", "content": ""})
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return output_texts

In [37]:
sft_config = SFTConfig(
    max_seq_length=2048,
    output_dir="/tmp",
    logging_steps=300,
    num_train_epochs=3,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=300,
    logging_dir="./logs2",
    fp16=False,
    save_strategy="no",
)
trainer = SFTTrainer(
    model,
    tokenizer= tokenizer,
    train_dataset=patufet_conversa_process,
    args=sft_config,
    dataset_text_field="text",
    packing = False,
    data_collator=collator,
)
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/29044 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
300,2.0821
600,1.9117
900,1.837
1200,1.8181
1500,1.794
1800,1.7702
2100,1.7467
2400,1.7526
2700,1.7314
3000,1.65


TrainOutput(global_step=8715, training_loss=1.575867317711835, metrics={'train_runtime': 1492.7864, 'train_samples_per_second': 58.369, 'train_steps_per_second': 5.838, 'total_flos': 4.621398230603059e+16, 'train_loss': 1.575867317711835, 'epoch': 3.0})

In [41]:
trainer.save_model("./model-chat")
model_chat = trainer.model

Publishing to huggingface

In [42]:
from huggingface_hub import HfApi, HfFolder, Repository, notebook_login

notebook_login()

repo_name = "pauhidalgoo/cucafera-chat"
model_chat.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/980M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/pauhidalgoo/cucafera-chat/commit/d9445f69e46cb6249028db1e4e3a8476f8e10d28', commit_message='Upload tokenizer', commit_description='', oid='d9445f69e46cb6249028db1e4e3a8476f8e10d28', pr_url=None, pr_revision=None, pr_num=None)

# Testing LORA (didn't work)

We wanted to try doing LORA to prevent catastrophic forgetting, but the dataset wasn't correctly formated for it and in the end we didn't do it because we already had a chat version that worked. :)

In [45]:
from transformers import AutoModel, AutoTokenizer

copied_model = AutoModel.from_pretrained(copy_dir)
copied_tokenizer = AutoTokenizer.from_pretrained(copy_dir)

In [46]:
from peft import get_peft_model, LoraConfig, PeftModel

In [47]:

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=128,
    lora_alpha=128,
    lora_dropout=0.1,
)

In [48]:
new_model = get_peft_model(copied_model, lora_config)


In [53]:
from transformers import DataCollatorForLanguageModeling

In [56]:
response_template = "<|im_start|>assistant"
instruction_template = '<|im_start|>user'
collator = DataCollatorForLanguageModeling( tokenizer=copied_tokenizer, mlm=False)


In [61]:
trainer = SFTTrainer(
    new_model,
    train_dataset=patufet_conversa_process,
    dataset_text_field= "text",
    args=SFTConfig(output_dir="./cat-gpt-peft",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=6e-5,  # Higher learning rate for LoRA
    weight_decay=0.01,
    warmup_steps=300,
    fp16=True,max_seq_length=2048),
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/29044 [00:00<?, ? examples/s]

KeyError: 'text'

In [60]:
trainer.train()

TypeError: LlamaModel.forward() got an unexpected keyword argument 'labels'

In [None]:
os.makedirs("lora", exist_ok=True)
new_model.save_pretrained('./lora')
copied_tokenizer.save_pretrained('./lora')

merged_model = new_model.merge_and_unload()
merged_model.save_pretrained('./cucafera-peft-merged')