- get tokenizer
- get model
- make LoRA model
- get dataset
- prepare dataset and dataloaders
- basic training loop with accelerate
- follow the same overall structure as the NER_fewnerd_LoRA notebook




# CausalLM with `open-orca` and LoRA

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Load the DialoGPT checkpoint of the selected size together with its tokenizer.
model_version = "large" # checkpoint size: "small", "medium" or "large"
tokenizer = AutoTokenizer.from_pretrained(f"microsoft/DialoGPT-{model_version}")
model = AutoModelForCausalLM.from_pretrained(f"microsoft/DialoGPT-{model_version}")
# Last expression of the cell: display the tokenizer repr as the cell output.
tokenizer



GPT2TokenizerFast(name_or_path='microsoft/DialoGPT-large', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [2]:
# Layer dimensions per checkpoint size
# (NOTE(review): presumably the model's output projection / `lm_head` -- confirm with `print(model)`):
# small:  in_features=768,  out_features=50257, bias=False
# medium: in_features=1024, out_features=50257, bias=False
# large:  in_features=1280, out_features=50257, bias=False
# The tokenizer repr of the previous cell lists no pad_token, while the tokenize
# cells call the tokenizer with `padding=True`; register the EOS string as pad token.
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token}) # the pad_token should be ignored, just like the eos_token
# Display the tokenizer again to confirm that `pad_token` is now set.
tokenizer

GPT2TokenizerFast(name_or_path='microsoft/DialoGPT-large', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [23]:
%%time
import transformers
from datasets import load_dataset, DatasetDict
def tokenize(element):
    """Tokenize the ``question`` field of ``element`` with the notebook-level ``tokenizer``.

    The tokenizer is asked to truncate to 128 tokens, to return the overflowing
    tokens as well, and to report lengths. The result is returned unchanged;
    the dataset pipeline later keeps only the first entry of ``input_ids``.
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
        return_length=True,
    )
    return tokenizer(element["question"], **tokenizer_options)
# Build the training corpus: take a subset of OpenOrca, keep only rows with a
# short, "clean" response, tokenize the question and drop every other column.
openorca_dataset = (
    load_dataset("Open-Orca/OpenOrca")["train"]
    # work on the first 800k rows only (500k for distilbert, ??? for LLaMA 3)
    .select(range(800_000))
    # the response has to be a string ...
    .filter(lambda row: isinstance(row["response"], str))
    # ... that is non-empty and at most 768 characters long ...
    .filter(lambda row: 0 < len(row["response"]) <= 768)
    # ... and that starts with a letter (.isalpha) or a digit (.isdigit)
    .filter(lambda row: row["response"][0].isalpha() or row["response"][0].isdigit())
    # tokenize the question
    .map(tokenize)
    # remove nesting: keep only the first entry of the tokenized output
    .map(lambda row: {"input_ids": row["input_ids"][0]})
    # only `input_ids` is needed from here on
    .remove_columns(["id", "system_prompt", "length", "overflow_to_sample_mapping", "attention_mask", "question", "response"])
)
print(openorca_dataset)

Filter:   0%|          | 0/800000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/800000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/547269 [00:00<?, ? examples/s]

Map:   0%|          | 0/502777 [00:00<?, ? examples/s]

Map:   0%|          | 0/502777 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids'],
    num_rows: 502777
})
CPU times: user 9min 4s, sys: 20.1 s, total: 9min 24s
Wall time: 9min 10s


In [22]:
# Carve the tokenized OpenOrca data into train / valid / test splits, plus
# 10 %, 1 % and tiny "dev" subsamples of each split for quick experiments.
#
# Every split is drawn with a fixed seed so that re-running this cell (or
# restarting the kernel) reproduces the same partition. Unseeded splits are
# reshuffled on every run, which can move former test rows into the training
# set and makes results incomparable between runs.
SPLIT_SEED = 42


def _subsample(dataset, size):
    """Return a random subset of `dataset`.

    `size` is forwarded as `test_size` to `train_test_split`: a float is a
    fraction of the rows, an int is an absolute number of rows.
    """
    return dataset.train_test_split(test_size=size, seed=SPLIT_SEED)["test"]


# filter dataset by length if necessary
# 15 % of all rows are held out for testing ...
trainvalid_test_splits = openorca_dataset.train_test_split(test_size=0.15, seed=SPLIT_SEED)
test_split_100 = trainvalid_test_splits["test"]
trainvalid_split = trainvalid_test_splits["train"]
# ... and 15 % of the remaining rows are used for validation.
train_valid_split = trainvalid_split.train_test_split(test_size=0.15, seed=SPLIT_SEED)
valid_split_100 = train_valid_split["test"]
train_split_100 = train_valid_split["train"]

# 10 % and 1 % subsamples of each full split
test_split_10 = _subsample(test_split_100, 0.1)
test_split_1 = _subsample(test_split_100, 0.01)
valid_split_10 = _subsample(valid_split_100, 0.1)
valid_split_1 = _subsample(valid_split_100, 0.01)
train_split_10 = _subsample(train_split_100, 0.1)
train_split_1 = _subsample(train_split_100, 0.01)

# tiny fixed-size subsets (120 / 32 / 8 rows) for smoke-testing the training loop
dev_train_split = _subsample(train_split_100, 120)
dev_valid_split = _subsample(valid_split_100, 32)
dev_test_split = _subsample(test_split_100, 8)

# NOTE(review): the name `fewnerd_dsd` looks like a leftover from the NER
# notebook; it is kept unchanged because other cells may refer to it.
fewnerd_dsd = DatasetDict({
    "train_100": train_split_100,
    "train_10": train_split_10,
    "train_1": train_split_1,
    "valid_100": valid_split_100,
    "valid_10": valid_split_10,
    "valid_1": valid_split_1,
    "test_100": test_split_100,
    "test_10": test_split_10,
    "test_1": test_split_1,
    "train_dev": dev_train_split,
    "valid_dev": dev_valid_split,
    "test_dev": dev_test_split
})
fewnerd_dsd

DatasetDict({
    train_100: Dataset({
        features: ['input_ids'],
        num_rows: 226931
    })
    train_10: Dataset({
        features: ['input_ids'],
        num_rows: 22694
    })
    train_1: Dataset({
        features: ['input_ids'],
        num_rows: 2270
    })
    valid_100: Dataset({
        features: ['input_ids'],
        num_rows: 40047
    })
    valid_10: Dataset({
        features: ['input_ids'],
        num_rows: 4005
    })
    valid_1: Dataset({
        features: ['input_ids'],
        num_rows: 401
    })
    test_100: Dataset({
        features: ['input_ids'],
        num_rows: 47114
    })
    test_10: Dataset({
        features: ['input_ids'],
        num_rows: 4712
    })
    test_1: Dataset({
        features: ['input_ids'],
        num_rows: 472
    })
    train_dev: Dataset({
        features: ['input_ids'],
        num_rows: 120
    })
    valid_dev: Dataset({
        features: ['input_ids'],
        num_rows: 32
    })
    test_dev: Dataset({
      

In [13]:
# Simple three-way split: 25 % test, then 25 % of the remainder for validation.
# NOTE(review): no seed is passed, so presumably each run yields a different partition -- confirm.
trainvalid_test_split = openorca_dataset.train_test_split(test_size=0.25)
test_split = trainvalid_test_split["test"] # 25% for testing
trainvalid_split = trainvalid_test_split["train"] # 75% for training AND validation
train_valid_split = trainvalid_split.train_test_split(test_size=0.25)
valid_split = train_valid_split["test"] # 25% of the 75% = 18.75% of all rows
train_split = train_valid_split["train"] # remaining 56.25% of all rows

from datasets import DatasetDict
# The DatasetDict is only displayed as the cell output; it is not assigned to a name.
DatasetDict({
    "train": train_split,
    "valid": valid_split,
    "test": test_split
})

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 157495
    })
    valid: Dataset({
        features: ['input_ids'],
        num_rows: 52499
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 69999
    })
})

In [3]:
# Load the complete OpenOrca DatasetDict and display its splits, columns and row counts.
ds = load_dataset("Open-Orca/OpenOrca")
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'system_prompt', 'question', 'response'],
        num_rows: 4233923
    })
})

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Interactive smoke test with the "small" DialoGPT checkpoint
# ("medium" and "large" are the other available sizes).
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")

# Chat for five turns; `chat_history_ids` carries the conversation from turn to turn.
for step in range(5):
    # Read one user message, terminate it with the EOS token and encode it as a PyTorch tensor.
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # The first turn has no history yet; every later turn is appended to the history.
    if step == 0:
        bot_input_ids = new_user_input_ids
    else:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)

    # Generate a response while limiting the total chat history to 1000 tokens.
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # Pretty-print only the newly generated tokens, i.e. everything after the prompt.
    print(f"DialoGPT: {tokenizer.decode(chat_history_ids[0, bot_input_ids.shape[-1]:], skip_special_tokens=True)}")



tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

>> User: How are you?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm here


>> User: In which town?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm here


>> User: Are you there?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm here


>> User: This is dull!


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm here


>> User: OK, whatever..


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm here


In [6]:
# Chat for another five turns with the `tokenizer` and `model` currently loaded in the session.
for step in range(5):
    # Encode the new user input (terminated by the EOS token) as a PyTorch tensor.
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')

    # Start from the new message alone and prepend the chat history on every turn after the first.
    bot_input_ids = new_user_input_ids
    if step > 0:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)

    # Generate a response while limiting the total chat history to 1000 tokens.
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # Pretty-print the last output tokens from the bot (everything after the prompt).
    print(f"DialoGPT: {tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)}")

>> User: How have you been?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm here


>> User: 


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm here


>> User: 


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm here


>> User: 


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm here


>> User: Yes, you're doing a really great job. \s


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


DialoGPT: I'm here


In [4]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig
# Build a GPT-2 configuration whose vocabulary size and BOS/EOS ids match the current tokenizer.
# This cell needs `tokenizer` and `context_length` from earlier cells; the NameError
# recorded as its output shows it was executed before `tokenizer` was defined.
# NOTE(review): `context_length` is not assigned in any visible cell -- confirm it is defined before running.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)

NameError: name 'tokenizer' is not defined

In [None]:
# trying it out 2
def tokenize2(element):
    """Tokenize ``element["question"]`` with the notebook-level ``tokenizer``.

    Uses truncation to 128 tokens with overflowing tokens and lengths
    returned, and hands back the tokenizer output as is.
    """
    return tokenizer(
        element["question"],
        padding=True,
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
        return_length=True,
    )
# Second, smaller trial of the preprocessing pipeline (first 100k rows of OpenOrca).
try2_dataset = (
    load_dataset("Open-Orca/OpenOrca")["train"]
    .select(range(100000)) # use subset of data
    .filter(lambda item: isinstance(item["response"], str)) # ensure the response is a string
    # Keep responses that start with a digit (.isdigit) or a letter (.isalpha).
    # Slicing with [:1] instead of indexing with [0] yields "" for an empty
    # response, so such rows are filtered out instead of raising IndexError.
    .filter(lambda item: item["response"][:1].isdigit() or item["response"][:1].isalpha())
    .map(tokenize2) # tokenize
    .map(lambda item: {"input_ids": item["input_ids"][0]}) # remove nesting
    # only `input_ids` is needed from here on
    .remove_columns(["id", "system_prompt", "length", "overflow_to_sample_mapping", "attention_mask", "question", "response"])
)
try2_dataset

In [11]:
# List the contents of the NER results directory via a shell command.
!ls 1_ner/results

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
import os
# Same listing through the standard library; the recorded output is an empty list.
os.listdir("1_ner/results")

[]

In [24]:
# List the files stored for one NER run (roberta-base; settings as encoded in the directory name).
!ls ner_logs/FacebookAI-roberta-base/split=100__r=128__bias=all__loradropout=0

adapter_config.json	   logs_dict.json	    tokenizer_config.json
adapter_model.safetensors  merges.txt		    tokenizer.json
confusion_matrix_abs.png   README.md		    training_loss_plot.png
confusion_matrix_pct.png   special_tokens_map.json  vocab.json


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
# Probe for a `ner` directory; the recorded output shows that it does not exist.
!ls ner

ls: cannot access 'ner': No such file or directory


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
# List the current working directory to locate the project folders and notebooks.
!ls

 1_ner
 2_summarization
 3_causallm
 7.6-Training_a_causal_language_model_from_scratch_Colab.ipynb
 7.6-Training_a_causal_language_model_from_scratch.ipynb
 assets
 basic_sweep
 Blenderbot
 Causal_LM_evaluation.ipynb
 CausalLM_openorca_LoRA.ipynb
 Data
 embeddings.ipynb
 enneo
 env
 Links_to_online_references
 literature
 LLaMA
 NER_fewnerd_LoRA-Copy1.ipynb
 NER_fewnerd_LoRA.ipynb
 NER_fewnerd_LoRA_sweep-backup.ipynb
 NER_fewnerd_LoRA_sweep_functions-backup.ipynb
 NER_fewnerd_LoRA_sweep_functions.ipynb
 NER_fewnerd_LoRA_sweep.ipynb
 NER_fewnerd_LoRA_sweep_old.ipynb
 NER_fewnerd_LoRA_sweep.py
 ner_logs
 OldNotebooks
'Organizing_Hyperparameter_Sweeps_in_PyTorch_with_W&B.ipynb'
 PII-NER_coeo.ipynb
 __pycache__
 PyTorch_GPU-Check.ipynb
 README.md
 requirements.txt
 sections
 sweep_outline
 Sweep-progress
 Sweeps_overview
 Sweep_Walkthrough-Edit.ipynb
 Sweep_Walkthrough.ipynb
 TeX
 utils.py
 wandb


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
# List the contents of the `sections` directory.
!ls sections

section_7


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
