In [None]:
!pip install --upgrade pip
!pip install bitsandbytes
!pip install -q datasets loralib sentencepiece
!pip uninstall transfomers
!pip install --upgrade git+https://github.com/zphang/transformers.git@llama_push
!pip install -q git+https://github.com/huggingface/peft.git

In [None]:
from peft import PeftModel

from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

# Hub ids of the base checkpoint and of the LoRA adapter applied on top of it.
BASE_MODEL_ID = "decapoda-research/llama-7b-hf"
LORA_ADAPTER_ID = "bertin-project/bertin-alpaca-lora-7b"

# Tokenizer and base weights are both loaded from the same checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL_ID)

# Load the base model in 8-bit and let device_map="auto" decide the placement.
base_model = LlamaForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    load_in_8bit=True,
)

# Wrap the base model with the adapter; `model` is what later cells use.
model = PeftModel.from_pretrained(base_model, LORA_ADAPTER_ID)


In [None]:
from datasets import load_dataset

# Load the first 5000 rows of the "train_asks" split of ELI5.
eli5 = load_dataset("eli5", split="train_asks[:5000]")
# Hold out 20% as a test split. The split shuffles the rows, so a fixed seed is
# passed to get the same train/test partition on every Restart & Run All.
eli5 = eli5.train_test_split(test_size=0.2, seed=42)

In [None]:
# Flatten nested columns into top-level ones; preprocess_function below reads
# the resulting "answers.text" column.
eli5 = eli5.flatten()
def preprocess_function(examples):
    """Tokenize the answer texts of a batch of flattened examples.

    Args:
        examples: Batch mapping whose "answers.text" entry holds one item per
            example; each item is assumed to be a list of answer strings
            (TODO confirm against the dataset schema).

    Returns:
        Whatever ``tokenizer`` returns when called on the list of texts, one
        text per example.
    """
    # An example can carry several answers. Join them into one string so the
    # tokenizer receives a flat list of texts (one per example) instead of a
    # nested list of strings.
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

# Column names of the train split before tokenization; they are dropped from
# the mapped dataset so only what preprocess_function returns is kept.
original_columns = eli5["train"].column_names

# Tokenize in batches, spread over 4 worker processes.
tokenized_eli5 = eli5.map(
    preprocess_function,
    remove_columns=original_columns,
    batched=True,
    num_proc=4,
)
# Number of tokens in every chunk produced by group_texts.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into blocks.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (one inner list per example). Must contain an "input_ids" key.

    Returns:
        A dict with the same keys, where each value is a list of chunks of
        ``block_size`` items, plus a "labels" key holding a shallow copy of
        the "input_ids" chunks. If the batch holds fewer than ``block_size``
        items in total, a single shorter chunk is returned instead.
    """
    # Imported here so the function carries its own dependency.
    from itertools import chain

    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(lists, []) re-copies the accumulated list on every addition.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # For causal language modeling the labels are the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup the tokenized data into fixed-size chunks with group_texts, batch by
# batch, using 4 worker processes.
lm_dataset = tokenized_eli5.map(
    group_texts,
    num_proc=4,
    batched=True,
)


In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Collator for language modeling with masked-LM behaviour switched off.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

In [None]:
# Read a prompt interactively and tokenize it into PyTorch tensors.
prompt = input("Enter prompt: ")
inputs = tokenizer(prompt, return_tensors="pt")

# Move the prompt's token ids onto the GPU.
input_ids = inputs["input_ids"].cuda()

# Decoding settings, built up front so the generate() call stays short.
# NOTE(review): temperature/top_p presumably only take effect when sampling is
# enabled; with num_beams=4 and no do_sample they may be ignored — confirm.
generation_config = GenerationConfig(temperature=0.2, top_p=0.75, num_beams=4)

# Generate up to 150 new tokens and return a structured result that also
# carries the per-step scores.
generation_output = model.generate(
    input_ids=input_ids,
    generation_config=generation_config,
    max_new_tokens=150,
    output_scores=True,
    return_dict_in_generate=True,
)

In [None]:
# Decode every generated token sequence back to text and print it.
for token_ids in generation_output.sequences:
    output = tokenizer.decode(token_ids)
    print(output)