In [1]:
!pip -q install datasets loralib sentencepiece
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip -q install git+https://github.com/huggingface/peft.git
!pip -q install bitsandbytes

In [1]:
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoTokenizer, GPTJForCausalLM, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

model = GPTJForCausalLM.from_pretrained(
                                        "EleutherAI/gpt-j-6B",
                                        load_in_8bit=True,
                                        device_map="auto",
                                        )

model = PeftModel.from_pretrained(model, "samwit/dolly-lora")


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /home/ec2-user/anaconda3/envs/pytorch_p39/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/ec2-user/anaconda3/envs/pytorch_p39/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so...


  warn(msg)


In [2]:
def dolly_generate(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"].cuda()

    generation_config = GenerationConfig(
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.2,
    )

    print("Generating...")
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=128,
        pad_token_id = 0,
        eos_token_id = 50256
    )

    for s in generation_output.sequences:
        print(tokenizer.decode(s))

In [3]:
%%time

PROMPT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
What are the differences between alpacas and sheep?
### Response:"""

dolly_generate(PROMPT)

Generating...
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
What are the differences between alpacas and sheep?
### Response:!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
CPU times: user 32.1 s, sys: 312 ms, total: 32.4 s
Wall time: 32.4 s


In [4]:
%%time

PROMPT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Write me a python function that checks if a number is prime.
### Response:"""

dolly_generate(PROMPT)

Generating...
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Write me a python function that checks if a number is prime.
### Response:!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
CPU times: user 31.8 s, sys: 214 ms, total: 32 s
Wall time: 32 s


In [5]:
%%time

PROMPT = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Write an email explaining why GPT-4 should be open source.
### Response:"""

dolly_generate(PROMPT)

Generating...
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Write an email explaining why GPT-4 should be open source.
### Response:!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
CPU times: user 31.8 s, sys: 239 ms, total: 32 s
Wall time: 32 s
