https://huggingface.co/docs/transformers/llm_tutorial
generate() method is available to all models with generative capabilities.
A language model trained for causal language modeling takes a sequence of text tokens as input and returns the probability distribution for the next token.

# Download Llama2 model to HPC

In [3]:
import os
mycache_dir="/data/cmpe249-fa23/Huggingfacecache"
os.environ['TRANSFORMERS_CACHE'] = mycache_dir
os.environ['HF_HOME'] = mycache_dir
os.environ['HF_DATASETS_CACHE'] = mycache_dir

In [1]:
from transformers import AutoTokenizer
import transformers
import torch

modelname = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(modelname)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [2]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(modelname, device_map="auto")

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [4]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained(modelname)

In [5]:
newpath=os.path.join(mycache_dir, "Llama-2-7b-chat-hf")
tokenizer.save_pretrained(newpath)
config.save_pretrained(newpath)
model.save_pretrained(newpath)

In [6]:
!ls $newpath

config.json			  model-00006-of-00006.safetensors
generation_config.json		  model.safetensors.index.json
model-00001-of-00006.safetensors  special_tokens_map.json
model-00002-of-00006.safetensors  tokenizer_config.json
model-00003-of-00006.safetensors  tokenizer.json
model-00004-of-00006.safetensors  tokenizer.model
model-00005-of-00006.safetensors


In [7]:
from transformers import AutoTokenizer, AutoConfig, AutoModel
def loadmodels(model_ckpt, newname):
    #model_ckpt = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)#,cache_dir=mycache_dir)
    config = AutoConfig.from_pretrained(model_ckpt)
    model = AutoModel.from_pretrained(model_ckpt)
    newpath=os.path.join(mycache_dir, newname)
    tokenizer.save_pretrained(newpath)
    config.save_pretrained(newpath)
    model.save_pretrained(newpath)
    print(model)

In [9]:
loadmodels("meta-llama/Llama-2-13b-chat-hf", "Llama-2-13b-chat-hf")

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaModel(
  (embed_tokens): Embedding(32000, 5120)
  (layers): ModuleList(
    (0-39): 40 x LlamaDecoderLayer(
      (self_attn): LlamaAttention(
        (q_proj): Linear(in_features=5120, out_features=5120, bias=False)
        (k_proj): Linear(in_features=5120, out_features=5120, bias=False)
        (v_proj): Linear(in_features=5120, out_features=5120, bias=False)
        (o_proj): Linear(in_features=5120, out_features=5120, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=5120, out_features=13824, bias=False)
        (up_proj): Linear(in_features=5120, out_features=13824, bias=False)
        (down_proj): Linear(in_features=13824, out_features=5120, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): LlamaRMSNorm()
      (post_attention_layernorm): LlamaRMSNorm()
    )
  )
  (norm): LlamaRMSNorm()
)


In [10]:
loadmodels("mistralai/Mistral-7B-Instruct-v0.1", "Mistral-7B-Instruct-v0.1")

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

MistralModel(
  (embed_tokens): Embedding(32000, 4096)
  (layers): ModuleList(
    (0-31): 32 x MistralDecoderLayer(
      (self_attn): MistralAttention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (rotary_emb): MistralRotaryEmbedding()
      )
      (mlp): MistralMLP(
        (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
        (act_fn): SiLUActivation()
      )
      (input_layernorm): MistralRMSNorm()
      (post_attention_layernorm): MistralRMSNorm()
    )
  )
  (norm): MistralRMSNorm()
)


In [11]:
!ls $mycache_dir

arrow
bookcorpus
_data_cmpe249-fa22_Huggingfacecache_emotion_split_1.0.0_cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd.lock
_data_cmpe249-fa22_Huggingfacecache_imdb_plain_text_1.0.0_d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0.lock
_data_cmpe249-fa23_Huggingfacecache_arrow_default-151bc8281cb4d07d_0.0.0_74f69db2c14c2860059d39860b1f400a03d11bf7fb5a8258ca38c501c878c137.lock
_data_cmpe249-fa23_Huggingfacecache_bookcorpus_plain_text_1.0.0_eddee3cae1cc263a431aa98207d4d27fd8a73b0a9742f692af0e6c65afa4d75f.lock
_data_cmpe249-fa23_Huggingfacecache_eli5_LFQA_reddit_1.0.0_17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa.lock
_data_cmpe249-fa23_Huggingfacecache_imdb_plain_text_1.0.0_d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0.lock
distilbert-base-uncased
distilgpt2
distilroberta-base
downloads
eli5
emotion
imdb
Llama-2-13b-chat-hf
Llama-2-7b-chat-hf
Mistral-7B-Instruct-v0.1
models--distilbert-base-uncased
models--distilgpt

# Use the downloaded model

In [2]:
pipeline = transformers.pipeline(
    "text-generation",
    model=modelname,
    torch_dtype=torch.float16,
    device_map="auto",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


In [4]:
sequences = pipeline(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Result: I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?

I also enjoy watching documentaries, but I am always looking for new ones to watch. Have you seen any good ones lately?

Thanks for the chat!

Sincerely,
[Your Name]


In [7]:
sequences = pipeline(
    'Where is San Jose, CA? \n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Result: Where is San Jose, CA? 
San Jose is a city located in the Santa Clara Valley in the southern part of the San Francisco Bay Area of California, United States. It is the third most populous city in California, after Los Angeles and San Diego, and the 10th most populous city in the United States.

What is the climate like in San Jose, CA? 
San Jose has a Mediterranean climate, characterized by mild, wet winters and hot, dry summers. The average temperature in January, the coldest month, is around 50°F (10°C), while the average temperature in July, the warmest month, is around 75°F (24°C). The city experiences a moderate climate year-round, with an average annual rainfall of around 17 inches (43 cm).

What is the population of San Jose, CA? 


In [10]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", device_map="auto")

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNo

In [14]:
model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")
model_inputs

{'input_ids': tensor([[    1,   319,  5665,   310,  3694, 29901, 29871, 29896, 29892, 29871,
         29906]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [18]:
del model_inputs["token_type_ids"]

In [19]:
model_inputs

{'input_ids': tensor([[    1,   319,  5665,   310,  3694, 29901, 29871, 29896, 29892, 29871,
         29906]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [21]:
# By default, the output will contain up to 20 tokens
# Setting `max_new_tokens` allows you to control the maximum length
generated_ids = model.generate(**model_inputs, max_new_tokens=50)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

'A sequence of numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10.\n\nA sequence of letters: A, B, C, D, E, F, G, H, I'

In [22]:
model_inputs = tokenizer("Where is San Jose, CA?", return_tensors="pt").to("cuda")
del model_inputs["token_type_ids"]
generated_ids = model.generate(**model_inputs, max_new_tokens=50)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

'Where is San Jose, CA?\nWhere is San Jose, California located?\nSan Jose is a city located in the Santa Clara Valley in the San Francisco Bay Area of California, United States. It is the largest city in the San Francisco Bay Area and the 10th'

In [23]:
generated_ids = model.generate(**model_inputs, max_new_tokens=150)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

'Where is San Jose, CA?\nWhere is San Jose, California located?\nSan Jose is a city located in the Santa Clara Valley in the San Francisco Bay Area of California, United States. It is the largest city in the San Francisco Bay Area and the 10th largest city in the state of California. San Jose is situated approximately 50 miles (80 kilometers) south of San Francisco and 35 miles (56 kilometers) north of Santa Cruz. The city is strategically located at the intersection of two major highways, Interstate 280 and Interstate 880, and is served by San Jose International Airport. San Jose is known for its vibrant cultural scene, diverse neighborhoods, and thriv'

In [2]:
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig #pip install bitsandbytes


nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

#model_nf4 = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", quantization_config=nf4_config)

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", load_in_4bit=True, device_map="auto")


False

The following directories listed in your path were found to be non-existent: {WindowsPath('C')}
The following directories listed in your path were found to be non-existent: {WindowsPath('vs/workbench/api/node/extensionHostProcess')}
The following directories listed in your path were found to be non-existent: {WindowsPath('/matplotlib_inline.backend_inline'), WindowsPath('module')}
The following directories listed in your path were found to be non-existent: {WindowsPath('/usr/local/cuda/lib64')}
DEBUG: Possible options found for libcudart.so: set()
CUDA SETUP: PyTorch settings found: CUDA_VERSION=118, Highest Compute Capability: 8.6.
CUDA SETUP: To manually override the PyTorch CUDA version please see:https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md
CUDA SETUP: Loading binary c:\Users\lkk68\.conda\envs\mycondapy39\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.so...
argument of type 'WindowsPath' is not iterable
CUDA SETUP: Problem: The 


python -m bitsandbytes


  warn(msg)
  warn(msg)


RuntimeError: 
        CUDA Setup failed despite GPU being available. Please run the following command to get more information:

        python -m bitsandbytes

        Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
        to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
        and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues

In [4]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
            device_map='auto',
            load_in_8bit=True,
            max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB')


The following directories listed in your path were found to be non-existent: {WindowsPath('C')}
The following directories listed in your path were found to be non-existent: {WindowsPath('vs/workbench/api/node/extensionHostProcess')}
The following directories listed in your path were found to be non-existent: {WindowsPath('/matplotlib_inline.backend_inline'), WindowsPath('module')}
The following directories listed in your path were found to be non-existent: {WindowsPath('/usr/local/cuda/lib64')}
DEBUG: Possible options found for libcudart.so: set()
CUDA SETUP: PyTorch settings found: CUDA_VERSION=118, Highest Compute Capability: 8.6.
CUDA SETUP: To manually override the PyTorch CUDA version please see:https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md
CUDA SETUP: Loading binary c:\Users\lkk68\.conda\envs\mycondapy39\lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.so...
argument of type 'WindowsPath' is not iterable
CUDA SETUP: Problem: The main i


python -m bitsandbytes


  warn(msg)
  warn(msg)


RuntimeError: 
        CUDA Setup failed despite GPU being available. Please run the following command to get more information:

        python -m bitsandbytes

        Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
        to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
        and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues

In [9]:
model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")

# By default, the output will contain up to 20 tokens
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]




'A sequence of numbers: 1, 2, 3, 4, 5'

In [10]:
# Setting `max_new_tokens` allows you to control the maximum length
generated_ids = model.generate(**model_inputs, max_new_tokens=50)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

'A sequence of numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,'

In [11]:
# Set seed or reproducibility -- you don't need this unless you want full reproducibility
from transformers import set_seed
set_seed(0)

model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")

# LLM + greedy decoding = repetitive, boring output
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# With sampling, the output becomes more creative!
generated_ids = model.generate(**model_inputs, do_sample=True)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]



'I am a cat.\nI just need to be. I am always.\nEvery time'