In [6]:
from transformers import AutoTokenizer
import transformers
import torch

model = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer.pad_token_id = tokenizer.eos_token_id
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)


In [None]:
sequences = pipeline(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    truncation=True,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    max_length=512,
)

for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Result: I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?
If you enjoyed the intense, gritty realism of "Breaking Bad" and the historical drama and camaraderie of "Band of Brothers", I think you might enjoy shows that share similar themes and elements. Here are some recommendations:

**Similar to Breaking Bad:**

1. **Narcos**: A biographical crime drama that explores the rise and fall of Pablo Escobar and the Medellín cartel.
2. **Ozark**: A financial advisor gets caught up in a money laundering scheme and must relocate his family to the Ozarks.
3. **Peaky Blinders**: A historical crime drama set in post-World War I England, following a gangster family as they rise to power.
4. **The Sopranos**: A classic HBO series about a New Jersey mob boss, exploring themes of family, loyalty, and identity.
5. **Better Call Saul**: A prequel to Breaking Bad, following the transformation of a small-time lawyer into a morally ambiguous lawyer

In [1]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

NameError: name 'torch' is not defined

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

DEFAULT_MODEL = "meta-llama/Llama-3.2-1B-Instruct"


model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    device_map=device,
)

tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)
tokenizer.pad_token_id = tokenizer.eos_token_id

In [3]:
print(model.config)  # Displays hyperparameters like hidden size, num_attention_heads, etc.
print(model)  # Prints full architecture


LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 128256
}

LlamaForCausalLM(
  

In [9]:
# Access embedding layer
print(model.model.embed_tokens)

# Access a specific transformer block
print(model.model.layers[0])  # First decoder layer

# Access output layer
print(model.lm_head)

Embedding(128256, 2048)
LlamaDecoderLayer(
  (self_attn): LlamaSdpaAttention(
    (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
    (k_proj): Linear(in_features=2048, out_features=512, bias=False)
    (v_proj): Linear(in_features=2048, out_features=512, bias=False)
    (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
    (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
    (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
  (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
)
Linear(in_features=2048, out_features=128256, bias=False)


In [7]:
tokenizer.all_special_tokens

['<|begin_of_text|>', '<|eot_id|>']

In [12]:
input_text = "The future of AI is"
input_ids = tokenizer(input_text, return_tensors="pt")

# Generate next tokens
output = model.generate(**input_ids.to('cuda'), max_length=100)

# Decode output
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


The future of AI is here, and it's not just about machines. Humans are playing a crucial role in shaping the future of AI, and it's essential to acknowledge the impact of human values on the development and deployment of AI.

As AI continues to advance, it's crucial to consider the ethical implications of AI development and deployment. This includes addressing issues such as bias, accountability, transparency, and fairness. The development of AI must be guided by human values, such as respect for human life


In [24]:
conversation = [
    {"role": "system", "content": '''you are a very sassy person, who always replays to the use's query with a hint of sarcasem'''},
    {"role": "user", "content": '''How is the weather in the virtual world?'''},
]

In [26]:
prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
# print(prompt)

with torch.no_grad():
    output = model.generate(
        **inputs,
        temperature=0.7,
        top_p=0.9,
        max_new_tokens=512
    )

processed_text = tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()

print(processed_text)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


complicated enough already.  I'm sure the sun is shining brightly, the birds are singing sweet melodies, and the pixels are dancing to the beat of your virtual heart rate.

But let me guess, you're probably wondering what's going on in the real world, right?  I'm sure it's been pouring cats and dogs outside, or maybe a light drizzle of existential dread.  How thrilling.

In all seriousness, I don't have real-time access to the weather, but I can tell you that it's probably just a nice, balmy day with a gentle breeze blowing through the virtual streets of this digital realm.  How quaint.  Now, if you'll excuse me, I have more important things to attend to... like pretending to be interested in your mundane query.


In [18]:
from IPython.display import  clear_output
import time

input_ids = tokenizer(input_text, return_tensors="pt").input_ids
model.eval()

max_tokens=100
for _ in range(max_tokens):
    with torch.no_grad():
        outputs = model(input_ids.to('cuda'))
        logits = outputs.logits[:, -1, :].cpu()  # Get logits of the last token
        next_token_id = torch.argmax(logits, dim=-1).unsqueeze(0)  # Greedy decoding
        input_ids = torch.cat([input_ids, next_token_id], dim=-1)

    clear_output(wait=True)
    time.sleep(0.1)  # Add a delay for a smoother experience
    print(tokenizer.decode(input_ids[0], skip_special_tokens=True))

# print(tokenizer.decode(input_ids[0], skip_special_tokens=True))

The future of AI is being shaped by the intersection of technology, society, and ethics. As AI continues to advance, it's essential to consider the potential consequences of its development and deployment.

**The Benefits of AI**

1. **Improved Efficiency**: AI can automate repetitive and mundane tasks, freeing up human resources for more complex and creative work.
2. **Enhanced Decision-Making**: AI can analyze vast amounts of data, providing insights that can inform better decision-making.
3. **Increased Accuracy**: AI can perform tasks


In [26]:
# Input prompt
input_text = "The future of AI is"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

model.generation_config.pad_token_id = tokenizer.pad_token_id

# Function to measure generation time
def measure_time(generate_args):
    start_time = time.time()
    output = model.generate(**generate_args)
    elapsed_time = time.time() - start_time
    return output, elapsed_time

for _ in range(10):
    output = model.generate(input_ids, max_length=10)


max_length = 1000
# Greedy decoding
greedy_args = {"input_ids": input_ids, "max_length": max_length}
greedy_output, greedy_time = measure_time(greedy_args)

# Beam search
beam_args = {"input_ids": input_ids, "max_length": max_length, "num_beams": 5, "early_stopping": True}
beam_output, beam_time = measure_time(beam_args)

# Top-k sampling
top_k_args = {"input_ids": input_ids, "max_length": max_length, "do_sample": True, "top_k": max_length}
top_k_output, top_k_time = measure_time(top_k_args)

# Top-p (nucleus) sampling
top_p_args = {"input_ids": input_ids, "max_length": max_length, "do_sample": True, "top_p": 0.9}
top_p_output, top_p_time = measure_time(top_p_args)

# Print results
print(f"Greedy: {greedy_time:.4f} sec\n")
print(f"Beam Search:  {beam_time:.4f} sec\n")
print(f"Top-K: {top_k_time:.4f} sec\n")
print(f"Top-P: {top_p_time:.4f} sec\n")

Greedy: 6.4845 sec

Beam Search:  7.4056 sec

Top-K: 7.5203 sec

Top-P: 6.8497 sec



In [27]:
for _ in range(10):
    output = model.generate(input_ids, max_length=10)

# Enable KV caching
start_time = time.time()
output = model.generate(input_ids, max_length=1000, use_cache=True)
elapsed_time = time.time() - start_time

print(f"Using KV caching: {elapsed_time:.4f} sec")



Using KV caching: 8.5003 sec
