# Llama PyTorch
Run this within the virtual environment **(env_ollama)**!

In [1]:
# Show the interpreter this kernel is actually running. `!which python` only
# reports the first `python` on the subshell's PATH, which can differ from the
# kernel's interpreter when the kernel was registered from another environment.
import sys
print(sys.executable)

/home/matthias/Desktop/MachineLearning/Ollama_Udemy/env_ollama/bin/python


Prepare the `model`, `tokenizer`, and the `device`.

In [2]:
import os
import warnings
import torch
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer
# Read key/value pairs from a local .env file into the process environment.
load_dotenv()
# Silence every Python warning raised from this point on.
warnings.filterwarnings("ignore")
# Hugging Face access token; a missing variable raises KeyError right here.
HF_TOKEN = os.environ["HF_TOKEN"]
# Identifier of the pre-trained checkpoint handed to `from_pretrained`
# (swap in another model name if needed; it must exist on Hugging Face).
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(model_name, token=HF_TOKEN)
# When the tokenizer defines no padding id, reuse the end-of-sequence id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Select the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)
# Place the model on the selected device.
model = model.to(device)
# Bare last expression: the notebook displays the model's module tree.
model

  from .autonotebook import tqdm as notebook_tqdm


cuda


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (ro

Put everything together and generate text.

In [3]:
# Prompt that the model will continue
prompt = "Once upon a time there was a "
# Tokenize the prompt; the result holds both `input_ids` and `attention_mask`
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
# Move both tensors to the same device as the model. Direct indexing is used
# so that a missing key fails with a clear KeyError instead of an
# AttributeError on None.
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)
# Generate with beam search. `do_sample` is left at its default (False), so
# decoding is deterministic and a `temperature` argument would be ignored
# (it only triggers a warning). It is therefore not passed. To get sampled
# output instead, add `do_sample=True` together with `temperature=...`.
with torch.no_grad():  # no gradients are needed for inference
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=300,  # upper bound on prompt tokens + generated tokens
        num_beams=4,
        pad_token_id=tokenizer.pad_token_id,
    )
# Decode the first returned sequence into text, dropping special tokens
output_text = tokenizer.decode(output[0], skip_special_tokens=True)
# Print the generated text
print(output_text)

Once upon a time there was a 12-year-old boy who lived in a small village in the middle of nowhere. He was the son of a farmer and lived with his mother and two sisters. His father had died when he was very young, and his mother worked hard to provide for her family. The boy was a good student, and he loved to read and write. He was also a good athlete, and he enjoyed playing sports with his friends.
One day, the boy was walking home from school when he saw a strange man standing on the side of the road. The man was wearing a long black coat and had a hood pulled over his head. The boy was scared and ran home as fast as he could. When he got home, he told his mother what he had seen, and she told him to go to the police.
The boy went to the police station and told them what he had seen. The police took him to the station and questioned him about the man he had seen. The boy told them that the man was wearing a long black coat and had a hood pulled over his head. The police took him to 

$\checkmark$