# HuggingFace
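Run this within the virtual environment **(env_ollama)**! The notebook loads a Llama causal language model from the Hugging Face Hub and generates a continuation for a short prompt; the Hub access token is read from the `HF_TOKEN` environment variable (for example via a `.env` file).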

In [1]:
# Show the interpreter the kernel is actually running on. A shell `which python`
# only reports the first `python` on the subshell's PATH, which is not guaranteed
# to be the kernel's interpreter.
import sys
print(sys.executable)

/home/matthias/Desktop/MachineLearning/Ollama_Udemy/env_ollama/bin/python


In [2]:
import os
import torch
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load a causal language model from the Hugging Face Hub, sample a continuation
# for a fixed prompt, and print the decoded text.

# --- Credentials ---
# load_dotenv() makes variables from a .env file visible to os.getenv().
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")  # None when the variable is not set

# --- Environment report ---
print("Torch version:", torch.__version__)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print("Device:", device)
if use_cuda:
    print("GPU:", torch.cuda.get_device_name(0))

# --- Reproducibility ---
# Generation below samples (do_sample=True); seeding torch makes a
# Restart & Run All produce the same text on the same setup.
SEED = 42
torch.manual_seed(SEED)

# --- Tokenizer and model ---
# Half precision on GPU, full precision on CPU.
dtype = torch.float16 if use_cuda else torch.float32
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN, use_fast=True)
# Use EOS as the padding token when the tokenizer does not define one.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): the original cell kept `torch_dtype=dtype` as a commented-out
# alternative to `dtype=dtype`, presumably for older transformers releases.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HF_TOKEN,
    dtype=dtype,
    device_map="auto" if use_cuda else None,
)
# Without a device_map the model is placed explicitly.
if not use_cuda:
    model.to(device)
# Switch the model to evaluation mode.
model.eval()

# --- Prompt encoding ---
prompt = "Once upon a time"
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding=False,
    truncation=True,
)
# Move the inputs to the device the model reports for itself: with
# device_map="auto" the placement is chosen by the loader, so the hand-built
# `device` above is not guaranteed to match it.
target_device = model.device
inputs = {k: v.to(target_device) for k, v in inputs.items()}

# --- Generation ---
gen_kwargs = dict(
    max_new_tokens=200,                 # generate 200 new tokens (recommended over max_length)
    do_sample=True,                     # sampling (works with temperature/top_p)
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# Run generation under torch's inference mode.
with torch.inference_mode():
    output_ids = model.generate(**inputs, **gen_kwargs)

# --- Decode and print ---
text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"\n--- Generated text ---\n{text}")

  from .autonotebook import tqdm as notebook_tqdm


Torch version: 2.9.1+cu128
Device: cuda
GPU: NVIDIA GeForce RTX 3060

--- Generated text ---
Once upon a time there was an old man, who had a daughter. They lived in the middle of nowhere, and the old man couldn't afford to send her to school, so he made his own books.
His first book was about the moon. It was called Moon: The Story of Our Friend. He wrote it from memory, and read it aloud to his little girl every night.
Then one day he started writing another story. This one was about a boy named John Smith, who loved horses very much. His horse's name was Jack. And when they were out riding together, they saw something strange on the horizon. A great white ship appeared, and they knew that they must hurry back home before their enemies found them.
As they got closer, they saw more and more people crowded onto the deck, waving flags and shouting for joy.
John and Jack were amazed at the sight. When they reached land again, they asked their father if he could tell them what this meant.

$\checkmark$