In [1]:
# Welcome to MyTorch!
# This Notebook demonstrates how easy it is to use our cloud-based GPUs.
# Notice that the code uses standard PyTorch syntax; the virtual environment simply has MyTorch installed in place of PyTorch.
# That's how easy it is to use MyTorch.
#
# Setup Instructions:
# 1) Register at MyTorch.net and get your access token, then save it to ~/.mytorch
#    > echo "token=xxx" > ~/.mytorch
# 2) Set Up a Clean Virtual Environment
#    Create and activate a new virtual environment to isolate your MyTorch installation:
#    > python3 -m venv ~/venv_mytorch
#    > source ~/venv_mytorch/bin/activate
# 3) Install MyTorch and Jupyter
#    > pip install --upgrade pip
#    > pip install --upgrade mytorch-ai
#    > pip install jupyterlab ipykernel jupyter
# 4) Register the Virtual Environment as a Jupyter Kernel
#    This makes your new environment available as a kernel named "mytorch (venv)" in Jupyter:
#    > python -m ipykernel install --user --name mytorch_env --display-name "mytorch (venv)"
# 5) Launch Jupyter Notebook, open this notebook, and select the "mytorch (venv)" kernel
#    > jupyter notebook

import torch

# Show which version the `torch` import resolved to in this kernel.
print(f"MyTorch version: {torch.__version__}")

MyTorch version: 0.3.0


In [None]:
# Step One: Confirm that MyTorch is working, GPU should be available.
# Query availability once so that `device` and the status message below
# are always derived from the same answer.
gpu_available = torch.cuda.is_available()
device = "cuda" if gpu_available else "cpu"
if gpu_available:
    print(f"*** Using GPU: {torch.cuda.get_device_name()} ***")
else:
    print("*** No GPU available ***")

INFO - Client process is exiting; disconnecting from server...
INFO - Connecting to server proxy.mytorch.net:50051


In [None]:
# Step Two: Confirm that the MyTorch Token is saved in the correct place.
import os

# The setup instructions write the token to a file named `.mytorch`
# directly inside the user's home directory.
home_dir = os.path.expanduser("~")
mytorch_token_file = os.path.join(home_dir, ".mytorch")
print("Does ~/.mytorch exist?", os.path.exists(mytorch_token_file))

In [None]:
# Step Three: Log in to Hugging Face
# Uses the HF_TOKEN environment variable when it is set; otherwise checks
# that a token file saved by a previous CLI login is present.
import os

import huggingface_hub

huggingface_token = os.getenv("HF_TOKEN")
if huggingface_token:
    # A non-empty token in the environment takes priority.
    huggingface_hub.login(token=huggingface_token)
else:
    # Look for the saved token file. HF_TOKEN_PATH and HF_HOME override the
    # default location (~/.cache/huggingface/token) when they are set.
    hf_home = os.path.expanduser(
        os.getenv("HF_HOME") or os.path.join("~", ".cache", "huggingface")
    )
    hf_token_path = os.path.expanduser(
        os.getenv("HF_TOKEN_PATH") or os.path.join(hf_home, "token")
    )
    if os.path.exists(hf_token_path):
        print("You are already logged into Hugging Face, which makes me happy!!!")
    else:
        # Raise instead of calling exit(): exit() is meant for ending an
        # interpreter session, not for reporting an error from a notebook
        # cell. An exception stops "Run All" here with a readable message.
        raise RuntimeError(
            "*** You must either set the environment variable HF_TOKEN or \n"
            "*** log in to Hugging Face using the CLI command `huggingface-cli login`"
        )

In [None]:
# Step Four: Load the Llama 3.2 3B Instruct model, then run your prompt:
# Note this is one step because we do not want to use GPU memory for longer than necessary.
import sys
import time
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer

# Tunable settings for this step.
MODEL_ID = "meta-llama/Llama-3.2-3B-Instruct"
MAX_NEW_TOKENS = 1000  # Budget for the generated text only (the prompt is not counted)

print(f"Loading tokenizer for {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token

# `model` always refers to the loaded model object (never the id string),
# so the clean-up step can release it unambiguously.
print(f"Loading model for {MODEL_ID}...")
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
print("Model loaded")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading model onto {device}")
model.to(device)
print("Model moved to device")

start_time = time.time()

# Prepare the prompt
prompt = "Tell me a 500 word story about a black cat named George"
print("Prompt: ", prompt)

# Let the tokenizer apply the model's own chat template instead of
# hand-writing Llama 2 style "[INST] ... [/INST]" markers, which do not match
# a Llama 3.2 Instruct model. No padding is requested: there is a single
# prompt, and padding it out to a fixed length would make the decoder-only
# model continue from a run of pad tokens.
# NOTE(review): assumes the tokenizer exposes the standard Transformers
# `apply_chat_template` API — confirm this is available under MyTorch.
messages = [{"role": "user", "content": prompt}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # End the prompt with the assistant header
    return_dict=True,  # Return both input_ids and attention_mask
    return_tensors="pt",  # Return PyTorch tensors
)

# Move each tensor in the inputs dictionary to the correct device
inputs = {k: v.to(device) for k, v in inputs.items()}
prompt_length = inputs["input_ids"].shape[-1]

print("\nGenerating response...")

# Generate the response
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=MAX_NEW_TOKENS,  # Limits new tokens only, regardless of prompt length
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode only the newly generated tokens. The output sequence starts with the
# prompt tokens, so slicing them off is more reliable than removing the prompt
# text from the decoded string afterwards.
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
end_time = time.time()

print()
print(f"Prompt: {prompt}")
print()
print("Response:")
print(response)
print()
print(f"Time taken: {end_time - start_time:.2f} seconds")

In [None]:
# Step Five: Free up resources
import gc
import torch

# Drop the notebook-level references created in Step Four. Using pop() with a
# default instead of `del` keeps this cell safe to re-run, and safe to run
# when Step Four failed part-way and some of these names were never created.
for _name in ("model", "tokenizer", "inputs", "outputs"):
    globals().pop(_name, None)
del _name

# Collect the now-unreferenced objects, then ask torch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()
print("GPU memory freed.")