# Environment Setup
Sets up the environment variable for Hugging Face file transfer protocol. 

This ensures efficient model downloads using the newer transfer protocol.

In [1]:
# Environment Setup

# Import the os module to interact with the operating system
import os

# Set the environment variable to enable Hugging Face file transfer protocol
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Model Download

Imports and uses the custom utility function to load model configurations from settings. 

This determines which models need to be downloaded.

In [2]:
# Model Configuration Loading

# Import the custom utility function to load model configurations
from frames.utils.settings import load_models

# Load the model configurations
model_configs = load_models()

# Iterate through the model IDs and print the download message for each model
for model_id in model_configs["id"]:
    print(f"Running download for: {model_id}")
    # Use the Hugging Face CLI to download the model, excluding files with the .gguf extension
    os.system(f"huggingface-cli download {model_id} --exclude=*.gguf")

Running download for: meta-llama/Llama-3.2-11B-Vision-Instruct


Downloading '.gitattributes' to '/home/pedro/.cache/huggingface/hub/models--meta-llama--Llama-3.2-11B-Vision-Instruct/blobs/a6344aac8c09253b3b630fb776ae94478aa0275b.incomplete'
Download complete. Moving file to /home/pedro/.cache/huggingface/hub/models--meta-llama--Llama-3.2-11B-Vision-Instruct/blobs/a6344aac8c09253b3b630fb776ae94478aa0275b
Downloading 'LICENSE.txt' to '/home/pedro/.cache/huggingface/hub/models--meta-llama--Llama-3.2-11B-Vision-Instruct/blobs/085b47c1575cb889b7024030e60b78f54f0b8c9e.incomplete'
Download complete. Moving file to /home/pedro/.cache/huggingface/hub/models--meta-llama--Llama-3.2-11B-Vision-Instruct/blobs/085b47c1575cb889b7024030e60b78f54f0b8c9e
Downloading 'README.md' to '/home/pedro/.cache/huggingface/hub/models--meta-llama--Llama-3.2-11B-Vision-Instruct/blobs/bec38d534f6fa603cf50aead7bfc27429a5d826a.incomplete'
Download complete. Moving file to /home/pedro/.cache/huggingface/hub/models--meta-llama--Llama-3.2-11B-Vision-Instruct/blobs/bec38d534f6fa603cf50

/home/pedro/.cache/huggingface/hub/models--meta-llama--Llama-3.2-11B-Vision-Instruct/snapshots/9eb2daaa8597bf192a8b0e73f848f3a102794df5
Running download for: google/paligemma2-3b-pt-224-keras
/home/pedro/.cache/huggingface/hub/models--google--paligemma2-3b-pt-224-keras/snapshots/fd72b019f62fab913cc0ce2e5b6d413b8d9fc61f


## NLTK Download

Downloads all of NLTK for offline usage

In [3]:
!python -m nltk.downloader all

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/pedro/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/pedro/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/pedro/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/pedro/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/pedro/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]   

## TikToken tokenizer offline issue fix

In [4]:
import hashlib

import requests

# Define the blob URL and blob path
blob_url = "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"

# Compute the SHA-1 hash of the blob path
hash_filename = hashlib.sha1(blob_url.encode()).hexdigest()

# Define the target directory using pathlib
target_dir = os.environ["TIKTOKEN_CACHE_DIR"]
target_dir.mkdir(parents=True, exist_ok=True)

# Full path to save the file
file_path = target_dir / hash_filename

# Download the file
response = requests.get(blob_url)
response.raise_for_status()  # Ensure the download was successful

# Save the file
file_path.write_bytes(response.content)

print(f"File downloaded and saved to {file_path}")

KeyError: 'TIKTOKEN_CACHE_DIR'

In [None]:
import tiktoken

tiktoken.get_encoding("cl100k_base")