# Environment Setup
Sets the `HF_HUB_ENABLE_HF_TRANSFER` environment variable so that Hugging Face Hub downloads go through the `hf_transfer` backend.

This speeds up downloads of large model files; it must be set before any download starts.

In [1]:
# Environment Setup

# Standard-library access to the process environment
import os

# Export the flag that switches Hugging Face downloads to the hf_transfer
# backend; it has to be in the environment before any download is launched
os.environ.update({"HF_HUB_ENABLE_HF_TRANSFER": "1"})

# Model Download

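Loads the model configurations from the project settings using the custom `load_models` utility function.

The configured model IDs determine which models are downloaded; each one is fetched with the Hugging Face CLI, skipping `.gguf` files.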

In [None]:
# Model Configuration Loading

# subprocess runs the Hugging Face CLI without going through a shell
import subprocess

# Import the custom utility function to load model configurations
from frames.utils.settings import load_models

# Load the model configurations
model_configs = load_models()

# Download every configured model, remembering the ones that fail so the cell
# cannot finish "successfully" while some downloads are missing
failed_downloads = []
for model_id in model_configs["id"]:
    print(f"Running download for: {model_id}")
    # Argument-list form (no shell): the model id is never interpreted as
    # shell syntax and the *.gguf pattern reaches the CLI unexpanded.
    # The child process inherits os.environ, including HF_HUB_ENABLE_HF_TRANSFER.
    completed = subprocess.run(
        ["huggingface-cli", "download", str(model_id), "--exclude=*.gguf"],
        check=False,  # keep going so one bad model does not block the rest
    )
    if completed.returncode != 0:
        failed_downloads.append(str(model_id))

# Surface failures explicitly instead of silently ignoring the exit status
if failed_downloads:
    raise RuntimeError(f"Download failed for: {', '.join(failed_downloads)}")

Running download for: hugging-quants/Meta-Llama-3.1-70B-BNB-NF4-BF16
/home/pedro/.cache/huggingface/hub/models--hugging-quants--Meta-Llama-3.1-70B-BNB-NF4-BF16/snapshots/befe3578a6bdfc80cc702b69706c10e41659be45
Running download for: hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4


Removing incomplete file '/home/pedro/.cache/huggingface/hub/models--hugging-quants--Meta-Llama-3.1-70B-Instruct-AWQ-INT4/blobs/f1e5572d6b0bb1a5bf71cce73c069cffa03dd0d1fbe38bb2b85056e50dd76476.incomplete' (hf_transfer=True)
Downloading 'model-00001-of-00009.safetensors' to '/home/pedro/.cache/huggingface/hub/models--hugging-quants--Meta-Llama-3.1-70B-Instruct-AWQ-INT4/blobs/f1e5572d6b0bb1a5bf71cce73c069cffa03dd0d1fbe38bb2b85056e50dd76476.incomplete'


## NLTK Download

Downloads every NLTK data package so that NLTK can be used offline.

In [None]:
# Run the NLTK downloader module with the "all" identifier so every data
# package is fetched up front for offline use.
# NOTE(review): `python` is resolved via PATH and may not be the kernel's
# interpreter — confirm the data lands where this kernel's NLTK looks for it.
!python -m nltk.downloader all

## TikToken tokenizer offline issue fix

In [None]:
import hashlib
import os
from pathlib import Path

import requests

# URL of the cl100k_base encoding file that tiktoken would fetch at runtime
blob_url = "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"

# tiktoken names its cache entries after the SHA-1 hex digest of the blob URL,
# so the file must be stored under exactly this name to be found offline
hash_filename = hashlib.sha1(blob_url.encode()).hexdigest()

# The environment variable holds a plain string; wrap it in Path so that
# mkdir(), the "/" operator and write_bytes() below are available.
# Raises KeyError if TIKTOKEN_CACHE_DIR is not set.
target_dir = Path(os.environ["TIKTOKEN_CACHE_DIR"])
target_dir.mkdir(parents=True, exist_ok=True)

# Full path to save the file
file_path = target_dir / hash_filename

# Download the file; the timeout keeps the cell from hanging forever on a
# stalled connection
response = requests.get(blob_url, timeout=60)
response.raise_for_status()  # Ensure the download was successful

# Save the file
file_path.write_bytes(response.content)

print(f"File downloaded and saved to {file_path}")

In [None]:
import tiktoken

# Request the cl100k_base encoding; the returned object is displayed as the
# cell output.
# NOTE(review): presumably a smoke test that the file cached in the preceding
# cell is picked up from TIKTOKEN_CACHE_DIR — confirm by running offline.
tiktoken.get_encoding("cl100k_base")