# Environment Setup

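**Before running this notebook, define the required environment variables (such as `TIKTOKEN_CACHE_DIR`, which the TikToken section reads) in a `.env` file.**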

# Model Download

Loads the model configurations from the project settings with the custom `load_models` utility function.

The model IDs in that configuration determine which models are downloaded from the Hugging Face Hub.

In [2]:
# Model download: fetch every configured model with the Hugging Face CLI
import os
import subprocess

# Import the custom utility function to load model configurations
from frames.utils.settings import load_models

# Load the model configurations; the "id" entry is iterated below and each
# item is passed to the CLI as the repository to download
model_configs = load_models()

# File patterns to skip while downloading
EXCLUDE_PATTERNS = ["*.gguf", "*.pth"]

failed_downloads = []
for model_id in model_configs["id"]:
    print(f"Running download for: {model_id}")
    # Arguments are passed as a list (no shell), so the glob patterns reach
    # the CLI verbatim instead of being subject to shell expansion. Each
    # pattern is its own argument: `--exclude` takes separate patterns, so the
    # previous comma-joined "*.gguf,*.pth" was treated as one single pattern
    # that matched neither file type.
    result = subprocess.run(
        ["huggingface-cli", "download", str(model_id), "--exclude", *EXCLUDE_PATTERNS]
    )
    # Keep going so every model is attempted, but remember the failures
    if result.returncode != 0:
        failed_downloads.append(model_id)

# Surface failures instead of silently ignoring the CLI's exit status
if failed_downloads:
    raise RuntimeError(f"Download failed for: {failed_downloads}")

Running download for: meta-llama/Llama-3.1-8B-Instruct
/home/CVLABPJ/pvalois/.cache/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659
Running download for: google/gemma-2-9b-it
/home/CVLABPJ/pvalois/.cache/huggingface/hub/models--google--gemma-2-9b-it/snapshots/11c9b309abf73637e4b6f9a3fa1e92e615547819
Running download for: microsoft/Phi-3-medium-128k-instruct
/home/CVLABPJ/pvalois/.cache/huggingface/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/fa7d2aa4f5ea69b2e36b20d050cdae79c9bfbb3f


## NLTK Download

Downloads every NLTK data package so that NLTK can be used offline.

In [3]:
!python -m nltk.downloader all

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/CVLABPJ/pvalois/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/CVLABPJ/pvalois/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/CVLABPJ/pvalois/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/CVLABPJ/pvalois/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/CVLABPJ/pvalois/nltk_data...
[nltk_data]    |   Package averaged_perceptron

## TikToken tokenizer offline issue fix

In [6]:
import hashlib
import os
from pathlib import Path

import requests

# URL of the cl100k_base encoding file to pre-download
blob_url = "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"

# The saved file is named after the SHA-1 hex digest of the blob URL
hash_filename = hashlib.sha1(blob_url.encode()).hexdigest()

# Resolve the target directory from the environment, failing with an
# actionable message rather than a bare KeyError when it is not configured
cache_dir_env = os.environ.get("TIKTOKEN_CACHE_DIR")
if not cache_dir_env:
    raise KeyError(
        "TIKTOKEN_CACHE_DIR is not set; define it (e.g. in the .env file) "
        "before running this cell"
    )
target_dir = Path(cache_dir_env)
target_dir.mkdir(parents=True, exist_ok=True)

# Full path to save the file
file_path = target_dir / hash_filename

# Download the file; the timeout keeps the cell from hanging indefinitely
# if the server stops responding
response = requests.get(blob_url, timeout=60)
response.raise_for_status()  # Ensure the download was successful

# Save the file
file_path.write_bytes(response.content)

print(f"File downloaded and saved to {file_path}")

File downloaded and saved to /home/CVLABPJ/pvalois/.cache/tiktoken/9b5ad71b2ce5302211f9c61530b329a4922fc6a4


In [8]:
import tiktoken

# Sanity check: load the encoding and display it as the cell result.
# NOTE(review): presumably this is now served from the file saved under
# TIKTOKEN_CACHE_DIR by the previous cell — confirm by running offline.
cl100k_encoding = tiktoken.get_encoding("cl100k_base")
cl100k_encoding

<Encoding 'cl100k_base'>