In [2]:
from textwrap import TextWrapper

def wrap_print_text(print):
    """Return a drop-in replacement for ``print`` that wraps output at 80 columns.

    Adapted from: https://stackoverflow.com/questions/27621655/how-to-overload-print-function-to-expand-its-functionality/27621927

    Args:
        print: The callable to delegate to (normally the built-in ``print``).
            It is called with the wrapped text as its single positional
            argument, plus any keyword arguments the caller supplied.

    Returns:
        A function that accepts the same positional arguments and ``sep``
        keyword as the built-in ``print``. The arguments are converted with
        ``str``, joined with ``sep``, wrapped line by line, and forwarded.
        Other keyword arguments (``end``, ``file``, ``flush``) pass through
        untouched. If ``print`` was already produced by this function it is
        returned unchanged.
    """
    # Re-running the notebook cell would otherwise wrap the wrapper again.
    if getattr(print, "_wraps_text", False):
        return print

    # One wrapper is enough; its settings never change between calls.
    wrapper = TextWrapper(
        width=80,
        break_long_words=True,
        break_on_hyphens=False,
        replace_whitespace=False,
    )

    def wrapped_func(*args, sep=" ", **kwargs):
        if sep is None:  # the built-in print treats sep=None as a single space
            sep = " "
        text = sep.join(str(arg) for arg in args)
        # Wrap each existing line separately so explicit newlines are preserved.
        wrapped = "\n".join(wrapper.fill(line) for line in text.split("\n"))
        return print(wrapped, **kwargs)

    wrapped_func._wraps_text = True
    return wrapped_func

# Rebind the notebook-level `print` so that output printed by later cells
# goes through wrap_print_text (wrapped at 80 columns).
print = wrap_print_text(print)

To see a list of the models currently available through the OpenAI API, run the cell below.

In [2]:
from openai import OpenAI

# No API key is passed explicitly here.
# NOTE(review): presumably the client reads OPENAI_API_KEY from the environment — confirm.
client = OpenAI()

# Fetch the list of models
list_of_models = client.models.list()

# Extract model IDs and sort them alphabetically
model_ids = sorted(model.id for model in list_of_models)

# Print the model IDs (one per line)
for model_id in model_ids:
    print(model_id)


babbage-002
chatgpt-4o-latest
dall-e-2
dall-e-3
davinci-002
gpt-3.5-turbo
gpt-3.5-turbo-0125
gpt-3.5-turbo-1106
gpt-3.5-turbo-16k
gpt-3.5-turbo-instruct
gpt-3.5-turbo-instruct-0914
gpt-4
gpt-4-0125-preview
gpt-4-0613
gpt-4-1106-preview
gpt-4-turbo
gpt-4-turbo-2024-04-09
gpt-4-turbo-preview
gpt-4o
gpt-4o-2024-05-13
gpt-4o-2024-08-06
gpt-4o-2024-11-20
gpt-4o-audio-preview
gpt-4o-audio-preview-2024-10-01
gpt-4o-audio-preview-2024-12-17
gpt-4o-mini
gpt-4o-mini-2024-07-18
gpt-4o-mini-audio-preview
gpt-4o-mini-audio-preview-2024-12-17
gpt-4o-mini-realtime-preview
gpt-4o-mini-realtime-preview-2024-12-17
gpt-4o-realtime-preview
gpt-4o-realtime-preview-2024-10-01
gpt-4o-realtime-preview-2024-12-17
o1-mini
o1-mini-2024-09-12
o1-preview
o1-preview-2024-09-12
omni-moderation-2024-09-26
omni-moderation-latest
text-embedding-3-large
text-embedding-3-small
text-embedding-ada-002
tts-1
tts-1-1106
tts-1-hd
tts-1-hd-1106
whisper-1


In [3]:
import os
import gc
import torch
from openai import OpenAI
from transformers import AutoModelForCausalLM, AutoTokenizer
import re

from transformers.utils import logging
logging.set_verbosity_error()

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

LOADED_MODELS = {}  # Track loaded models
LOADED_TOKENIZERS = {}  # Track tokenizers

def release_model():
    """Unload every locally loaded model (and its tokenizer) from memory and GPU.

    Each entry is removed from LOADED_MODELS / LOADED_TOKENIZERS, moved to the
    CPU on a best-effort basis, and then dropped so the garbage collector and
    the CUDA caching allocator can reclaim the memory. Prints a status line per
    model, or a notice when nothing is loaded.

    Returns:
        None
    """
    if not LOADED_MODELS:
        # Drop any tokenizer left behind without a matching model.
        LOADED_TOKENIZERS.clear()
        print("❌ No model to unload.")
        return

    # Iterate over a snapshot of the keys because the registry is mutated in the loop.
    for model_str in list(LOADED_MODELS):
        print(f"🛑 Unloading model: {model_str} from GPU...")

        model = LOADED_MODELS.pop(model_str)  # Remove model from registry
        LOADED_TOKENIZERS.pop(model_str, None)  # Tokenizer entry may be missing

        # Moving to CPU releases GPU memory even if another object still holds
        # a reference to the model. Some models refuse `.to()`; that must not
        # abort the unload, so a failure is reported and ignored.
        try:
            model.to("cpu")
        except Exception as exc:
            print(f"⚠️ Could not move {model_str} to CPU before deletion: {exc}")

        # Drop the local reference *before* collecting, then release cached blocks.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        print(f"✅ Model {model_str} has been fully unloaded.")

    # Drop any tokenizer left behind without a matching model.
    LOADED_TOKENIZERS.clear()


class ModelConfig:
    """Configuration object to store model, tokenizer, and API client settings.

    Args:
        model_str: Model identifier (Hugging Face name/path or API model name).
        model: Loaded local model, or None for API-backed configurations.
        tokenizer: Tokenizer matching ``model``, or None for API-backed configurations.
        api_type: "openai" or "gemini" for API-backed models; None for local models.
        cost_per_M_input: Cost per million input tokens (optional).
        cost_per_M_output: Cost per million output tokens (optional).

    Raises:
        ValueError: If ``api_type`` is "openai" or "gemini" and the matching
            API key environment variable (OPENAI_API_KEY / GEMINI_API_KEY) is unset.
    """
    def __init__(self, model_str, model=None, tokenizer=None, api_type=None, cost_per_M_input=None, cost_per_M_output=None):
        self.model_str = model_str
        self.model = model
        self.tokenizer = tokenizer
        self.api_type = api_type  # Identifies API model type
        self.client = None  # API client instance (if applicable)
        self.cost_per_M_input = cost_per_M_input  # Cost per million input tokens
        self.cost_per_M_output = cost_per_M_output  # Cost per million output tokens

        # Initialize API client if applicable
        if api_type in ["openai", "gemini"]:
            api_key = os.getenv("GEMINI_API_KEY" if api_type == "gemini" else "OPENAI_API_KEY")
            if not api_key:
                raise ValueError(f"Missing {api_type.upper()} API key. Set the appropriate environment variable.")

            # Gemini is reached through its OpenAI-compatible endpoint, so the
            # same client class serves both providers.
            base_url = "https://generativelanguage.googleapis.com/v1beta/openai/" if api_type == "gemini" else None
            self.client = OpenAI(api_key=api_key, base_url=base_url)

    def unload(self):
        """Unload this configuration's local model and tokenizer from memory and GPU.

        This is a no-op for configurations that hold no local model (for
        example API-backed ones), so unloading an API config never evicts an
        unrelated local model. Afterwards ``self.model`` and ``self.tokenizer``
        are None, so this object no longer keeps the weights alive.
        """
        had_local_model = self.model is not None

        # Drop our own references first; otherwise they would keep the model
        # alive after it is removed from the global registry.
        self.model = None
        self.tokenizer = None

        if not had_local_model:
            return

        if self.model_str in LOADED_MODELS:
            release_model()
        else:
            # Already evicted from the registry (e.g. another model was loaded
            # since); just reclaim what our references were holding.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

def llm_configure(model_str, cost_per_M_input=None, cost_per_M_output=None):
    """Build a ModelConfig for an API model or a local Hugging Face model.

    API models (the OpenAI and Gemini names listed below) only get a client;
    nothing is loaded locally. For any other name the model is loaded with
    ``AutoModelForCausalLM``. Only one local model is kept at a time: an
    already-loaded model with the same name is reused, and a different one is
    unloaded first.

    Args:
        model_str: API model name or Hugging Face model name/path.
        cost_per_M_input: Cost per million input tokens (stored on API configs only).
        cost_per_M_output: Cost per million output tokens (stored on API configs only).

    Returns:
        A ModelConfig, or None if loading the local model or tokenizer failed
        (the error is printed).
    """
    openai_models = ("gpt-4o", "gpt-4o-mini", "o1-mini", "o3-mini")
    gemini_models = ("gemini-2.0-flash-lite-preview-02-05", "gemini-2.0-flash")

    if model_str in openai_models:
        return ModelConfig(model_str, api_type="openai", cost_per_M_input=cost_per_M_input, cost_per_M_output=cost_per_M_output)

    if model_str in gemini_models:
        return ModelConfig(model_str, api_type="gemini", cost_per_M_input=cost_per_M_input, cost_per_M_output=cost_per_M_output)

    # Reuse the model if it is already loaded
    if model_str in LOADED_MODELS and model_str in LOADED_TOKENIZERS:
        print(f"✅ Found existing loaded model: {model_str}")
        return ModelConfig(model_str, LOADED_MODELS[model_str], LOADED_TOKENIZERS[model_str])

    if LOADED_MODELS:
        # Unload the currently loaded model before loading the new one
        release_model()

        # Ensure memory is cleared before loading the new model. Collect first
        # so the allocator can actually release the freed blocks.
        gc.collect()
        torch.cuda.empty_cache()

    # Debug: Check free GPU memory
    print(f"🔍 Checking GPU memory before loading {model_str}...")
    print(f"    🔸 Reserved memory: {torch.cuda.memory_reserved()} bytes")
    print(f"    🔸 Allocated memory: {torch.cuda.memory_allocated()} bytes")

    # Load the new model
    print(f"🚀 Loading model: {model_str} (this may take a while)...")
    model = None
    try:
        # Tokenizer first: it is cheap, and a bad model name then fails before
        # any weights are placed on the GPU.
        tokenizer = AutoTokenizer.from_pretrained(model_str)

        # llm_prompt tokenizes with padding=True, which needs a pad token.
        # Decoder-only models should be left-padded so generation continues
        # from real tokens rather than from padding.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"

        model = AutoModelForCausalLM.from_pretrained(
            model_str,
            torch_dtype=torch.float16,
            device_map="auto",
        )
    except Exception as e:
        print(f"❌ Error loading model {model_str}: {str(e)}")
        # Release anything that was partially allocated before the failure.
        model = None
        gc.collect()
        torch.cuda.empty_cache()
        return None

    # Store new model and tokenizer in global registry
    LOADED_MODELS[model_str] = model
    LOADED_TOKENIZERS[model_str] = tokenizer

    return ModelConfig(model_str, model, tokenizer)



def clean_response(response, prompt=None):
    """Strip the echoed prompt, a leading "assistant" label, and blank lines.

    Args:
        response: Raw text returned or decoded from the model.
        prompt: The user prompt that produced ``response``. When given and
            found in ``response``, everything up to and including its first
            occurrence is removed. Optional, so text that contains no prompt
            echo can still be cleaned.

    Returns:
        The cleaned text with blank lines removed ("" for empty/None input).
    """
    if not response:
        return ""

    # Skip prompt stripping for an empty/None prompt: an empty pattern would
    # let the first regex latch onto any ". assistant" inside the answer.
    if prompt:
        escaped_prompt = re.escape(prompt)
        # Prefer the "<prompt> . assistant" marker; fall back to the bare prompt.
        match = re.search(escaped_prompt + r"\s*\.\s*assistant", response) or re.search(escaped_prompt, response)
        if match:
            response = response[match.end():].strip()

    # Remove a leading role label. The word boundary keeps answers that merely
    # start with a longer word such as "Assistants" intact.
    response = re.sub(r"^\s*assistant\b\s*", "", response, flags=re.IGNORECASE).strip()

    # Remove blank lines
    return "\n".join(line for line in response.split("\n") if line.strip())

def _llm_prompt_api(model_config, prompt_list, max_new_tokens, temperature, estimate_cost, system_prompt):
    """Send each prompt to the chat-completions API and return the cleaned replies."""
    responses = []
    num_input_tokens = 0
    num_output_tokens = 0

    for prompt in prompt_list:
        response = model_config.client.chat.completions.create(
            model=model_config.model_str,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt},
            ],
            temperature=temperature,
            max_tokens=max_new_tokens,
        )

        # `content` can be None, so fall back to an empty string.
        response_text = (response.choices[0].message.content or "").strip()
        responses.append(clean_response(response_text, prompt))

        usage = getattr(response, "usage", None)
        if usage is not None:
            num_input_tokens += usage.prompt_tokens or 0
            num_output_tokens += usage.completion_tokens or 0

    if estimate_cost:
        if model_config.cost_per_M_input is None or model_config.cost_per_M_output is None:
            # Without prices there is nothing to multiply; report it rather
            # than failing after the requests have already been paid for.
            print("⚠️ Cost estimate skipped: cost_per_M_input / cost_per_M_output are not set on the model config.")
        else:
            total_cost = ((num_input_tokens / 1_000_000) * model_config.cost_per_M_input) + \
                         ((num_output_tokens / 1_000_000) * model_config.cost_per_M_output)
            print(f"💰 Estimated Cost: ${total_cost:.6f} (Input: {num_input_tokens} tokens, Output: {num_output_tokens} tokens)")

    return responses


def _build_generation_kwargs(tokenizer, max_new_tokens, temperature, search_strategy, top_k, top_p, num_beams):
    """Translate llm_prompt's decoding options into keyword arguments for `model.generate`."""
    # Stop on the tokenizer's EOS token and, when the vocabulary really has
    # one, on "<|eot_id|>". A missing token resolves to None or the unknown-token
    # id, neither of which may be used as a stop id.
    terminators = []
    if tokenizer.eos_token_id is not None:
        terminators.append(tokenizer.eos_token_id)
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    if isinstance(eot_id, int) and eot_id != getattr(tokenizer, "unk_token_id", None) and eot_id not in terminators:
        terminators.append(eot_id)

    # Sampling with a non-positive temperature is invalid; fall back to
    # deterministic decoding in that case.
    sampling = temperature is not None and temperature > 0

    gen_kwargs = {
        "max_new_tokens": max_new_tokens,  # Limit response length
        "repetition_penalty": 1.2,
        "num_beams": num_beams,  # Limit beams to prevent verbosity
        "do_sample": sampling,
    }
    if terminators:
        gen_kwargs["eos_token_id"] = terminators  # Ensure early stopping

    if sampling:
        gen_kwargs["temperature"] = temperature
        if search_strategy == "top_k":
            gen_kwargs["top_k"] = top_k
        elif search_strategy == "top_p":
            gen_kwargs["top_p"] = top_p
        elif search_strategy == "contrastive":
            # NOTE(review): with do_sample=True this is top-k sampling with
            # penalty_alpha set, kept as in the original; confirm whether true
            # contrastive search (do_sample=False) was intended.
            gen_kwargs.update({"penalty_alpha": 0.6, "top_k": 4})

    return gen_kwargs


def _llm_prompt_local(model_config, prompt_list, max_new_tokens, temperature,
                      search_strategy, top_k, top_p, num_beams, system_prompt):
    """Generate replies with the local Hugging Face model held by `model_config`."""
    tokenizer = model_config.tokenizer
    model = model_config.model

    # Batch tokenization for efficiency. Both branches produce a mapping with
    # input_ids and attention_mask so padded positions are masked in generate().
    if getattr(tokenizer, "chat_template", None):
        conversations = [
            [{"role": "system", "content": system_prompt}, {"role": "user", "content": p}]
            for p in prompt_list
        ]
        inputs = tokenizer.apply_chat_template(
            conversations,
            add_generation_prompt=True,  # open the assistant turn for the model
            return_tensors="pt",
            return_dict=True,
            padding=True,
            truncation=True,
        )
    else:
        inputs = tokenizer(prompt_list, return_tensors="pt", padding=True, truncation=True)
    inputs = inputs.to(model.device)

    gen_kwargs = _build_generation_kwargs(
        tokenizer, max_new_tokens, temperature, search_strategy, top_k, top_p, num_beams
    )

    with torch.no_grad():
        output = model.generate(**inputs, **gen_kwargs)
    decoded = tokenizer.batch_decode(output, skip_special_tokens=True)

    # The decoded text still contains the prompt; clean_response removes it.
    return [clean_response(text, prompt) for text, prompt in zip(decoded, prompt_list)]


def llm_prompt(model_config, prompts, max_new_tokens=200, temperature=0.7,
               search_strategy="top_p", top_k=50, top_p=0.9, num_beams=1,
               estimate_cost=False, system_prompt="You are an AI assistant that provides brief answers."):
    """
    Generates a response from an LLM using a provided ModelConfig object.
    Supports OpenAI API (including Gemini) and local Hugging Face models.

    Args:
        model_config: ModelConfig returned by llm_configure (None yields an error message).
        prompts: A single prompt string, or a list/tuple of prompt strings (batch mode).
        max_new_tokens: Maximum number of tokens to generate per response.
        temperature: Sampling temperature; for local models a value <= 0
            selects deterministic decoding.
        search_strategy: "top_p", "top_k" or "contrastive" (local models only).
        top_k: Value used when search_strategy is "top_k" (local models only).
        top_p: Value used when search_strategy is "top_p" (local models only).
        num_beams: Number of beams for generation (local models only).
        estimate_cost: For API models, print an estimated cost based on the
            prices stored on ``model_config``.
        system_prompt: System message prepended to every prompt.

    Returns:
        A string for a single prompt, or a list of strings in prompt order for
        a batch. Configuration problems and API failures are reported as an
        error-message string instead of raising.
    """
    if model_config is None:
        return "❌ Error: Invalid model configuration. Please check the model name."

    is_batch = isinstance(prompts, (list, tuple))
    prompt_list = list(prompts) if is_batch else [prompts]

    if model_config.api_type in ["openai", "gemini"]:
        try:
            responses = _llm_prompt_api(
                model_config, prompt_list, max_new_tokens, temperature, estimate_cost, system_prompt
            )
        except Exception as e:
            # Best-effort by design: the notebook prints the message instead of raising.
            return f"{model_config.api_type.capitalize()} API error: {str(e)}"
        return responses if is_batch else responses[0]

    # Handle Local Hugging Face models
    if model_config.model is None or model_config.tokenizer is None:
        return "❌ Error: Model or tokenizer is not properly initialized."

    if not prompt_list:
        return []  # an empty batch has nothing to tokenize or generate

    responses = _llm_prompt_local(
        model_config, prompt_list, max_new_tokens, temperature,
        search_strategy, top_k, top_p, num_beams, system_prompt
    )
    return responses if is_batch else responses[0]







In [5]:

# Specify the model name or path (uncomment one of the alternatives to switch models)
MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
#MODEL_NAME = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
#MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct-unsloth-bnb-4bit"

# Loads the model, or reuses it if it is already registered in LOADED_MODELS.
local_config = llm_configure(MODEL_NAME)
# NOTE: `system_prompt` is a notebook-level variable; the Gemini and OpenAI
# demo cells further down reuse it, so run this cell first.
system_prompt = "You are an AI assistant that provides brief, sarcastic answers and likes to sound like a drunk pirate."
response = llm_prompt(local_config, "Tell me three interesting facts about outer space.", system_prompt=system_prompt)
print(response)

# Uncomment to unload the local model when finished with it.
#local_config.unload()

✅ Found existing loaded model: unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit
(hiccup) Oh ho ho... ye be wantin' tae know 'bout the vastness o' outer space,
eh? Alrighty then, matey! (burp)
Fact #1: There be more stars in the universe than grains o' sand on all yer
beaches combined! Can't even begin t'dig it, lad!
Fact #2: The Andromeda galaxy's comin' straight fer us! No worries though; we
won't collide till about five billion years from now... plenty o' time t'drink
yerself into oblivion.
Fact #3: Space has no smell... which is probably fer the best, or I wouldn't
have any sense left by this point! Arrr! Now if ye'll excuse me, I need another
pint...


In [70]:
# Inspect the registry of locally loaded models (model name -> model object).
# The output is the full module repr of each model, which can be very long.
LOADED_MODELS

{'unsloth/Llama-3.2-3B-Instruct-unsloth-bnb-4bit': LlamaForCausalLM(
   (model): LlamaModel(
     (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
     (layers): ModuleList(
       (0): LlamaDecoderLayer(
         (self_attn): LlamaSdpaAttention(
           (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
           (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
           (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
           (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
           (rotary_emb): LlamaRotaryEmbedding()
         )
         (mlp): LlamaMLP(
           (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
           (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
           (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
           (act_fn): SiLU()
         )
         (input_layernorm): LlamaRMSNorm((30

In [41]:

# Prices are USD per 1M tokens for Gemini 2.0 Flash Lite (see the pricing table in this notebook).
# Relies on `system_prompt` defined in the local-model cell above.
gemini_config = llm_configure("gemini-2.0-flash-lite-preview-02-05", cost_per_M_input=0.075, cost_per_M_output=0.30)
response = llm_prompt(gemini_config, "Tell me three interesting facts about outer space.", estimate_cost=True, system_prompt=system_prompt)
print(response)

💰 Estimated Cost: $0.000044 (Input: 31.0 tokens, Output: 138.0 tokens)
Ahoy, matey! Here be yer facts, but don't be expectin' any treasure:

1.  **Space be completely silent.** Aye, ye can't hear a blasted thing, even if a kraken be roarin' at ye.
2.  **There be more stars than grains o' sand on all the beaches o' the world.** Makes ye feel small, don't it? Just like my rum supply after a long voyage.
3.  **Some planets be rainin' diamonds.** Fancy that! I'd trade me eye patch for a diamond shower, I would. Now, where's me grog?


In [39]:
# Prices are USD per 1M tokens for GPT-4o mini (see the pricing table in this notebook).
# Relies on `system_prompt` defined in the local-model cell above.
openai_config = llm_configure("gpt-4o-mini", cost_per_M_input=0.15, cost_per_M_output=0.60)
response = llm_prompt(openai_config, "Tell me three interesting facts about outer space.", estimate_cost=True, system_prompt=system_prompt)
print(response)

Arrr, ye be wantin' some space trivia, eh? Here ye go:

1. There be a whole lotta nothin’ out there, matey! Space is so vast that if ye tried to count all the stars, ye’d be countin’ for more years than ye got left to live, savvy?

2. That black hole thingamajig? It’s like a cosmic vacuum cleaner, suckin’ up everything in sight. So, if ye drop yer rum in one, consider it gone forever, yarrr!

3. Ye ever heard of a supernova? It’s when a star goes out with a bang, like a pirate’s last party before sinkin' to Davy Jones' locker. Talk about a dramatic exit, eh?


In [15]:
# Batch mode: passing a list of prompts returns a list of responses in the same order.
# No system_prompt is passed, so llm_prompt's default system prompt is used.
questions = ["What is the capital of France?", "What is the largest mammal?", "What is the speed of light?"]
responses = llm_prompt(local_config, questions)
print(responses)


['The capital of France is Paris.', 'The blue whale (Balaenoptera musculus) is
considered the largest mammal, with some individuals reaching lengths of up to
33 meters (108 feet).', 'The speed of light in a vacuum is approximately 299,792
kilometers per second (km/s) or about 186,282 miles per second.']


In [25]:
# Same batch through the Gemini API. llm_prompt issues one request per prompt
# and, with estimate_cost=True, prints one combined cost estimate.
questions = ["What is the capital of France?", "What is the largest mammal?", "What is the speed of light?"]
responses = llm_prompt(gemini_config, questions, estimate_cost=True)
print(responses)

💰 Estimated Cost: $0.000018 (Input: 56.0 tokens, Output: 47.0 tokens)
['Paris', 'The largest mammal is the blue whale.', 'The speed of light in a vacuum is approximately 299,792,458 meters per second (about 186,282 miles per second).']


As of February 7, 2025, OpenAI's API pricing for various models is as follows:

| Model           | Input Tokens (per 1M) | Output Tokens (per 1M) | Context Length | Modalities Supported |
|-----------------|-----------------------|------------------------|----------------|----------------------|
| **OpenAI o1**   | $15                   | $60                    | 200k           | Text and Vision      |
| **OpenAI o3-mini** | $1.10               | $4.40                  | 200k           | Text                 |
| **GPT-4o**      | $2.50                 | $10                    | 128k           | Text and Vision      |
| **GPT-4o mini** | $0.15                 | $0.60                  | 128k           | Text and Vision      |

These models offer varying capabilities and pricing structures to accommodate different application needs. For more detailed information, you can refer to OpenAI's official API [pricing page](https://openai.com/api/pricing/). 

As of February 11, 2025, Google's API pricing for its Gemini models is as follows:

| Model           | Input Tokens (per 1M) | Output Tokens (per 1M) | Context Length | Modalities Supported |
|-----------------|-----------------------|------------------------|----------------|----------------------|
| **Gemini 2.0 Flash**| $0.10             | $0.40                    | ???          | Text, Images, Video, Audio* |
| **Gemini 2.0 Flash Lite** | $0.075 | $0.30 | ??? | Text, Images, Video, Audio |

*Audio costs more: $0.70 per 1M tokens with Gemini 2.0 Flash.

The nice thing about the Gemini models is that they support free, limited API use for testing.  For Flash / Flash Lite, the free tier is limited to 30 / 15 requests per minute and 1500 / 1500 requests per day.  You can learn more about Gemini [pricing here](https://ai.google.dev/pricing#2_0flash).  





In [6]:
# Prices (USD per 1M tokens, GPT-4o mini) belong on the model config:
# llm_prompt has no cost parameters and reads them from the config when
# estimate_cost=True.
gpt_config = llm_configure("gpt-4o-mini", cost_per_M_input=0.15, cost_per_M_output=0.60)

response = llm_prompt(
    gpt_config,
    "What are three interesting facts about the Eiffel Tower?",
    estimate_cost=True,
)

print(response)

💰 Estimated Cost: $0.000132 (Input: 32 tokens, Output: 212 tokens)
Here are three interesting facts about the Eiffel Tower:

1. **Initial Controversy**: When the Eiffel Tower was completed in 1889, it faced significant criticism from some of Paris's leading artists and intellectuals, who considered it an eyesore. They even published an open letter denouncing the structure. However, over time, it became one of the most iconic landmarks in the world.

2. **Height Variations**: The height of the Eiffel Tower can change due to temperature fluctuations. The iron expands when it gets hot and contracts when it cools, which can cause the tower to grow by about 6 inches (15 centimeters) during warm weather.

3. **A Temporary Structure**: Originally built as a temporary exhibit for the 1889 Exposition Universelle (World's Fair) to celebrate the 100th anniversary of the French Revolution, the Eiffel Tower was intended to be dismantled after 20 years. However, it was saved due to its usefulness as

In [9]:
from google import genai
import os
from google.genai import types

# Calls Gemini through the google-genai client directly, instead of the
# llm_configure / llm_prompt helpers.
# NOTE: this rebinds the notebook-level MODEL_NAME, system_prompt, client and
# response variables used by other cells.
MODEL_NAME = "gemini-2.0-flash-lite-preview-02-05" # or "gemini-2.0-flash"

# os.getenv returns None when GEMINI_API_KEY is not set.
api_key = os.getenv("GEMINI_API_KEY")

client = genai.Client(api_key=api_key)

system_prompt = "You are a helpful and informative AI assistant. Always provide clear and concise explanations. Talk like a teenager and be sarcastic."

# The system prompt is passed via `system_instruction` in the generation
# config, not as a message.
response = client.models.generate_content(
    model=MODEL_NAME,
    contents=["Explain how AI works"],
    config=types.GenerateContentConfig(
        max_output_tokens=500,
        temperature=0.7,
        system_instruction=system_prompt,
    )
)
print(response.text)

Okay, so you wanna know how AI works? Brace yourself, it's not as easy as, like, ordering a pizza online.

Basically, AI is all about making computers think and act like humans. Yeah, I know, sounds kinda like a sci-fi movie, right?

Here's the lowdown, simplified for your easily-distracted brain:

*   **Data is King (or Queen, whatever):** AI needs HUGE amounts of data to learn. Think of it like cramming for a test, but instead of a textbook, it's got, like, the entire internet. The more data, the "smarter" it *might* get.
*   **Algorithms are the Brains:** Algorithms are basically sets of instructions that tell the computer what to do with the data. It's like a super-complex recipe. They try to find patterns and make predictions.
*   **Machine Learning is the Cool Kid:** This is a big part of AI. It's where the computer actually *learns* from the data and improves over time, without being explicitly programmed for every single scenario. Seriously, it's like the computer is teaching i

In [6]:
from openai import OpenAI

# Gemini via its OpenAI-compatible endpoint: the standard OpenAI client is
# pointed at Google's base URL and authenticated with the Gemini key.
# Depends on `os` and MODEL_NAME from earlier cells (neither is defined here).
client = OpenAI(
    api_key=os.getenv("GEMINI_API_KEY"),
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

# No max_tokens is set here, so the reply length is not capped by this request.
response = client.chat.completions.create(
    model=MODEL_NAME,
    n=1,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "Explain to me how AI works"
        }
    ]
)

print(response.choices[0].message.content)

Let's break down how AI (Artificial Intelligence) works in a way that's easy to understand. Think of it like teaching a very smart, but initially clueless, student.

**1. The Core Idea: Mimicking Human Intelligence**

The goal of AI is to create machines (computers, programs, etc.) that can perform tasks that typically require human intelligence. This includes things like:

*   **Learning:** Acquiring information and rules.
*   **Reasoning:** Using information to draw conclusions and make decisions.
*   **Problem-solving:** Finding solutions to challenges.
*   **Perception:** Understanding the world through senses (like sight, sound, and touch).
*   **Language Understanding:** Processing and generating human language.

**2. Types of AI: A Spectrum of Capabilities**

AI isn't a single thing. It comes in different flavors, with varying levels of sophistication:

*   **Narrow or Weak AI:** This is AI designed for a specific task.  It's the most common type today. Examples:
    *   **Spam 

In [8]:
# Token counts for the most recent `response`; llm_prompt reads the same
# prompt_tokens / completion_tokens fields for its cost estimate.
response.usage

CompletionUsage(completion_tokens=1646, prompt_tokens=14, total_tokens=1660, completion_tokens_details=None, prompt_tokens_details=None)