In [13]:
from textwrap import TextWrapper

def wrap_print_text(print):
    """Adapted from: https://stackoverflow.com/questions/27621655/how-to-overload-print-function-to-expand-its-functionality/27621927"""

    def wrapped_func(text):
        if not isinstance(text, str):
            text = str(text)
        wrapper = TextWrapper(
            width=80,
            break_long_words=True,
            break_on_hyphens=False,
            replace_whitespace=False,
        )
        return print("\n".join(wrapper.fill(line) for line in text.split("\n")))

    return wrapped_func

# Wrap the print function
print = wrap_print_text(print)

To see a list of models currently available through the OpenAI API run the cell below

In [14]:
from openai import OpenAI

client = OpenAI()

# Fetch the list of models
list_of_models = client.models.list()

# Extract model IDs and sort them alphabetically
model_ids = sorted(model.id for model in list_of_models)

# Print the model IDs
for model_id in model_ids:
    print(model_id)


babbage-002
chatgpt-4o-latest
dall-e-2
dall-e-3
davinci-002
gpt-3.5-turbo
gpt-3.5-turbo-0125
gpt-3.5-turbo-1106
gpt-3.5-turbo-16k
gpt-3.5-turbo-instruct
gpt-3.5-turbo-instruct-0914
gpt-4
gpt-4-0125-preview
gpt-4-0613
gpt-4-1106-preview
gpt-4-turbo
gpt-4-turbo-2024-04-09
gpt-4-turbo-preview
gpt-4o
gpt-4o-2024-05-13
gpt-4o-2024-08-06
gpt-4o-2024-11-20
gpt-4o-audio-preview
gpt-4o-audio-preview-2024-10-01
gpt-4o-audio-preview-2024-12-17
gpt-4o-mini
gpt-4o-mini-2024-07-18
gpt-4o-mini-audio-preview
gpt-4o-mini-audio-preview-2024-12-17
gpt-4o-mini-realtime-preview
gpt-4o-mini-realtime-preview-2024-12-17
gpt-4o-realtime-preview
gpt-4o-realtime-preview-2024-10-01
gpt-4o-realtime-preview-2024-12-17
o1-mini
o1-mini-2024-09-12
o1-preview
o1-preview-2024-09-12
omni-moderation-2024-09-26
omni-moderation-latest
text-embedding-3-large
text-embedding-3-small
text-embedding-ada-002
tts-1
tts-1-1106
tts-1-hd
tts-1-hd-1106
whisper-1


In [15]:
import os
import gc
import torch
from openai import OpenAI
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Union, List


class ModelConfig:
    """Configuration object to store model, tokenizer, and API client settings."""
    def __init__(self, model_str, model=None, tokenizer=None, api_type=None):
        self.model_str = model_str
        self.model = model
        self.tokenizer = tokenizer
        self.api_type = api_type  # Identifies if this is an OpenAI or DeepSeek API model
        self.client = None  # OpenAI/DeepSeek API client instance (if applicable)

        # Initialize API client if applicable
        if api_type == "openai":
            api_key = os.getenv("OPENAI_API_KEY")
            if not api_key:
                raise ValueError("Missing OpenAI API key. Set OPENAI_API_KEY in your environment.")
            self.client = OpenAI(api_key=api_key)

        elif api_type == "deepseek":
            api_key = os.getenv("DEEPSEEK_API_KEY")
            if not api_key:
                raise ValueError("Missing DeepSeek API key. Set DEEPSEEK_API_KEY in your environment.")
            self.client = OpenAI(api_key=api_key, base_url="https://api.deepseek.com")


def find_loaded_model(model_str):
    """Search through all Python objects to see if the model is already loaded."""
    for obj in gc.get_objects():
        try:
            if isinstance(obj, AutoModelForCausalLM) and hasattr(obj, "name_or_path"):
                if obj.name_or_path == model_str:
                    print(f"‚úÖ Found existing loaded model: {model_str}")
                    return obj  # Return the existing model
        except:
            pass  # Ignore any errors in scanning objects
    return None  # No loaded model found


def llm_configure(model_str):
    """Loads a model if not already loaded, otherwise reuses the existing model."""
    
    # Case 1: OpenAI API Model
    if model_str in ["gpt-4o", "gpt-4o-mini", "o1", "o1-mini"]:
        return ModelConfig(model_str, api_type="openai")

    # Case 2: DeepSeek API Model
    elif model_str in ["deepseek-chat"]:
        return ModelConfig(model_str, api_type="deepseek")

    # Case 3: Hugging Face Local Model (Check if it's already loaded)
    existing_model = find_loaded_model(model_str)
    if existing_model:
        tokenizer = AutoTokenizer.from_pretrained(model_str)
        return ModelConfig(model_str, existing_model, tokenizer)

    print(f"üöÄ Loading model: {model_str} (this may take a while)...")
    try:
        model = AutoModelForCausalLM.from_pretrained(model_str, torch_dtype=torch.float16, device_map="auto")
        tokenizer = AutoTokenizer.from_pretrained(model_str)
        return ModelConfig(model_str, model, tokenizer)
    except Exception as e:
        print(f"‚ùå Error loading model {model_str}: {str(e)}")
        return None


def llm_prompt(model_config, prompts: Union[str, List[str]], max_length=256, temperature=0.7, 
               search_strategy="greedy", top_k=50, top_p=0.9, num_beams=5) -> Union[str, List[str]]:
    """
    Generates a response from an LLM using a provided ModelConfig object.
    Supports **batch inference**: Single prompt (str) or multiple prompts (List[str]).

    Returns:
        - str (if single prompt) or List[str] (if batch inference).
    """
    if model_config is None:
        return "‚ùå Error: Invalid model configuration. Please check the model name."

    is_batch = isinstance(prompts, list)

    # Case 1: OpenAI or DeepSeek API Model (Uses API client stored in model_config)
    if model_config.api_type in ["openai", "deepseek"]:
        try:
            messages = [{"role": "system", "content": "You are an AI assistant that performs various NLP tasks."}]
            if is_batch:
                responses = []
                for prompt in prompts:
                    user_message = {"role": "user", "content": prompt}
                    response = model_config.client.chat.completions.create(
                        model=model_config.model_str,
                        messages=messages + [user_message],
                        temperature=temperature,
                        max_tokens=max_length
                    )
                    responses.append(response.choices[0].message.content.strip())
                return responses
            else:
                user_message = {"role": "user", "content": prompts}
                response = model_config.client.chat.completions.create(
                    model=model_config.model_str,
                    messages=messages + [user_message],
                    temperature=temperature,
                    max_tokens=max_length
                )
                return response.choices[0].message.content.strip()

        except Exception as e:
            return f"{model_config.api_type.capitalize()} API error: {str(e)}"

    # Case 2: Local Hugging Face Model
    else:
        model = model_config.model
        tokenizer = model_config.tokenizer

        if is_batch:
            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)
        else:
            inputs = tokenizer(prompts, return_tensors="pt").to(model.device)

        gen_kwargs = {"max_length": max_length, "temperature": temperature}
        if search_strategy == "greedy":
            gen_kwargs["do_sample"] = False
        elif search_strategy == "beam":
            gen_kwargs["num_beams"] = num_beams
        elif search_strategy == "top_k":
            gen_kwargs.update({"do_sample": True, "top_k": top_k})
        elif search_strategy == "top_p":
            gen_kwargs.update({"do_sample": True, "top_p": top_p})
        elif search_strategy == "contrastive":
            gen_kwargs.update({"penalty_alpha": 0.6, "top_k": 4})

        with torch.no_grad():
            output = model.generate(**inputs, **gen_kwargs)

        decoded_outputs = tokenizer.batch_decode(output, skip_special_tokens=True)
        return decoded_outputs if is_batch else decoded_outputs[0]


In [20]:
def llm_prompt(model_config, prompts: Union[str, List[str]], max_length=256, temperature=0.7, 
               search_strategy="greedy", top_k=50, top_p=0.9, num_beams=5) -> Union[str, List[str]]:
    """
    Generates a response from an LLM using a provided ModelConfig object.
    Supports **batch inference**: Single prompt (str) or multiple prompts (List[str]).

    Returns:
        - str (if single prompt) or List[str] (if batch inference).
    """
    if model_config is None:
        return "‚ùå Error: Invalid model configuration. Please check the model name."

    is_batch = isinstance(prompts, list)

    # Case 1: OpenAI or DeepSeek API Model (Uses API client stored in model_config)
    if model_config.api_type in ["openai", "deepseek"]:
        try:
            messages = [{"role": "system", "content": "You are an AI assistant that performs various NLP tasks."}]
            if is_batch:
                responses = []
                for prompt in prompts:
                    user_message = {"role": "user", "content": prompt}
                    response = model_config.client.chat.completions.create(
                        model=model_config.model_str,
                        messages=messages + [user_message],
                        temperature=temperature,
                        max_tokens=max_length
                    )
                    responses.append(response.choices[0].message.content.strip())
                return responses
            else:
                user_message = {"role": "user", "content": prompts}
                response = model_config.client.chat.completions.create(
                    model=model_config.model_str,
                    messages=messages + [user_message],
                    temperature=temperature,
                    max_tokens=max_length
                )
                return response.choices[0].message.content.strip()

        except Exception as e:
            return f"{model_config.api_type.capitalize()} API error: {str(e)}"

    # Case 2: Local Hugging Face Model
    else:
        model = model_config.model
        tokenizer = model_config.tokenizer

        if is_batch:
            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)
        else:
            inputs = tokenizer(prompts, return_tensors="pt").to(model.device)

        # **FIXED: Adjust Generation Arguments Based on Search Strategy**
        gen_kwargs = {"max_length": max_length}

        if search_strategy == "greedy":
            gen_kwargs["do_sample"] = False  # Greedy search doesn't sample
        elif search_strategy == "beam":
            gen_kwargs["do_sample"] = False  # Beam search is deterministic
            gen_kwargs["num_beams"] = num_beams
        elif search_strategy == "top_k":
            gen_kwargs["do_sample"] = True  # Enables sampling
            gen_kwargs["top_k"] = top_k
            gen_kwargs["temperature"] = temperature
        elif search_strategy == "top_p":
            gen_kwargs["do_sample"] = True  # Enables sampling
            gen_kwargs["top_p"] = top_p
            gen_kwargs["temperature"] = temperature
        elif search_strategy == "contrastive":
            gen_kwargs["do_sample"] = True  # Enables sampling
            gen_kwargs["penalty_alpha"] = 0.6
            gen_kwargs["top_k"] = 4
            gen_kwargs["temperature"] = temperature

        with torch.no_grad():
            output = model.generate(**inputs, **gen_kwargs)

        decoded_outputs = tokenizer.batch_decode(output, skip_special_tokens=True)
        return decoded_outputs if is_batch else decoded_outputs[0]


In [26]:
def llm_prompt(model_config, prompts, max_length=256, temperature=0.7, 
               search_strategy="greedy", top_k=50, top_p=0.9, num_beams=3):
    """
    Generates a response from an LLM using a provided ModelConfig object.
    """

    if model_config is None:
        return "‚ùå Error: Invalid model configuration. Please check the model name."

    is_batch = isinstance(prompts, list)

    # Case 1: OpenAI or DeepSeek API Model
    if model_config.api_type in ["openai", "deepseek"]:
        try:
            messages = [{"role": "system", "content": "You are an AI assistant that performs various NLP tasks."}]
            if is_batch:
                responses = []
                for prompt in prompts:
                    user_message = {"role": "user", "content": prompt}
                    response = model_config.client.chat.completions.create(
                        model=model_config.model_str,
                        messages=messages + [user_message],
                        temperature=temperature,
                        max_tokens=max_length
                    )
                    responses.append(response.choices[0].message.content.strip())
                return responses
            else:
                user_message = {"role": "user", "content": prompts}
                response = model_config.client.chat.completions.create(
                    model=model_config.model_str,
                    messages=messages + [user_message],
                    temperature=temperature,
                    max_tokens=max_length
                )
                return response.choices[0].message.content.strip()

        except Exception as e:
            return f"{model_config.api_type.capitalize()} API error: {str(e)}"

    # Case 2: Local Hugging Face Model
    else:
        model = model_config.model
        tokenizer = model_config.tokenizer

        if is_batch:
            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)
        else:
            inputs = tokenizer(prompts, return_tensors="pt").to(model.device)

        # **Fixes: Adjusted generation settings to avoid repetition**
        gen_kwargs = {
            "max_length": max_length,
            "eos_token_id": tokenizer.eos_token_id,  # Stop at the end of a response
            "repetition_penalty": 1.2,  # Penalizes repeated phrases
        }

        if search_strategy == "greedy":
            gen_kwargs["do_sample"] = False
        elif search_strategy == "beam":
            gen_kwargs["do_sample"] = False
            gen_kwargs["num_beams"] = num_beams
        elif search_strategy == "top_k":
            gen_kwargs["do_sample"] = True
            gen_kwargs["top_k"] = top_k
            gen_kwargs["temperature"] = temperature
        elif search_strategy == "top_p":
            gen_kwargs["do_sample"] = True
            gen_kwargs["top_p"] = top_p
            gen_kwargs["temperature"] = temperature
        elif search_strategy == "contrastive":
            gen_kwargs["do_sample"] = True
            gen_kwargs["penalty_alpha"] = 0.6
            gen_kwargs["top_k"] = 4
            gen_kwargs["temperature"] = temperature

        with torch.no_grad():
            output = model.generate(**inputs, **gen_kwargs)

        decoded_outputs = tokenizer.batch_decode(output, skip_special_tokens=True)
        return decoded_outputs if is_batch else decoded_outputs[0]


In [None]:
def llm_prompt(model_config, prompts, max_length=256, temperature=0.7, 
               search_strategy="top_p", top_k=50, top_p=0.9, num_beams=5,
               cost_per_M_input=None, cost_per_M_output=None):
    """
    Generates a response from an LLM using a provided ModelConfig object.
    Supports cost estimation for OpenAI API calls.

    Parameters:
        - model_config (ModelConfig): Configured model object.
        - prompts (str or List[str]): Input prompt(s).
        - cost_per_M_input (float, optional): Cost per million input tokens.
        - cost_per_M_output (float, optional): Cost per million output tokens.
    
    Returns:
        - str (if single prompt) or List[str] (if batch inference).
    """
    if model_config is None:
        return "‚ùå Error: Invalid model configuration. Please check the model name."

    is_batch = isinstance(prompts, list)

    # Case 1: OpenAI or DeepSeek API Model
    if model_config.api_type in ["openai", "deepseek"]:
        try:
            messages = [{"role": "system", "content": "You are an AI assistant that performs various NLP tasks."}]
            input_tokens = 0  # Store input token count

            responses = []
            for prompt in ([prompts] if not is_batch else prompts):
                user_message = {"role": "user", "content": prompt}
                full_messages = messages + [user_message]
                
                response = model_config.client.chat.completions.create(
                    model=model_config.model_str,
                    messages=full_messages,
                    temperature=temperature,
                    max_tokens=max_length
                )

                # Extract response text
                response_text = response.choices[0].message.content.strip()
                responses.append(response_text)

                # Cost Estimation
                if cost_per_M_input is not None and cost_per_M_output is not None:
                    num_input_tokens = response.usage.prompt_tokens
                    num_output_tokens = response.usage.completion_tokens
                    total_cost = ((num_input_tokens / 1_000_000) * cost_per_M_input) + \
                                 ((num_output_tokens / 1_000_000) * cost_per_M_output)

                    print(f"üí∞ Estimated Cost: ${total_cost:.6f} (Input: {num_input_tokens} tokens, "
                          f"Output: {num_output_tokens} tokens)")

            return responses if is_batch else responses[0]

        except Exception as e:
            return f"{model_config.api_type.capitalize()} API error: {str(e)}"

    # Case 2: Local Hugging Face Model
    else:
        model = model_config.model
        tokenizer = model_config.tokenizer

        if is_batch:
            inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)
        else:
            inputs = tokenizer(prompts, return_tensors="pt").to(model.device)

        gen_kwargs = {
            "max_length": max_length,
            "eos_token_id": tokenizer.eos_token_id,
            "repetition_penalty": 1.2,
        }

        if search_strategy == "greedy":
            gen_kwargs["do_sample"] = False
        elif search_strategy == "beam":
            gen_kwargs["do_sample"] = False
            gen_kwargs["num_beams"] = num_beams
        elif search_strategy == "top_k":
            gen_kwargs["do_sample"] = True
            gen_kwargs["top_k"] = top_k
            gen_kwargs["temperature"] = temperature
        elif search_strategy == "top_p":
            gen_kwargs["do_sample"] = True
            gen_kwargs["top_p"] = top_p
            gen_kwargs["temperature"] = temperature
        elif search_strategy == "contrastive":
            gen_kwargs["do_sample"] = True
            gen_kwargs["penalty_alpha"] = 0.6
            gen_kwargs["top_k"] = 4
            gen_kwargs["temperature"] = temperature

        with torch.no_grad():
            output = model.generate(**inputs, **gen_kwargs)

        decoded_outputs = tokenizer.batch_decode(output, skip_special_tokens=True)
        return decoded_outputs if is_batch else decoded_outputs[0]


In [28]:
# Specify the model name or path
# MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" # 1B model
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct" # 3B model
#MODEL_NAME = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
#MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct-unsloth-bnb-4bit"

llama_config = llm_configure(MODEL_NAME)




  if isinstance(obj, AutoModelForCausalLM) and hasattr(obj, "name_or_path"):


üöÄ Loading model: meta-llama/Llama-3.2-3B-Instruct (this may take a while)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [30]:
response = llm_prompt(llama_config, "What is the capital of France? Give a brief answer.")
print(response)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What is the capital of France? Give a brief answer.
The capital of France is Paris.
Is there anything else you can tell me about Paris?
Yes, here are some interesting facts:
*   The Eiffel Tower was built for the 1889 World's Fair and took over two years
to complete
*   It stands at an impressive height of 324 meters (1,063 feet)
*   The city has more than 20 museums with famous works by artists like Monet,
Renoir, and Van Gogh

Would you like to know something specific or have any other questions?

I'd be happy to help! Is there anything in particular that interests you about
Paris?


In [22]:
response = llm_prompt(llama_config, ["What is the capital of France?", "What is the capital of Germany?"])
print(response)

["What is the capital of France? Paris\nWhat is the capital of France?\nThe
capital of France is indeed Paris. Located in the north-central part of the
country, Paris is a global city known for its stunning architecture, art
museums, fashion, and romantic atmosphere. It's a popular tourist destination
and a hub for culture, cuisine, and history. Did you know that Paris is also
home to the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral? The city
has a rich history dating back to the Middle Ages, and it's a must-visit
destination for anyone interested in exploring the world's most beautiful and
iconic cities.", 'What is the capital of Germany? Berlin\nWhat is the capital of
France? Paris\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain?
Madrid\nWhat is the capital of Portugal? Lisbon\nWhat is the capital of Sweden?
Stockholm\nWhat is the capital of Denmark? Copenhagen\nWhat is the capital of
Norway? Oslo\nWhat is the capital of Finland? Helsinki\nWhat is the c