In [3]:
# Enable the autoreload extension (mode 2) for this kernel session.
%load_ext autoreload
%autoreload 2
# Set the CUDA environment variables before any GPU library is imported.
# NOTE(review): device index 2 is specific to the authoring machine — confirm before re-running elsewhere.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=2


env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=2


In [4]:
import vllm
from outlines import generate, models

# Build the vLLM engine for the Hermes-2-Pro checkpoint, then wrap it in an
# outlines model object; `model` is what generate.regex() receives later on.
llm = vllm.LLM("NousResearch/Hermes-2-Pro-Llama-3-8B")
model = models.VLLM(llm)

INFO 09-25 14:32:14 llm_engine.py:223] Initializing an LLM engine (v0.6.1.post2) with config: model='NousResearch/Hermes-2-Pro-Llama-3-8B', speculative_config=None, tokenizer='NousResearch/Hermes-2-Pro-Llama-3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=NousResearch/Hermes-2-Pro-Llama-3-8B, use_v2_block_manager=False, num_scheduler_steps=1, enable

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:10<00:32, 10.87s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:22<00:23, 11.58s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:26<00:07,  7.78s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:38<00:00,  9.46s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:38<00:00,  9.56s/it]



INFO 09-25 14:32:56 model_runner.py:1008] Loading model weights took 14.9605 GB
INFO 09-25 14:32:57 gpu_executor.py:122] # GPU blocks: 13277, # CPU blocks: 2048
INFO 09-25 14:33:00 model_runner.py:1311] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-25 14:33:00 model_runner.py:1315] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-25 14:33:19 model_runner.py:1430] Graph capturing finished in 19 secs.


In [26]:
import httpx
from enum import Enum
from pydantic import BaseModel, Field
from typing import Union
import pandas as pd

def wikipedia(q):
    """Return the snippet of the top Wikipedia search hit for ``q``.

    Args:
        q: Search string sent as ``srsearch`` to the MediaWiki query API.

    Returns:
        The ``snippet`` field of the first search result.

    Raises:
        httpx.HTTPStatusError: The API answered with a 4xx/5xx status.
        IndexError: The search returned no results.
    """
    response = httpx.get("https://en.wikipedia.org/w/api.php", params={
        "action": "query",
        "list": "search",
        "srsearch": q,
        "format": "json"
    })
    # Fail on HTTP errors here instead of on a confusing JSON/Key error below.
    response.raise_for_status()
    hits = response.json()["query"]["search"]
    if not hits:
        # Same exception type as the bare `[0]` lookup would raise, but with a
        # message that is meaningful when echoed back as an Observation.
        raise IndexError(f"No Wikipedia search results for: {q}")
    return hits[0]["snippet"]




def mean(nums: list[int]) -> float:
    """Return the arithmetic mean of ``nums``, computed via a pandas Series."""
    values = pd.Series(nums)
    return values.mean()

def avg(nums: list[int]) -> float:
    """Return the average of ``nums``; same computation as ``mean``."""
    values = pd.Series(nums)
    return values.mean()

def get_max(nums: list[int]) -> int:
    """Return the largest value in ``nums``, computed via a pandas Series."""
    values = pd.Series(nums)
    return values.max()

def get_min(nums: list[int]) -> int:
    """Return the smallest value in ``nums``, computed via a pandas Series."""
    values = pd.Series(nums)
    return values.min()

def get_sum(nums: list[int]) -> int:
    """Return the total of all values in ``nums``, computed via a pandas Series."""
    values = pd.Series(nums)
    return values.sum()

def sort(nums: list[int]) -> list[int]:
    """Return the values of ``nums`` in ascending order.

    Args:
        nums: Numbers to order.

    Returns:
        A new list of native Python numbers sorted ascending.
    """
    # .tolist() converts numpy scalars to built-in numbers, so the result
    # matches the annotation and stringifies as "[1, 2, 3]" rather than a
    # list of numpy scalar reprs.
    return pd.Series(nums).sort_values().tolist()


# Tool names the model is allowed to pick for the "Action" field.
# Each member's name equals its string value and matches a function defined
# above; query() checks membership by name and then looks that name up in
# globals() to find the function to call.
class Action(str, Enum):
    mean = "mean"
    avg = "avg"  # same computation as mean
    get_max = "get_max"
    get_min = "get_min"
    get_sum = "get_sum"
    sort = "sort"
    
    


# Shape of one intermediate agent step: notes, reasoning, the chosen tool and
# its argument text. Field names are the JSON keys that query() reads back.
class Reason_and_Act(BaseModel):
    Scratchpad: str = Field(..., description="Information from the Observation useful to answer the question")
    Thought: str = Field(..., description="It describes your thoughts about the question you have been asked")
    # The annotation refers to the module-level Action enum; the field shares its name.
    Action: Action
    # Declared as a plain string, so the tool argument arrives as text.
    Action_Input: str = Field(..., description="The arguments of the Action.")
    
# Shape of the terminating step: query() treats a decision containing the
# "Final_Answer" key as the end of the loop and returns that field.
class Final_Answer(BaseModel):
    Scratchpad: str = Field(..., description="Information from the Observation useful to answer the question")
    Final_Answer: str = Field(..., description="Answer to the question grounded on the Observation")
    

# Top-level object the model must emit: a single "Decision" key holding either
# an intermediate Reason_and_Act step or a Final_Answer.
class Decision(BaseModel):
    Decision: Union[Reason_and_Act, Final_Answer]

In [27]:
from outlines.integrations.utils import convert_json_schema_to_str
from outlines.fsm.json_schema import build_regex_from_schema

# Turn the Decision model into the regex used for constrained generation:
# pydantic model -> JSON schema -> schema string -> regex.
json_schema = Decision.model_json_schema()
schema_str = convert_json_schema_to_str(json_schema=json_schema)
regex_str = build_regex_from_schema(schema_str)
# Printed for inspection; regex_str is consumed later by ChatBot.execute().
print(regex_str)

\{[ ]?"Decision"[ ]?:[ ]?(\{[ ]?"Scratchpad"[ ]?:[ ]?"([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])*"[ ]?,[ ]?"Thought"[ ]?:[ ]?"([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])*"[ ]?,[ ]?"Action"[ ]?:[ ]?("mean"|"avg"|"get_max"|"get_min"|"get_sum"|"sort")[ ]?,[ ]?"Action_Input"[ ]?:[ ]?"([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])*"[ ]?\}|\{[ ]?"Scratchpad"[ ]?:[ ]?"([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])*"[ ]?,[ ]?"Final_Answer"[ ]?:[ ]?"([^"\\\x00-\x1F\x7F-\x9F]|\\["\\])*"[ ]?\})[ ]?\}


In [51]:
import datetime

def generate_hermes_prompt(question, schema=""):
    """Build the ChatML prompt (system + user turn + assistant header) for the agent.

    Args:
        question: The user's question; must be a string.
        schema: JSON schema the reply must follow. A dict or list (such as the
            result of ``model_json_schema()``) is serialized to JSON text;
            any other value is interpolated as-is.

    Returns:
        The full prompt string, ending with the open assistant turn.
    """
    import json  # local import so this cell does not depend on a later cell's imports

    # The system text announces a "json schema"; interpolating a dict directly
    # would emit a Python repr (single quotes, True/None) instead of JSON.
    if isinstance(schema, (dict, list)):
        schema_text = json.dumps(schema)
    else:
        schema_text = schema
    today = datetime.datetime.today().strftime('%Y-%m-%d')
    return (
        "<|im_start|>system\n"
        "You are a world class AI model who answers questions in JSON with correct Pydantic schema. "
        f"Here's the json schema you must adhere to:\n<schema>\n{schema_text}\n</schema>\n"
        "Today is " + today + ".\n" +
        "You run in a loop of Scratchpad, Thought, Action, Action Input, PAUSE, Observation. "
        "At the end of the loop you output a Final Answer. "
        "Use Scratchpad to store the information from the Observation useful to answer the question "
        "Use Thought to describe your thoughts about the question you have been asked "
        "and reflect carefully about the Observation if it exists. "
        "Use Action to run one of the actions available to you. "
        "Use Action Input to input the arguments of the selected action - then return PAUSE. "
        "Observation will be the result of running those actions. "
        "Your available actions are:\n"
        "mean:\n"
        "e.g. mean: [1, 2, 3, 4, 5]\n"
        "Calculates the mean (average) of a list of numbers.\n"
        "avg:\n"
        "e.g. avg: [1, 2, 3, 4, 5]\n"
        "Alias for mean - calculates the average of a list of numbers.\n"
        "get_max:\n"
        "e.g. get_max: [1, 2, 3, 4, 5]\n"
        "Finds the maximum value in a list of numbers.\n"
        "get_min:\n"
        "e.g. get_min: [1, 2, 3, 4, 5]\n"
        "Finds the minimum value in a list of numbers.\n"
        "get_sum:\n"
        "e.g. get_sum: [1, 2, 3, 4, 5]\n"
        "Calculates the sum of a list of numbers.\n"
        "sort:\n"
        "e.g. sort: [5, 1, 3, 2, 4]\n"
        "Sorts a list of numbers in ascending order.\n"
        "Think step by step and always use minimum one action. "
        "DO NOT TRY TO GUESS THE ANSWER. Begin! <|im_end|>"
        "\n<|im_start|>user\n" + question + "<|im_end|>"
        "\n<|im_start|>assistant\n"
        )

    

In [29]:
class ChatBot:
    """Accumulates prompt text and runs regex-constrained generation over it."""

    def __init__(self, prompt=""):
        """Start the conversation with ``prompt`` as the initial text."""
        self.prompt = prompt

    def __call__(self, user_prompt):
        """Append ``user_prompt`` to the stored prompt and return the generation."""
        self.prompt += user_prompt
        return self.execute()

    def execute(self):
        """Generate a completion for the stored prompt, constrained by ``regex_str``."""
        constrained = generate.regex(model, regex_str)
        return constrained(self.prompt, max_tokens=1024, seed=42)
    

In [49]:
import json

def _parse_action_input(raw):
    """Decode the model's ``Action_Input`` text into the list the tools expect."""
    parsed = json.loads(raw)
    if not isinstance(parsed, list):
        raise ValueError(f"Action Input must be a JSON list of numbers, got: {raw}")
    return parsed


def query(question, max_turns=5):
    """Run the Scratchpad/Thought/Action/Observation loop for ``question``.

    Each turn regenerates the system+user prompt, appends the transcript of
    earlier steps, and asks the model for a ``Decision``. A ``Reason_and_Act``
    decision runs the named tool and records its Observation; a
    ``Final_Answer`` decision ends the loop.

    Args:
        question: The user's question.
        max_turns: Maximum number of model calls before giving up.

    Returns:
        The model's final answer, or ``"No answer found"`` when ``max_turns``
        is exhausted.
    """
    # generate_hermes_prompt() already ends with the user turn and the open
    # assistant header, so the transcript starts empty rather than repeating them.
    next_prompt = ""
    previous_actions = []
    for _ in range(max_turns):
        prompt = generate_hermes_prompt(question=question, schema=Decision.model_json_schema())
        bot = ChatBot(prompt=prompt)
        result = bot(next_prompt)
        json_result = json.loads(result)['Decision']

        if "Final_Answer" in json_result:
            scratchpad = json_result["Scratchpad"]
            final_answer = json_result["Final_Answer"]
            print(f"\x1b[34m Scratchpad: {scratchpad} \x1b[0m")
            print(f"\x1b[34m Final Answer: {final_answer} \x1b[0m")
            return final_answer

        # Keep the model's scratchpad on every turn; the former `i == 0` guard
        # could never be true and always blanked it.
        scratchpad = json_result['Scratchpad']
        thought = json_result['Thought']
        action = json_result['Action']
        action_input = json_result['Action_Input']
        print(f"\x1b[34m Scratchpad: {scratchpad} \x1b[0m")
        print(f"\x1b[34m Thought: {thought} \x1b[0m")
        print(f"\x1b[36m  -- running {action}: {str(action_input)}\x1b[0m")

        action_key = action + ": " + str(action_input)
        if action_key in previous_actions:
            observation = "You already ran that action. **TRY A DIFFERENT ACTION INPUT.**"
        else:
            try:
                # Only names declared in the Action enum may be dispatched.
                if action in Action.__members__:
                    # Action_Input is text; decode it to a list before calling
                    # the tool, which expects numbers rather than a string.
                    observation = globals()[action](_parse_action_input(action_input))
                else:
                    observation = "Invalid action."
            except Exception as e:
                # Best effort: feed the error back to the model as the Observation.
                observation = f"{e}"
        print()
        print(f"\x1b[33m Observation: {observation} \x1b[0m")
        print()
        previous_actions.append(action_key)
        next_prompt += (
            "\nScratchpad: " + scratchpad +
            "\nThought: " + thought +
            "\nAction: " + action  +
            "\nAction Input: " + action_input +
            "\nObservation: " + str(observation)
        )
    print("\nFinal Answer: I am sorry, but I am unable to answer your question. Please provide more information or a different question.")
    return "No answer found"

In [52]:
# Demo run: ask one comparison question with the data table embedded in the
# question text as markdown, and print the answer that query() returns.
print(query("Think step by step and reason about your answer. Was Caterpillar's average total revenue higher or lower than Realogy's lowest net income from 2019 to 2021? Here is the table as markdown: '|    | company1               | facts              |   year |      value |\n|---:|:-----------------------|:-------------------|-------:|-----------:|\n|  0 | CATERPILLAR INC        | us-gaap:Revenues   |   2019 | 5.38e+10   |\n|  1 | Realogy Holdings Corp. | us-gaap:ProfitLoss |   2019 | 1.85e+08   |\n|  2 | CATERPILLAR INC        | us-gaap:Revenues   |   2020 | 4.1748e+10 |\n|  3 | Realogy Holdings Corp. | us-gaap:ProfitLoss |   2020 | 3.56e+08   |\n|  4 | CATERPILLAR INC        | us-gaap:Revenues   |   2021 | 5.0971e+10 |\n|  5 | Realogy Holdings Corp. | us-gaap:ProfitLoss |   2021 | 3.5e+08    |'"))

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.93s/it, est. speed input: 685.73 toks/s, output: 36.77 toks/s]

[34m Scratchpad: Caterpillar's average total revenue and Realogy's lowest net income from 2019 to 2021 need to be calculated and compared. [0m
[34m Final Answer: Caterpillar's average total revenue was higher than Realogy's lowest net income from 2019 to 2021. [0m
Caterpillar's average total revenue was higher than Realogy's lowest net income from 2019 to 2021.



