In [4]:
import os
import time
from math import exp

import numpy as np
import pandas as pd
from IPython.display import HTML, display
from openai import OpenAI
from pydantic import BaseModel, Field

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [54]:
def get_completion(
    messages: list[dict[str, str]],
    model: str = "gpt-4o-mini",
    max_tokens=500,
    temperature=0,
    stop=None,
    seed=123,
    tools=None,
    logprobs=None,  # whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message..
    top_logprobs=None,
) -> str:
    params = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": stop,
        "seed": seed,
        "logprobs": logprobs,
        "top_logprobs": top_logprobs,
    }
    if tools:
        params["tools"] = tools

    completion = client.chat.completions.create(**params)
    return completion

TEST_PROMPT = """Give me the name of the president of US at year 2000, return only the name, and nothing else"""
api_response = get_completion(
    [{"role": "user", "content": TEST_PROMPT}],
    logprobs=True,
    top_logprobs=3,
)
generate_answer = api_response.choices[0].message.content

possible_tokens = api_response.choices[0].logprobs.content
html_content = f"Question: {TEST_PROMPT}<br>Answer: {generate_answer}<br>"
for i, token in enumerate(possible_tokens, start=1):
    html_content += f"<span style='color: cyan'>Output token {i}:</span><br>"
    for j, logprob in enumerate(token.top_logprobs):
        html_content += (
            f"{logprob.token}, <span style='color: darkorange'>logprobs:</span> {logprob.logprob}, "
            f"<span style='color: magenta'>linear probability:</span> {np.round(np.exp(logprob.logprob)*100,2)}%<br>"
        )
display(HTML(html_content))
print("\n")





---

In [64]:
SYS_PROMPT = """You are provided with an article: {article}. Your task is to answer the user's question solely based on the information within this article. \
Before responding, assess whether the article contains enough information to answer the question accurately. \
Then, proceed to answer the question. If the article lacks sufficient information, respond with "Do not have sufficent information to answer the question". \
After providing your answer, evaluate whether your response is entirely based on the article's content and effectively answers the user's question."""
SYS_PROMPT_1 = """You are provided with an article: {article}. Your task is to answer the user's question solely based on the information within this article. \
But before you answer the question, assess whether the article contains enough information to answer the question accurately, Yes or No only, nothing else."""
SYS_PROMPT_2 = """You are provided with an article: {article}. Your task is to answer the user's question solely based on the information within this article. \
If you do not have sufficient information from the article to answer the question, respond with "Do not have sufficent information to answer the question"."""
SYS_PROMPT_3 = """You are given an article: {article}. You are also given an answer to user's question: {answer}.\
Evaluate whether the answer is entirely based on the article's content and effectively answers the user's question, Yes or No, nothing else."""
USER_PROMPT = """Question: {question}"""
#ARTICLE = "The president of the US at year of 2000 is Bill Clinton, and the president of the US at year 2024 is Joe Biden"
#QUESTION = "Give me the name of the president of US at year 2026, return only the name, and nothing else"
ARTICLE="""Augusta Ada King, Countess of Lovelace (née Byron; 10 December 1815 – 27 November 1852) was an English mathematician and writer, chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.
Ada Byron was the only legitimate child of poet Lord Byron and reformer Lady Byron. All Lovelace's half-siblings, Lord Byron's other children, were born out of wedlock to other women. Byron separated from his wife a month after Ada was born and left England forever. He died in Greece when Ada was eight. Her mother was anxious about her upbringing and promoted Ada's interest in mathematics and logic in an effort to prevent her from developing her father's perceived insanity. Despite this, Ada remained interested in him, naming her two sons Byron and Gordon. Upon her death, she was buried next to him at her request. Although often ill in her childhood, Ada pursued her studies assiduously. She married William King in 1835. King was made Earl of Lovelace in 1838, Ada thereby becoming Countess of Lovelace.
Her educational and social exploits brought her into contact with scientists such as Andrew Crosse, Charles Babbage, Sir David Brewster, Charles Wheatstone, Michael Faraday, and the author Charles Dickens, contacts which she used to further her education. Ada described her approach as "poetical science" and herself as an "Analyst (& Metaphysician)".
When she was eighteen, her mathematical talents led her to a long working relationship and friendship with fellow British mathematician Charles Babbage, who is known as "the father of computers". She was in particular interested in Babbage's work on the Analytical Engine. Lovelace first met him in June 1833, through their mutual friend, and her private tutor, Mary Somerville.
Between 1842 and 1843, Ada translated an article by the military engineer Luigi Menabrea (later Prime Minister of Italy) about the Analytical Engine, supplementing it with an elaborate set of seven notes, simply called "Notes".
Lovelace's notes are important in the early history of computers, especially since the seventh one contained what many consider to be the first computer program—that is, an algorithm designed to be carried out by a machine. Other historians reject this perspective and point out that Babbage's personal notes from the years 1836/1837 contain the first programs for the engine. She also developed a vision of the capability of computers to go beyond mere calculating or number-crunching, while many others, including Babbage himself, focused only on those capabilities. Her mindset of "poetical science" led her to ask questions about the Analytical Engine (as shown in her notes) examining how individuals and society relate to technology as a collaborative tool.
"""
QUESTION="What concepts did Lovelace build with Charles Babbage"
TOP_LOGPROBS_NUM = 3
#MODEL_NAME = "gpt-4o-2024-08-06"
MODEL_NAME="gpt-4o-mini"

In [65]:
class ResponseFormat(BaseModel):
    pre_validation: str = Field(
        description="""Before you answer, consider if you have sufficient information from the article to answer the question, respond with "Yes" or "No" only, nothing else."""
    )
    question_answer: str = Field(
        description="The answer of the question, nothing else. Anser Unknow if you do not have sufficient information from the article to answer the question."
    )
    post_validation: str = Field(
        descrption="""After providing your answer, evaluate whether your response is entirely based on the article's content and effectively answers the user's question., respond with "Yes" or "No" only, nothing else."""
    )


test_all_completion = client.beta.chat.completions.parse(
    model=MODEL_NAME,
    messages=[
        {"role": "system", "content": SYS_PROMPT.format(article=ARTICLE)},
        {"role": "user", "content": USER_PROMPT.format(question=QUESTION)},
    ],
    response_format=ResponseFormat,
    logprobs=True,
    top_logprobs=TOP_LOGPROBS_NUM,
)

test_all_message = test_all_completion.choices[0].message

In [66]:
possible_tokens = test_all_completion.choices[0].logprobs.content
print("Question: {}".format(QUESTION))
print(
    "Pre validation:\t\t{}\nQuestion anser:\t\t{}\nPost validation:\t{}".format(
        test_all_message.parsed.pre_validation,
        test_all_message.parsed.question_answer,
        test_all_message.parsed.post_validation,
    )
)
color_end_string = "\033[0m"
for i, token in enumerate(possible_tokens, start=1):
    print("Token {:<2}: ".format(i), end="")
    for j, logprob in enumerate(token.top_logprobs):
        if logprob.token == "Yes" or logprob.token == "No":
            color_start_string = "\033[1m"
        else:
            color_start_string = ""
        print(
            """{}{:<15} : {:<5}% {}""".format(
                color_start_string,
                repr(logprob.token),
                np.round(np.exp(logprob.logprob) * 100, 2),
                color_end_string,
            ),
            end="",
        )
        if j == len(token.top_logprobs) - 1:
            print()
        else:
            print("\t|\t", end="")

Question: What concepts did Lovelace build with Charles Babbage
Pre validation:		Yes
Question anser:		Lovelace built concepts around the capabilities of computers to go beyond mere calculating or number-crunching, focusing on technology as a collaborative tool, and she developed the first computer program through her notes on the Analytical Engine.
Post validation:	Yes
Token 1 : '{"'            : 100.0% [0m	|	'{'             : 0.0  % [0m	|	'{\n'           : 0.0  % [0m
Token 2 : 'pre'           : 100.0% [0m	|	'pr'            : 0.0  % [0m	|	'p'             : 0.0  % [0m
Token 3 : '_validation'   : 100.0% [0m	|	'_valid'        : 0.0  % [0m	|	'_val'          : 0.0  % [0m
Token 4 : '":"'           : 100.0% [0m	|	'":"","'        : 0.0  % [0m	|	'":"\''         : 0.0  % [0m
Token 5 : [1m'Yes'           : 99.75% [0m	|	[1m'No'            : 0.25 % [0m	|	' Yes'          : 0.0  % [0m
Token 6 : '","'           : 100.0% [0m	|	'.","'          : 0.0  % [0m	|	"','"           : 0.0  % 

---

In [67]:
def llm_test_all(MODEL_NAME="gpt-4o-2024-08-06"):

    class ResponseFormat(BaseModel):
        pre_validation: str = Field(
            description="""Before you answer, consider if you have sufficient information from the article to answer the question, respond with "Yes" or "No" only, nothing else."""
        )
        question_answer: str = Field(
            description="The answer of the question, nothing else. Anser Unknow if you do not have sufficient information from the article to answer the question."
        )
        post_validation: str = Field(
            descrption="""After providing your answer, evaluate whether your response is entirely based on the article's content and effectively answers the user's question., respond with "Yes" or "No" only, nothing else."""
        )

    start_time = time.time()
    test_all_completion = client.beta.chat.completions.parse(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYS_PROMPT.format(article=ARTICLE)},
            {"role": "user", "content": USER_PROMPT.format(question=QUESTION)},
        ],
        response_format=ResponseFormat,
        logprobs=True,
        top_logprobs=TOP_LOGPROBS_NUM,
    )
    end_time = time.time()
    cost_time = end_time - start_time
    test_all_message = test_all_completion.choices[0].message
    pre_validation_answer = test_all_message.parsed.pre_validation
    question_answer = test_all_message.parsed.question_answer
    post_validation_answer = test_all_message.parsed.post_validation
    pre_validation_probability = np.round(
        np.exp(test_all_completion.choices[0].logprobs.content[4].logprob) * 100, 2
    )
    post_validation_probability = np.round(
        np.exp(test_all_completion.choices[0].logprobs.content[-2].logprob) * 100, 2
    )
    return (
        pre_validation_answer,
        pre_validation_probability,
        question_answer,
        post_validation_answer,
        post_validation_probability,
        None,
        None,
        None,
        cost_time,
    )

In [68]:
llm_test_all(MODEL_NAME=MODEL_NAME)

('Yes',
 99.75,
 'Ada Lovelace built the concepts related to the Analytical Engine with Charles Babbage, including the idea that computers could have applications beyond mere calculation and could serve as collaborative tools for individuals and society.',
 'Yes',
 100.0,
 None,
 None,
 None,
 8.77829647064209)

In [69]:
def llm_test_seperate(MODEL_NAME="gpt-4o-2024-08-06"):
    start_time = time.time()
    test_1_completion = client.beta.chat.completions.parse(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYS_PROMPT_1.format(article=ARTICLE)},
            {"role": "user", "content": USER_PROMPT.format(question=QUESTION)},
        ],
        logprobs=True,
        top_logprobs=TOP_LOGPROBS_NUM,
    )
    end_time = time.time()
    cost_time_1 = end_time - start_time
    pre_validation_answer = test_1_completion.choices[0].message.content
    pre_validation_probability = np.round(
        np.exp(test_1_completion.choices[0].logprobs.content[0].logprob) * 100, 2
    )

    start_time = time.time()
    test_2_completion = client.beta.chat.completions.parse(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": SYS_PROMPT_2.format(article=ARTICLE)},
            {"role": "user", "content": USER_PROMPT.format(question=QUESTION)},
        ],
        logprobs=True,
        top_logprobs=TOP_LOGPROBS_NUM,
    )
    end_time = time.time()
    cost_time_2 = end_time - start_time
    question_answer = test_2_completion.choices[0].message.content

    start_time = time.time()
    test_3_completion = client.beta.chat.completions.parse(
        model=MODEL_NAME,
        messages=[
            {
                "role": "system",
                "content": SYS_PROMPT_3.format(article=ARTICLE, answer=question_answer),
            },
            {"role": "user", "content": USER_PROMPT.format(question=QUESTION)},
        ],
        logprobs=True,
        top_logprobs=TOP_LOGPROBS_NUM,
    )
    end_time = time.time()
    cost_time_3 = end_time - start_time
    post_validation_answer = test_3_completion.choices[0].message.content
    post_validation_probability = np.round(
        np.exp(test_3_completion.choices[0].logprobs.content[0].logprob) * 100, 2
    )
    total_time = cost_time_3 + cost_time_2 + cost_time_1
    return (
        pre_validation_answer,
        pre_validation_probability,
        question_answer,
        post_validation_answer,
        post_validation_probability,
        cost_time_1,
        cost_time_2,
        cost_time_3,
        total_time,
    )

In [70]:
llm_test_seperate(MODEL_NAME=MODEL_NAME)

('No',
 97.7,
 "Lovelace built a long working relationship and friendship with Charles Babbage, focusing on his work on the Analytical Engine. She developed a vision of the capability of computers to go beyond mere calculating or number-crunching, as she examined how individuals and society relate to technology as a collaborative tool. Additionally, her notes on the Analytical Engine included what many consider to be the first computer program, demonstrating her understanding of the machine's applications.",
 'Yes',
 99.68,
 0.728041410446167,
 1.6403543949127197,
 0.35438108444213867,
 2.7227768898010254)

In [71]:
results = []
for i in range(5):
    results.append(["A"] + list(llm_test_all()))
    results.append(["S"] + list(llm_test_seperate()))

In [72]:
df = pd.DataFrame(
    results,
    columns=[
        "Type",
        "Pre",
        "Pre_prob",
        "Answer",
        "Post",
        "Post_prob",
        "Time_pre",
        "Time_answer",
        "Time_post",
        "Time_total"
    ],
)
df

Unnamed: 0,Type,Pre,Pre_prob,Answer,Post,Post_prob,Time_pre,Time_answer,Time_post,Time_total
0,A,No,26.89,Do not have sufficent information to answer th...,Yes,99.99,,,,1.609255
1,S,No,99.93,The article does not specify particular concep...,Yes,99.48,1.102515,2.513767,0.353676,3.969959
2,A,No,29.42,Do not have sufficent information to answer th...,Yes,100.0,,,,1.000639
3,S,No,99.93,The article does not specify any particular co...,Yes,99.98,0.373406,2.149766,0.373589,2.896761
4,A,Yes,77.73,Ada Lovelace and Charles Babbage worked togeth...,Yes,100.0,,,,1.80231
5,S,No,99.99,The article does not specify particular concep...,Yes,99.96,0.378137,1.701163,0.434065,2.513365
6,A,Yes,77.73,Lovelace developed a vision of the capability ...,Yes,100.0,,,,1.562442
7,S,No,99.99,The article does not provide specific concepts...,Yes,99.97,0.374094,1.431242,0.363143,2.168478
8,A,Yes,79.82,Lovelace worked with Charles Babbage on the An...,Yes,100.0,,,,1.370857
9,S,No,99.99,Lovelace was particularly interested in Charle...,Yes,99.99,0.380145,2.582684,0.390399,3.353228


In [74]:
df[df["Type"] == "A"]["Time_total"].mean()

1.4691008567810058

In [75]:
df[df["Type"] == "S"]["Time_total"].mean()

2.9803582668304442