In [1]:
from openai import OpenAI
from math import exp
import numpy as np
from IPython.display import display, HTML
import os

# Load environment variables (python-dotenv) before the API key is read below.
from dotenv import load_dotenv
load_dotenv()

# The API key is taken from the environment rather than being hardcoded here.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [2]:
def get_completion(
    messages: list[dict[str, str]],
    model: str = "gpt-3.5-turbo",
    max_tokens=500,
    temperature=0,
    stop=None,
    seed=123,
    tools=None,
    logprobs=None,  # whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message.
    top_logprobs=None,
):
    """Send a chat completion request through the module-level ``client``.

    Args:
        messages: Chat messages, each a dict such as
            ``{"role": "user", "content": "..."}``.
        model: Model name forwarded to the API.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        stop: Forwarded as ``stop``; omitted from the request when None.
        seed: Forwarded as ``seed``.
        tools: Forwarded as ``tools`` only when truthy.
        logprobs: Forwarded as ``logprobs``; omitted from the request when None.
        top_logprobs: Forwarded as ``top_logprobs``; omitted when None.

    Returns:
        The full completion object returned by
        ``client.chat.completions.create`` (not a string); callers read
        fields such as ``.choices[0].message`` and ``.choices[0].logprobs``
        from it.
    """
    params = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "stop": stop,
        "seed": seed,
        "logprobs": logprobs,
        "top_logprobs": top_logprobs,
    }
    # Leave unset options out of the request instead of sending explicit
    # nulls, so the API applies its own defaults for them.
    params = {key: value for key, value in params.items() if value is not None}
    if tools:
        params["tools"] = tools

    return client.chat.completions.create(**params)

In [17]:
def single_inference_gpt(question):
    """Query gpt-3.5-turbo with one question and collect per-token probabilities.

    The instruction " Do not add anything else to your output: " is appended
    to ``question`` and the request is made via ``get_completion`` with
    ``logprobs=True`` and ``temperature=1``.

    Args:
        question: Prompt text (a string) to send as the user message.

    Returns:
        A 4-tuple ``(generated_text, tokens, log_probs, linear_probs)``:
            generated_text: list with the ``token`` of every returned entry.
            tokens: the per-token logprob entries taken from the response
                (an empty list when the response carries none).
            log_probs: list with the ``logprob`` of every returned entry.
            linear_probs: ``exp(logprob) * 100`` for every entry, i.e. the
                token probabilities expressed as percentages.
    """
    input_text = question + " Do not add anything else to your output: "

    API_RESPONSE = get_completion(
        [{"role": "user", "content": input_text}],
        model="gpt-3.5-turbo",
        logprobs=True,
        temperature=1,
    )

    # Both `logprobs` and `logprobs.content` may be None in a response;
    # fall back to an empty list so one odd reply cannot abort a long batch
    # with a TypeError.
    logprobs_info = API_RESPONSE.choices[0].logprobs
    tokens = (logprobs_info.content if logprobs_info is not None else None) or []

    generated_text = [entry.token for entry in tokens]
    log_probs = [entry.logprob for entry in tokens]
    # Convert log probabilities to linear probabilities (percent).
    linear_probs = [exp(log_prob) * 100 for log_prob in log_probs]

    return generated_text, tokens, log_probs, linear_probs

In [None]:
from tqdm import tqdm
import pandas as pd

df = pd.read_csv('../MMLU_5000_sample.csv')[:1000]

# Raw results, one 4-tuple per question:
# (generated_text, tokens, log_probs, linear_probs).
responses = []

# iterrows() has no length, so pass the total explicitly to get a real
# progress bar / ETA instead of a bare iteration counter.
for _, row in tqdm(df.iterrows(), total=len(df)):
    question = row['prompt']

    response = single_inference_gpt(question)

    responses.append(response)


# single_inference_gpt returns four values, but only three are persisted.
# Passing the 4-tuples straight to DataFrame with three column names raises
# ValueError, so select the intended fields explicitly; the raw token
# entries (second element) are not written out.
new_df = pd.DataFrame(
    [
        (generated, log_probs, linear_probs)
        for generated, _tokens, log_probs, linear_probs in responses
    ],
    columns=['generated', 'log_probs', 'linear_probs'],
)
# Assign by position (one response per row of df, in order) rather than
# relying on the two frames' indexes happening to line up.
new_df['correct_answer'] = df['answer'].to_numpy()

new_df.to_csv("gpt_lobprob_benchmark.csv", index=False)


611it [07:32,  1.20it/s]