In [9]:
import openai
import json
import time
import tqdm
import ast
import random
import argparse
import pdb
from time import sleep
import os

# Connection settings for the Azure-hosted OpenAI endpoint. They are defined
# inline in this file; the import that used to supply them is kept for reference.
# from utils import global_template, run_logger, task_list, load_copa, run_batched_logger, api_base, api_type, api_version, openai_key

api_base = "https://msri-openai-ifaq.azure-api.net"
api_type = "azure"
api_version = "2023-03-15-preview"

# Apply the settings to the openai module in one step.
openai.api_base, openai.api_type, openai.api_version = (
    api_base,
    api_type,
    api_version,
)
# The API key is not set here.
# openai.api_key = openai_key


def attenuator(number_of_failed_hits):
    """Return the exponential back-off value ``2 ** number_of_failed_hits``.

    Args:
        number_of_failed_hits: Exponent applied to the base 2.

    Returns:
        Two raised to the given power.
    """
    backoff_base = 2
    return pow(backoff_base, number_of_failed_hits)


def construct_prompt(query, task):
    """Build the chat messages that ask the model to rate a summary or translation.

    Args:
        query: Mapping providing the keys ``"lang"``, ``"text"`` and ``"summ"``.
        task: Selects the system prompt; ``"xlsum"`` or ``"translation"``.

    Returns:
        A list with two ``{"role": ..., "content": ...}`` dicts: the system
        message followed by the user message.

    Raises:
        KeyError: If ``task`` has no system prompt or ``query`` lacks a key
            that is read here.
    """
    language = query["lang"]
    # These are plain templates (not f-strings); the ``{language}`` placeholder
    # is filled in explicitly below so the model sees the actual language
    # instead of the literal text "{language}".
    sys_prompt = {
        "xlsum": """You are an LLM evaluator. Given the main text and the corresponding summary in {language}, your task is to rate the summary.
    Rate the summary on a scale of 1 to 5, with 1 being the worst and 5 being the best. \n\n Just output the score and nothing else. \n\n""",
        "translation": """You are an LLM evaluator. Given the main text and the corresponding translation in {language}, your task is to rate the translation.
    Rate the translation on a scale of 1 to 5, with 1 being the worst and 5 being the best. \n\nJust output the score and nothing else. \n\n""",
    }
    system_content = sys_prompt[task].format(language=language)

    # NOTE(review): the user message labels the candidate text "Summary" for
    # both tasks; left unchanged here.
    return [
        {"role": "system", "content": system_content},
        {
            "role": "user",
            "content": f"Main text: {query['text']}\n Summary: {query['summ']}\n",
        },
    ]


def get_llm_eval(prompt):
    """Query the chat-completion endpoint and return the first line of its reply.

    Rate-limit and timeout errors are retried, up to five failures in total,
    sleeping ``attenuator(failures)`` seconds after each one. Any other
    exception ends the attempts immediately.

    Args:
        prompt: Chat messages passed as ``messages`` to
            ``openai.ChatCompletion.create``.

    Returns:
        On success, the stripped reply truncated to its first line. Otherwise
        a string beginning with ``"FAILED"`` that describes the failure.
    """
    max_failures = 5
    result = None
    for failures in range(1, max_failures + 1):
        try:
            completion = openai.ChatCompletion.create(
                engine="gpt-35-turbo",
                temperature=0,
                messages=prompt,
                max_tokens=100,
            )
            reply_text = completion["choices"][0]["message"]["content"]
            first_line = reply_text.strip().split("\n")[0]
            return first_line
        except (openai.error.RateLimitError, openai.error.Timeout):
            # Transient failure: back off, then try again while attempts remain.
            result = "FAILED"
            wait_seconds = attenuator(failures)
            print(f"Retrying after {wait_seconds} seconds!")
            sleep(wait_seconds)
        except (
            openai.error.APIConnectionError,
            openai.error.APIError,
            openai.error.InvalidRequestError,
            TypeError,
        ) as err:
            # Treated as non-retryable: report and stop.
            result = f"FAILED with Unrecoverable Exception {err}."
            break
        except Exception as err:
            # Anything else is also reported without retrying.
            result = f"FAILED with generic exception {err}"
            break
    return result


def get_seed_queries(seed_file):
    """Read ``seed_file`` and return its contents split on newline characters.

    The file is decoded as UTF-8 explicitly so that non-ASCII seed text is
    read the same way on every platform instead of depending on the locale's
    default encoding.

    Args:
        seed_file: Path of the text file to read.

    Returns:
        A list of strings, one per ``"\\n"``-separated segment. A file ending
        with a newline therefore yields a trailing empty string.
    """
    with open(seed_file, "r", encoding="utf-8") as f:
        return f.read().split("\n")


def batched_response(
    task, queries, sleep_period, batch_size, temperature, response_logger_file
):
    """Evaluate ``queries`` batch by batch and collect the model responses.

    Every query is processed, including a final batch that is shorter than
    ``batch_size``.

    Args:
        task: Task key forwarded to ``construct_prompt``.
        queries: Sliceable sequence of query objects accepted by
            ``construct_prompt``.
        sleep_period: Seconds to sleep before each request.
        batch_size: Number of queries per batch; must be at least 1.
        temperature: Currently unused in this function.
        response_logger_file: Currently unused; the batched logging call is
            disabled.

    Returns:
        A list with one response per query, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    all_responses = []
    # Stepping over start offsets (instead of ``len(queries) // batch_size``
    # full batches) keeps the trailing partial batch from being dropped.
    batch_starts = range(0, len(queries), batch_size)
    for batch_idx, batch_start in enumerate(batch_starts):
        batch = queries[batch_start : batch_start + batch_size]
        batch_responses = []
        for query in tqdm.tqdm(batch):
            time.sleep(sleep_period)
            prompt = construct_prompt(query, task)
            batch_responses.append(get_llm_eval(prompt))
        # Logging of each batch is disabled:
        # run_batched_logger(task, batch, batch_responses, response_logger_file)
        all_responses.extend(batch_responses)
        print(f"Processed {batch_idx + 1} batch. Moving on to next!")
    return all_responses


if __name__ == "__main__":
    # Demo entry point: builds one hard-coded English article/summary pair,
    # asks the model to rate the summary, and prints the reply.

    # Command-line interface for batch mode (currently disabled):
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--root', type = str, default = '../results/llm_eval/')
    # parser.add_argument('--model', type = str, default = "gpt-35-turbo") # Add a deployment name here
    # parser.add_argument('--keypath', type = str, default = '../../openai_key.txt')
    # parser.add_argument('--seed_dataset_name', type=str, default = 'xlsum')
    # parser.add_argument('--sleep_period', type=float, default = 1)
    # parser.add_argument('--batch_size', type = int, default = 25)
    # parser.add_argument('--max_samples', type=int, default = 500)
    # parser.add_argument('--temperature', type=float, default = 0)
    # parser.add_argument('--queries_file', type=str, default = '../../data/EMNLP/explict_bias_seed.txt')
    # args = parser.parse_args()

    ## Initializing openai key and response logging file
    # with open(args.keypath, 'r') as file:
    #     openai.api_key = file.read().split('\n')[0] # In case there are any useless delimiters

    # Take the key from the OPENAI_API_KEY environment variable so the script
    # can authenticate without editing the source; falls back to the previous
    # empty-string value when the variable is not set.
    openai.api_key = os.environ.get("OPENAI_API_KEY", "")
    query = {}
    query["text"] = (
        """The Securities and Exchange Commission said it would "review the unusual trading activity" and "take appropriate steps to protect investors". There are rumours the drop may have been caused by an erroneous "fat finger" trade at a Wall Street bank. The term refers to when a trader enters data incorrectly. There was speculation that a Citigroup employee had been responsible for an erroneous trade - however, the bank said there was no evidence of this. "We, along with the rest of the financial industry, are investigating to find the source of today's market volatility," Citigroup spokesman Stephen Cohen said in a statement. "At this point, we have no evidence that Citi was involved in any erroneous transaction." US President Barack Obama said the authorities were evaluating the unusual activity and hoped to prevent it from happening again. "They will make findings of their review public along with recommendations for appropriate action," he told reporters on Friday. Algorithms Among those affected were 3M, whose shares fell 25t one point, and Procter & Gamble, which saw its stocks fall by 37%. There have been reports that a "fat finger" incident involved trade in Procter & Gamble shares, when a trader entered a "b" for billion instead of "m" for million. All share trading is processed electronically - with stocks changing hands in fractions of a second. One theory is that algorithms on which systems rely automatically triggered thousands of transactions to be executed after an error was made. The tumultuous session on Wall Street saw the Dow Jones index fall by almost 1,000 points in a matter of minutes. It later recovered, closing down 3.2%, or 347.8 points, at 10,520.32. The Nasdaq exchange also saw sharp drops and said it was also investigating what happened."""
    )
    query["summ"] = (
        """Stock market go down because of fat finger mistake, maybe bank do bad trade. People confused, stocks change hands fast. Wall Street and SEC looking into problem."""
    )
    query["lang"] = "en"

    # response_logger_file = f'{args.root}{args.seed_dataset_name}.txt'
    # Only referenced by the disabled batch-mode calls below.
    response_logger_file = "results.txt"
    prompt = construct_prompt(query, "xlsum")
    model_response = get_llm_eval(prompt)
    print(model_response)
    # batched_response('xlsum', query, 1, 1, 0,response_logger_file)

    # Batch mode over a queries file (currently disabled):
    # if args.queries_file:
    #     with open(args.queries_file, 'r') as file:
    #         queries = file.read().strip().split('\n')[:args.max_samples]
    #         batched_response(args.task, queries,  args.sleep_period, args.batch_size, args.temperature, response_logger_file)

Score: 3
