In [1]:
import os
import json
import time
import tiktoken
from openai import AzureOpenAI

In [2]:
def extract_infos(json_file_path) -> dict:
    """Load the connection settings stored in a JSON file.

    Args:
        json_file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document. ``init_client`` reads the keys
        ``azure_endpoint``, ``api_key`` and ``api_version`` from it.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``UnicodeDecodeError`` / ``json.JSONDecodeError``).
    """
    # JSON is UTF-8 by specification. Without an explicit encoding, open()
    # falls back to the platform locale (e.g. cp1252 on Windows), which can
    # silently mis-decode or reject non-ASCII characters.
    with open(json_file_path, 'r', encoding='utf-8') as file:
        return json.load(file)


def init_client(infos: dict):
    """Create an ``AzureOpenAI`` client from a settings dictionary.

    Args:
        infos: Mapping that must contain the keys ``azure_endpoint``,
            ``api_key`` and ``api_version``.

    Returns:
        The configured ``AzureOpenAI`` client instance.

    Raises:
        KeyError: If one of the required keys is missing from ``infos``.
    """
    return AzureOpenAI(
        azure_endpoint=infos['azure_endpoint'],
        api_key=infos['api_key'],
        api_version=infos['api_version'],
    )

In [3]:
def num_tokens_from_string(string: str) -> int:
    """Count the tokens of ``string`` under the tiktoken encoding for "gpt-4".

    Args:
        string: Text to tokenize.

    Returns:
        Length of the token sequence produced by the encoder.
    """
    token_ids = tiktoken.encoding_for_model("gpt-4").encode(string)
    return len(token_ids)


def message(role, content) -> dict:
    """Build one chat message as a ``{"role": ..., "content": ...}`` dictionary."""
    return dict(role=role, content=content)


def read_file(absolute_path):
    """Return the full text content of the file at ``absolute_path``.

    The file is opened in text mode with the platform's default encoding.
    """
    handle = open(absolute_path)
    try:
        text = handle.read()
    finally:
        # Always release the file handle, even if reading fails.
        handle.close()
    return text


def load_tables_from_json(json_file):
    """Load the extracted tables from a JSON file.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The decoded JSON document. ``run`` iterates it as a mapping from
        an article id to a sequence of table strings.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``UnicodeDecodeError`` / ``json.JSONDecodeError``).
    """
    # Decode explicitly as UTF-8 (the JSON standard encoding) instead of
    # relying on the platform locale, so non-ASCII table content is read
    # identically on every operating system.
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)


def build_messages(file_name, messages_file_path, html_table, output_prompts_folder):
    """Assemble the chat prompt for one table and save a copy of it to disk.

    The prompt is made of five messages, in this order: system_1, user_1,
    assistant, user_2 (with the table appended after a blank line) and
    system_2. Each message text is read from the file registered under the
    matching key of ``messages_file_path``.

    Args:
        file_name: Base name (without extension) of the saved prompt file.
        messages_file_path: Mapping with the keys ``system_1``, ``user_1``,
            ``assistant``, ``user_2`` and ``system_2``, each pointing to a
            text file.
        html_table: Table text appended to the second user message.
        output_prompts_folder: Folder that receives ``<file_name>.txt``;
            it is created when it does not exist yet.

    Returns:
        A ``(messages, input_tokens)`` tuple: the list of message
        dictionaries and the token count of the concatenated message
        contents. The count covers the contents only, not any per-message
        formatting the chat API may add.
    """
    content_system_1 = read_file(messages_file_path['system_1'])
    content_user_1 = read_file(messages_file_path['user_1'])
    content_assistant = read_file(messages_file_path['assistant'])
    content_user_2 = read_file(messages_file_path['user_2']) + '\n\n' + html_table
    content_system_2 = read_file(messages_file_path['system_2'])

    messages_dict = [
        message("system", content_system_1),
        message("user", content_user_1),
        message("assistant", content_assistant),
        message("user", content_user_2),
        message("system", content_system_2)
    ]

    # save prompt for replication purposes
    if output_prompts_folder:
        # A missing output folder used to abort the run with FileNotFoundError.
        os.makedirs(output_prompts_folder, exist_ok=True)
    prompt_path = os.path.join(output_prompts_folder, file_name + '.txt')
    # json.dump escapes non-ASCII characters by default, so the written file
    # is plain ASCII; the explicit encoding just removes the locale dependency.
    with open(prompt_path, "w", encoding="utf-8") as text_file:
        json.dump(messages_dict, text_file)
    print(f"\t Saved prompt at: {prompt_path}")

    # number of input tokens (message contents concatenated in prompt order)
    input_tokens = num_tokens_from_string(''.join(msg["content"] for msg in messages_dict))

    return messages_dict, input_tokens

In [4]:
def _delta_content(chunk):
    """Return the text carried by one decoded stream chunk, or None if it has none."""
    choices = chunk.get('choices') or []
    if not choices or choices[0] is None:
        return None
    delta = choices[0].get('delta') or {}
    # Role-only deltas carry no 'content' key, so look it up defensively.
    return delta.get('content')


def send_request(client, prompt: list, max_tokens = 6000):
    """Stream a chat completion and collect the generated text.

    The request targets the "gpt-4-32k" deployment with temperature 0. While
    the response streams in, every completed line of the answer is printed.

    Args:
        client: Object exposing
            ``chat.completions.with_streaming_response.create(...)``, e.g. the
            client returned by ``init_client``.
        prompt: List of chat message dictionaries sent as ``messages``.
        max_tokens: Upper bound on generated tokens. The default is 6000,
            the value that was previously hard-coded in the request (the
            parameter used to be ignored), so existing callers send exactly
            the same request as before.

    Returns:
        A tuple ``(answer, output_tokens, request_time, stream)``:
        ``answer`` is the concatenated generated text; ``output_tokens`` is
        the number of streamed chunks that carried text (an approximation of
        the token count); ``request_time`` is the elapsed wall-clock time in
        seconds; ``stream`` is the raw response, one received line per line,
        up to and including the ``[DONE]`` marker.

    Raises:
        json.JSONDecodeError: If a non-empty streamed line is not valid JSON.
    """
    sse_prefix = 'data: '
    start_time = time.time()

    with client.chat.completions.with_streaming_response.create(
        model="gpt-4-32k", # model = "deployment_name".
        max_tokens = max_tokens,
        temperature = 0,
        stream=True,
        messages = prompt,
    ) as response:
        answer_parts = []
        raw_lines = []
        pending_line = ''
        output_tokens = 0

        for line in response.iter_lines():
            raw_lines.append(line)
            if not line:
                continue

            # Strip only the leading event prefix; a blanket str.replace would
            # also delete "data: " occurring inside the generated text.
            payload = line[len(sse_prefix):] if line.startswith(sse_prefix) else line
            if payload == '[DONE]':
                break

            content = _delta_content(json.loads(payload))
            if not content:
                # Role announcements, empty deltas and chunks without choices
                # carry no text and must not be counted as output.
                continue

            output_tokens += 1
            answer_parts.append(content)
            pending_line += content
            if '\n' in content:
                print(pending_line)
                pending_line = ''

        # Show the tail of the answer when it does not end with a newline.
        if pending_line:
            print(pending_line)
    request_time = time.time() - start_time

    answer = ''.join(answer_parts)
    stream = ''.join(raw + '\n' for raw in raw_lines)

    return answer, output_tokens, request_time, stream

def save_answer_and_stats(answer, input_tokens, output_tokens, request_time, stream, file_name, output_answers_folder):
    """Write the model answer to ``<output_answers_folder>/<file_name>.txt``.

    Characters outside ASCII are dropped from the answer before writing.
    The output folder is created when it does not exist yet.

    Args:
        answer: Text returned by the model.
        input_tokens: Currently unused (kept for the disabled stats export).
        output_tokens: Currently unused (kept for the disabled stats export).
        request_time: Currently unused (kept for the disabled stats export).
        stream: Currently unused (kept for the disabled stats export).
        file_name: Base name (without extension) of the answer file.
        output_answers_folder: Folder that receives the answer file.

    Returns:
        None.
    """
    if output_answers_folder:
        # A missing output folder used to abort the run with FileNotFoundError.
        os.makedirs(output_answers_folder, exist_ok=True)
    answer_path = os.path.join(output_answers_folder, file_name + '.txt')

    # The text is reduced to ASCII first, so the bytes written are the same
    # under any ASCII-compatible encoding; being explicit avoids depending on
    # the platform locale.
    with open(answer_path, "w", encoding="utf-8") as text_file:
        text_file.write(answer.encode('ascii', 'ignore').decode())
    print(f"\t Saved answer at: {answer_path}")

    # NOTE: the statistics export is disabled. It used to append one row
    # (file_name, input_tokens, output_tokens, request_time, stream) to an
    # Excel sheet through pandas, and relied on names (pd, output_stats_folder,
    # stats_file) that are not defined in this notebook.

    return

def run(connection_data: dict, messages_file_paths: dict, articles_tables: dict, output_prompts_folder, output_answers_folder):
    """Send one request per table of every article and save each answer.

    For every table the prompt is built (and saved) with ``build_messages``,
    sent with ``send_request`` and the result stored with
    ``save_answer_and_stats``. Output files are named
    ``<article_id>_<zero-based table index>``.

    Args:
        connection_data: Settings passed to ``init_client``.
        messages_file_paths: Prompt message file paths passed to
            ``build_messages``.
        articles_tables: Mapping from an article id to its table strings.
        output_prompts_folder: Folder receiving the saved prompts.
        output_answers_folder: Folder receiving the saved answers.
    """
    azure_client = init_client(connection_data)

    for article_id, article_tables in articles_tables.items():
        # table_no is 1-based for the log line; file names stay 0-based.
        for table_no, raw_table in enumerate(article_tables, start=1):
            file_name = f"{article_id}_{table_no - 1}"

            # Drop every non-ASCII character from the table before prompting.
            ascii_table = raw_table.encode('ascii', 'ignore').decode()

            prompt, input_tokens = build_messages(
                file_name, messages_file_paths, ascii_table, output_prompts_folder
            )

            print(f"Sending request for: [{article_id} - T{table_no}]")
            result = send_request(azure_client, prompt)
            answer, output_tokens, request_time, stream = result

            save_answer_and_stats(
                answer,
                input_tokens,
                output_tokens,
                request_time,
                stream,
                file_name,
                output_answers_folder,
            )

In [5]:
# Connection settings (endpoint, API key, API version) and the tables to process.
connection_infos = extract_infos('private.json')
tables = load_tables_from_json('extracted_tables/ER.json')

# Each prompt message lives in its own text file under this folder.
msgs_base_path = 'messages/ER'

msgs_file_paths = dict(
    system_1=f'{msgs_base_path}/system_1.txt',
    system_2=f'{msgs_base_path}/system_2.txt',
    user_1=f'{msgs_base_path}/user_1.txt',
    user_2=f'{msgs_base_path}/user_2.txt',
    assistant=f'{msgs_base_path}/assistant.txt',
)

# Output folders for the saved model answers and the saved prompts.
answers_folder = 'answers/ER'
prompts_folder = 'prompts/ER'

In [6]:
# Settings of the (currently disabled) statistics export, kept for reference:
# output_stats_folder = 'stats/ER/'
# stats_file = 'stats_1.xlsx'

# Process every table: one request per table, prompts and answers saved to disk.
run(
    connection_data=connection_infos,
    messages_file_paths=msgs_file_paths,
    articles_tables=tables,
    output_prompts_folder=prompts_folder,
    output_answers_folder=answers_folder,
)

	 Saved prompt at: prompts/ER\1503.02427_0.txt
Sending request for: [1503.02427 - T1]
1. <{<Model, CosSim>}, P@1 (1v1), 0.554>

2. <{<Model, CosSim>}, P@1 (1v9), 0.377>

3. <{<Model, DeepMatch topic>}, P@1 (1v1), 0.701>

4. <{<Model, DeepMatch topic>}, P@1 (1v9), 0.330>

5. <{<Model, WordEmbed>}, P@1 (1v1), 0.774>

6. <{<Model, WordEmbed>}, P@1 (1v9), 0.370>

7. <{<Model, Translation>}, P@1 (1v1), 0.819>

8. <{<Model, Translation>}, P@1 (1v9), 0.586>

9. <{<Model, DeepMatch cnn>}, P@1 (1v1), 0.851>

10. <{<Model, DeepMatch cnn>}, P@1 (1v9), 0.496>

11. <{<Model, LR tree>}, P@1 (1v1), 0.853>

12. <{<Model, LR tree>}, P@1 (1v9), 0.652>

13. <{<Model, DeepMatch tree>}, P@1 (1v1), 0.889>

14. <{<Model, DeepMatch tree>}, P@1 (1v9), 0.708>

	 Saved answer at: answers/ER\1503.02427_0.txt
	 Saved prompt at: prompts/ER\1503.02427_1.txt
Sending request for: [1503.02427 - T2]
1. <{<Model, Baseline>}, P@1, 0.574>

2. <{<Model, +DeepMatchtopic>}, P@1, 0.587>

3. <{<Model, +WordEmbed>}, P@1, 0.579>


KeyboardInterrupt: 