In [1]:
import json
import random
import pandas as pd
import openai
import time
import os
from random import shuffle
openai.api_key = ""
questions_file_path = 'All-Questions.xlsx'

# Encode ID

In [11]:
question_banks = {
    "mike_questions": 'Mike-Questions.xlsx',
    "abram_questions": 'Abram-Questions.xlsx',
    "haoran_questions": 'Haoran-Questions.xlsx',
    "anurag_questions": 'Anurag-Questions.xlsx',
    "siyu_questions": 'Siyu-Questions.xlsx',
    "ziwei_questions": 'Ziwei-Questions.xlsx',
    # "ryan_questions": 'Ryan-Questions.xlsx',
    # "other_questions": 'Other-Questions.xlsx',
}

########### each question id is encoded by the following format: ##########
# 0 place:   difficulty (easy, medium, hard)
# 1 place:   type (known, paraphrase, original)
# 2 place:   context (no context, relevant, vague, irrelevant)
# 3 place:   author
# 4-6 place: question number
###########################################################################

difficulty_encoder = {
    "easy": '1',
    "Easy": '1',
    "medium": '2',
    "Medium": '2',
    "hard": '3',
    "Hard": '3',
}

type_encoder = {
    "Known": '0',
    "Copied": '0',
    "Paraphrase": '1',
    "original": '2',
    "Original": '2',
}

context_encoder = {
    "No Context": '0',
    "Relevant": '1',
    "Vague": '2',
    "Irrelevant": '3',
}

author_encoder = {
    "mike_questions": '0',
    "abram_questions": '1',
    "haoran_questions": '2',
    "anurag_questions": '3',
    "siyu_questions": '4',
    "ziwei_questions": '5',
    "ryan_questions": '6',
    "other_questions": '7',
}

# Merge all questions

In [12]:
def merge_all_questions(questions_banks):
    all_questions = pd.DataFrame()
    for bank in question_banks:
        bank_df = pd.read_excel(question_banks[bank])
        if 'Number' in bank_df.columns:
            bank_df.drop(columns=['Number'], inplace=True)
        for idx in range(len(bank_df)):
            difficulty = bank_df.at[idx, 'Difficulty']
            question_type = bank_df.at[idx, 'Type']
            prefix = difficulty_encoder[difficulty]+type_encoder[question_type]+'0'+author_encoder[bank]
            bank_df.at[idx, 'ID'] = str(prefix + format(idx, '03d'))
        # Move the ID column to the first position
        cols = ['ID'] + [col for col in bank_df.columns if col != 'ID']
        bank_df = bank_df[cols]
        all_questions = pd.concat([all_questions, bank_df])
    all_questions = all_questions.reset_index(drop=True)
    all_questions.to_excel(questions_file_path, index=False)

merge_all_questions(question_banks)

  bank_df.at[idx, 'ID'] = str(prefix + format(idx, '03d'))
  bank_df.at[idx, 'ID'] = str(prefix + format(idx, '03d'))
  bank_df.at[idx, 'ID'] = str(prefix + format(idx, '03d'))
  bank_df.at[idx, 'ID'] = str(prefix + format(idx, '03d'))
  bank_df.at[idx, 'ID'] = str(prefix + format(idx, '03d'))
  bank_df.at[idx, 'ID'] = str(prefix + format(idx, '03d'))


# Excel to JSON

In [13]:
def convert_excel_to_json(file_path):
    json_file_path = 'Question_bank.json'
    df = pd.read_excel(file_path)
    df.rename(columns={'Question':'question','Ground Truth':'answer','Relevant Context':'relevant','Vague Context':'vague','Irrelevant Context':'irrelevant'}, inplace=True)
    
    # take only questions, ground truth, and context
    df = df[['ID','question','answer','relevant','vague','irrelevant']]

    # save to json
    data = df.to_json(orient='records')
    df.to_json(json_file_path,orient='records')
    return json.loads(data)

extracted_data = convert_excel_to_json(questions_file_path)
shuffle(extracted_data)

# Query GPT for Answering Questions

In [128]:
def reformat_generate_answer(id, question,generated_answer,ground_truth):
    text = f"Question:{question}\n\n"
    text += f"<br><br>\n<strong>Generated Answer: </strong>{generated_answer}\n\n"
    text += f"<br><br>\n<strong>Ground Truth: </strong>{ground_truth}\n\n"
    text += f"<br>\n<hr size='2'>\n"
    text += f"<strong>How good is the generated answer?</strong>"
    answer = {
        'id': id,
        'question': 'How good is the generated answer?',
        'text':text
    }
    return answer

def generate_answer(extracted_data,json_file_path):
    generated_answers = []
    raw_generated_answers = []
    for idx,item in enumerate(extracted_data):
        messages = []
        messages.append({"role": "system", "content": """
                        You are QuantumGPT, a tool that answers quantum related questions. Please answer the question and write equations in latex.
                        """})
        # *************No context**********
        message_nh=messages
        message_nh.append({"role": "user", "content": item['question']})
        response = openai.chat.completions.create(
                    model="gpt-4-1106-preview",
                    temperature=1.0,
                    max_tokens=1000,
                    messages=message_nh,
                )
        response_no_context=response.choices[0].message.content
        id_no_context = id
        answer_no_context = reformat_generate_answer(id_no_context,item['question'],response_no_context,item['answer'])
        generated_answers.append(answer_no_context)
        # print('No context generated')
        time.sleep(60)

        # *******Relevant context*****
        message_h1=messages
        message_h1.append({"role": "assistant", "content": item['relevant']})
        message_h1.append({"role": "user", "content": item['question']})
        response = openai.chat.completions.create(
                    model="gpt-4-1106-preview",
                    temperature=1.0,
                    max_tokens=1000,
                    messages=message_h1,
                )
        response_relevant =response.choices[0].message.content
        id_relevant = list(id)
        id_relevant[2] = context_encoder['Relevant']
        id_relevant = ''.join(id_relevant)
        answer_relevant = reformat_generate_answer(id_relevant,item['question'],response_relevant,item['answer'])
        generated_answers.append(answer_relevant)
        # print('Relevant generated')
        time.sleep(60)

        # *******Vague context********
        message_h2=messages
        message_h2.append({"role": "assistant", "content": item['vague']})
        message_h2.append({"role": "user", "content": item['question']})
        response = openai.chat.completions.create(
                    model="gpt-4-1106-preview",
                    temperature=1.0,
                    max_tokens=1000,
                    messages=message_h2,
                )
        response_vague=response.choices[0].message.content
        id_vague = list(id)
        id_vague[2] = context_encoder['Vague']
        id_vague = ''.join(id_vague)
        answer_vague = reformat_generate_answer(id_vague,item['question'],response_vague,item['answer'])
        generated_answers.append(answer_vague)
        # print('Vague generated')
        time.sleep(60)

        # **********Irrelevant context********
        message_h3=messages
        message_h3.append({"role": "assistant", "content": item['irrelevant']})
        message_h3.append({"role": "user", "content": item['question']})
        response = openai.chat.completions.create(
                    model="gpt-4-1106-preview",
                    temperature=1.0,
                    max_tokens=1000,
                    messages=message_h3,
                )
        response_irrelevant=response.choices[0].message.content
        id_irrelevant = list(id)
        id_irrelevant[2] = context_encoder['Irrelevant']
        id_irrelevant = ''.join(id_irrelevant)
        answer_irrelevant = reformat_generate_answer(id_irrelevant,item['question'],response_irrelevant,item['answer'])
        generated_answers.append(answer_irrelevant)
        # print('Irrelevant generated')
        time.sleep(60)

        answer = {
            'question': item['question'],
            'no_context': response_no_context,
            'relevant': response_relevant,
            'vague': response_vague,
            'irrelevant': response_irrelevant,
            'ground_truth': item['answer']
        }
        raw_generated_answers.append(answer)

        with open(json_file_path, 'w') as f:
            json.dump(generated_answers, f)
        with open('Raw_Generated_answer_bank.json', 'w') as f:
            json.dump(raw_generated_answers, f)
    return generated_answers

In [129]:
json_file_path = 'Generated_answer_bank.json'
generated_answers = generate_answer(extracted_data,json_file_path)

0000003
No context generated
Relevant generated
Vague generated
Irrelevant generated
2204005
No context generated
Relevant generated
Vague generated
Irrelevant generated
1000000
No context generated
Relevant generated
Vague generated
Irrelevant generated
0005004
No context generated
Relevant generated
Vague generated
Irrelevant generated


In [14]:
generate_bank = pd.read_json('Generated_answer_bank.json')

In [15]:
generate_bank.to_excel('Generated_answer_bank.xlsx', index=False)

In [7]:
# for idx in range(len(generate_bank)):
#     id_int = generate_bank.at[idx, 'id']
#     id_str = str(id_int)
#     id_int += 1000000
#     generate_bank.at[idx, 'id'] = id_int

In [10]:
# generate_bank.to_json('Corrected_Generated_answer_bank.json',orient='records')