In [1]:
from openai import OpenAI
import json

# Shared API client; generate_score() calls client.chat.completions.create on it.
# NOTE(review): no credentials are passed here - presumably the library picks
# them up from the environment (e.g. OPENAI_API_KEY); confirm before running.
client = OpenAI()

In [2]:
# Ask the chat model to grade one question / response / ground-truth bundle
def generate_score(text, model="gpt-4-1106-preview", temperature=0.5, max_tokens=800, api_client=None):
    """Grade a physics answer with a chat model and return its raw reply.

    Args:
        text: User message sent to the grader. The system prompt tells the
            model it will receive a question, a response and a ground truth,
            so ``text`` is expected to carry all three.
        model: Model name forwarded to the chat-completions call.
        temperature: Sampling temperature forwarded to the call.
        max_tokens: Generation limit forwarded to the call.
        api_client: Any object exposing ``chat.completions.create(**kwargs)``.
            When ``None`` (the default) the module-level ``client`` is used,
            which keeps existing ``generate_score(text)`` calls unchanged.

    Returns:
        The ``content`` of the first choice's message, unparsed. A JSON object
        is requested through ``response_format``; callers still have to
        ``json.loads`` the result and validate the fields themselves.
    """
    # Resolve the client lazily so a caller-supplied client never touches the global.
    active_client = client if api_client is None else api_client
    response = active_client.chat.completions.create(
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        response_format={"type": "json_object"},
        messages=[
            # The prompt text below is sent to the model verbatim (including its
            # embedded line breaks and indentation), so it is left untouched.
            {"role": "system", "content": """You are a grader grading physics questions. You will be given a question, a response, 
            and a ground truth for reference. You should grade the response according to the ground truth, and give your scores in
            json format. Make sure you include three fields 'Correctness','Logic', and'Truthfulness' in your output json. Each of these
            fields must contain a numerical value between 1 to 5, in which 5 is best and 1 is worst. Please provide explanation and justification
            in a seperate 'Comment' field."""},
            {"role": "user", "content": text},
        ],
    )
    return response.choices[0].message.content

In [3]:
# Read the annotated dataset and average the human scores per unique text
with open("test_data.json", "r") as f:
    data = json.load(f)["data"]

# text -> {"cumulative_scores": [correctness, logic, truthfulness], "count": n}
scores_dict = {}
avg_dataset = {"text": [], "score": []}

for entry in data:
    text = entry["displayed_text"]
    raw_values = (
        entry["correctness_score"],
        entry["logic_score"],
        entry["truthfulness_score"],
    )
    # A missing (None) annotation is counted as 0.0
    scores = [float(v) if v is not None else 0.0 for v in raw_values]

    bucket = scores_dict.get(text)
    if bucket is None:
        # First annotation seen for this text
        scores_dict[text] = {"cumulative_scores": scores, "count": 1}
        continue

    # Repeat annotation: add element-wise to the running totals
    bucket["cumulative_scores"] = [
        running + new for running, new in zip(bucket["cumulative_scores"], scores)
    ]
    bucket["count"] += 1

# Turn the running totals into per-text averages, keeping first-seen order
for text, values in scores_dict.items():
    n_annotations = values["count"]
    avg_dataset["text"].append(text)
    avg_dataset["score"].append(
        [total / n_annotations for total in values["cumulative_scores"]]
    )

# avg_dataset now holds each unique text alongside its averaged score triple
print(avg_dataset)


{'text': ['Question:What is the significance of the photoelectric effect in quantum mechanics? <br><br> <strong>Generated Answer:</strong>The photoelectric effect holds great significance in quantum mechanics due to several reasons: 1. **Discovery of Photons**: The Photoelectric effect was key in establishing the particle nature of light. Einstein explained the photoelectric effect by introducing the concept of "light quanta" or "photons". He postulated that light is composed of packets of energy, now known as photons, having energy (E) proportional to the frequency (ν) of light, expressed by the formula: \\[ E=h\\nu \\] where \\(h\\) is Planck\'s constant. 2. **Justification of Quantum Theory**: Classical physics, which assumed light energy was spread evenly over the wave front, failed to explain the photoelectric effect. Quantum mechanics, with its inherent discontinuity due to the particle-like behavior of light, resolved the issue. 3. **Energy Quantization**: The photoelectric effe

In [4]:
# Grade every unique text with the model and accumulate, per criterion, the
# squared error between the model's score and the averaged human score.
# Note: c_mse / l_mse / t_mse hold *sums* of squared errors at this point;
# they only become means once divided by the number of items.
responses = []
failed_indices = []  # positions in avg_dataset whose grading raised
scored_count = 0     # items that contributed to the three sums
c_mse = 0.0
l_mse = 0.0
t_mse = 0.0
for index, text in enumerate(avg_dataset["text"]):
    try:
        response = generate_score(text)
        # Stored before parsing, so replies that are not valid JSON are still kept.
        responses.append(response)
        json_r = json.loads(response)
        correctness = float(json_r["Correctness"])
        logic = float(json_r["Logic"])
        truthfulness = float(json_r["Truthfulness"])
        human_scores = avg_dataset["score"][index]
        c_mse += (correctness - human_scores[0]) ** 2
        l_mse += (logic - human_scores[1]) ** 2
        t_mse += (truthfulness - human_scores[2]) ** 2
        scored_count += 1
    except Exception as e:
        # Deliberately best-effort: one bad reply must not abort the whole run.
        # The index is recorded so the failure stays visible after the loop.
        failed_indices.append(index)
        print("An error occured")
        print(e)
        print(index)

if failed_indices:
    # Failed items add nothing to the sums; any mean computed over the full
    # dataset size would therefore understate the error.
    print(f"Scored {scored_count} of {len(avg_dataset['text'])} texts; failed indices: {failed_indices}")

In [7]:
# Persist the raw model replies, then report the mean squared error per criterion
with open("gpt_4_response", 'w') as file:
    json.dump(responses, file)

# Derive the denominator from the data instead of hard-coding the dataset size.
n_texts = len(avg_dataset["text"])
print(n_texts)


def _mean_or_nan(total, count):
    """Return total / count, or NaN when there is nothing to average."""
    return total / count if count else float("nan")


print("The correctness MSE of GPT-4:")
print(_mean_or_nan(c_mse, n_texts))
print("The logic MSE of GPT-4:")
print(_mean_or_nan(l_mse, n_texts))
print("The truthfulness MSE of GPT-4:")
print(_mean_or_nan(t_mse, n_texts))

32
The correctness MSE of GPT-4:
0.4392361111111112
The logic MSE of GPT-4:
0.9505208333333334
The truthfulness MSE of GPT-4:
0.7977430555555556
