This notebook contains the code to preprocess the data and to evaluate the GPT-3.5 and GPT-4 models on it.

In [1]:
import tiktoken
import json
from openai import OpenAI
import setup
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import re
import html

In [4]:
# OpenAI API client used by the measurement code below; the API key and the
# organization id are taken from the imported `setup` module.
client = OpenAI(
    api_key=setup.openai_api_key,
    organization=setup.openai_organization_id,
)

### Data Preparation

In [2]:
def extract_text(folder_name):
  """Collect the ``"text"`` field of every ``<folder_name>/<sub-folder>/text.json``.

  Sub-folders are visited in sorted name order, so the position of a text in
  the returned list is stable across runs.

  Args:
    folder_name: path of the directory that holds one sub-folder per document.

  Returns:
    A list with the ``"text"`` value of each document, in sorted folder order.

  Raises:
    FileNotFoundError: if a sub-folder has no ``text.json``.
    KeyError: if a decoded document has no ``"text"`` key.
  """
  text = []
  for folder in sorted(os.listdir(folder_name)):
    folder_path = os.path.join(folder_name, folder)
    # os.listdir also returns plain files (e.g. ``.DS_Store``); only
    # directories can contain a text.json, so anything else is skipped.
    if not os.path.isdir(folder_path):
      continue
    with open(os.path.join(folder_path, 'text.json'), encoding='utf-8') as file:
      file_content = json.load(file)
    # The files hold a JSON document that was itself serialised as a JSON
    # string, so a second decoding pass is needed to obtain the dict.
    # A file that already decodes to a dict is accepted as is.
    if isinstance(file_content, str):
      file_content = json.loads(file_content)
    text.append(file_content["text"])
  return text

def clean_text(text):
  """Strip wiki markup residue and boilerplate from every document in ``text``.

  The steps run in a fixed order for each document:
    1. remove lines carrying image markup (``|thumb``, ``thumb|``,
       ``|thumbnail``, ``thumbnail|``),
    2. drop all double quotes,
    3. unescape HTML entities and discard non-ASCII characters,
    4. remove ``Accessed ... .`` fragments, collapse double spaces (a single
       pass), and remove URLs,
    5. cut everything from the first ``External Links`` / ``References`` on,
    6. remove two specific leftover image captions.

  Args:
    text: iterable of raw document strings.

  Returns:
    A new list with one cleaned string per input document.
  """
  # Raw strings are used for all patterns so that ``\|`` reaches the regex
  # engine without being an invalid Python string escape.
  image_markers = (r'\|thumb', r'thumb\|', r'\|thumbnail', r'thumbnail\|')
  vavilov_caption = '\nthumb |upright=1.35 |Centres of origin, as numbered by Nikolai Vavilov in the 1930s. Area 3 (gray) is no longer recognised as a centre of origin, and Papua New Guinea (area P, orange) was identified more recently.\n'
  acetic_caption = '\n375px|Deprotonation equilibrium of acetic acid in water'

  text_prepros = []
  for ex in text:
    new = ex
    for marker in image_markers:
      # Marker after some text on its line: removes that line and, because of
      # the trailing ``\n?(.*)?``, also the line that follows it.
      new = re.sub(r'\n.+' + marker + r'.*\n?(.*)?', '', new)
      # Marker possibly at the very start of its line: removes the line
      # together with its terminating newline.
      new = re.sub(r'\n?(.*)?' + marker + r'.*\n', '', new)
    new = new.replace('"', '')
    new = html.unescape(new)
    new = new.encode('ascii', 'ignore').decode()
    new = re.sub(r'Accessed.*?\.', '', new)
    # Single pass only, and before URL removal: longer runs of spaces and gaps
    # left by removed URLs are intentionally not collapsed further.
    new = new.replace('  ', ' ')
    new = re.sub(r'(http|www)\S+', '', new)
    # DOTALL: everything from the first occurrence to the end of the document
    # is dropped.
    new = re.sub(r'External Links.*', '', new, flags=re.DOTALL)
    new = re.sub(r'References.*', '', new, flags=re.DOTALL)
    # str.replace is a no-op when the caption is absent, so no membership
    # check is needed.
    new = new.replace(vavilov_caption, '')
    new = new.replace(acetic_caption, '')
    text_prepros.append(new)
  return text_prepros

In [3]:
# Read the raw documents from the data folder ...
text = extract_text(folder_name="wikipedia_data_v3")
# ... and build the cleaned copies that the measurements below tokenize.
text_clean = clean_text(text=text)

### Measurement Functions

In [5]:
def tokenize(enc, text_clean):
    """Encode every document of ``text_clean`` with ``enc``.

    Args:
        enc: object exposing ``encode(str)``; its return value is stored as is.
        text_clean: iterable of document strings.

    Returns:
        A list with one ``enc.encode`` result per document, in input order.
    """
    return [enc.encode(document) for document in text_clean]

In [6]:
def measure_accuracy(example, filepath1_context, filepath2_topp, filepath3_predictions, enc, model_name, context_lengths=None):
    """Measure top-1 .. top-5 next-token accuracy of ``model_name`` on one document.

    For every context length ``c`` the first ``c`` tokens of ``example`` are
    decoded and sent as a single user message to the chat completions endpoint
    of the module-level ``client`` (temperature 0, one output token, five top
    log-probabilities). The returned candidates are compared with the decoded
    token at position ``c``; a candidate counts as correct if it equals that
    string either verbatim or with surrounding whitespace stripped.

    Three JSON files are (re)written after every context length so that
    partial results survive an interrupted run. Each maps the context length
    to:
      * ``filepath1_context``: context, correct token/string, top prediction
        and its 0/1 accuracy,
      * ``filepath2_topp``: the ranked candidates, their log-probabilities and
        the cumulative 0/1 top-k accuracy,
      * ``filepath3_predictions``: candidates and log-probabilities in the
        order returned by the API (the ``"probabilities"`` key holds
        log-probabilities).

    Args:
        example: sequence of token ids of one document.
        filepath1_context, filepath2_topp, filepath3_predictions: output paths;
            missing parent directories are created.
        enc: tokenizer exposing ``decode(list_of_token_ids)``.
        model_name: model identifier passed to the API.
        context_lengths: optional iterable of context lengths to evaluate;
            defaults to the grid 1..10, 15..50, 60..100, 150..1000.

    Returns:
        A 5-tuple of floats: the fraction of evaluated context lengths for
        which the correct token was among the top 1, 2, 3, 4 and 5 candidates.
        Context lengths that leave no target token (``c >= len(example)``) are
        skipped with a warning and are not part of the denominator; if none
        can be evaluated all five values are ``0.0``.
    """
    import warnings

    if context_lengths is None:
        context_lengths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]
    context_lengths = list(context_lengths)

    # A context of length c needs token c as prediction target. Filtering up
    # front avoids an IndexError in the middle of a run (after API calls have
    # already been paid for).
    usable_lengths = [c for c in context_lengths if c < len(example)]
    if len(usable_lengths) < len(context_lengths):
        warnings.warn(
            "example has only " + str(len(example)) + " tokens; evaluating "
            + str(len(usable_lengths)) + " of " + str(len(context_lengths))
            + " context lengths"
        )

    output_paths = (filepath1_context, filepath2_topp, filepath3_predictions)
    for path in output_paths:
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Results are accumulated in memory and dumped after every context length;
    # this yields the same files as re-reading and re-writing them each time.
    context_log, topk_log, prediction_log = {}, {}, {}
    logs = (context_log, topk_log, prediction_log)

    def _flush():
        # Rewrite all three result files with the current state.
        for path, log in zip(output_paths, logs):
            with open(path, 'w') as file:
                json.dump(log, file)

    # Start from empty result files so stale data from a previous run is gone.
    _flush()

    max_k = 5
    hits_at_k = [0] * max_k  # hits_at_k[k-1]: contexts with the target in the top k

    for c in usable_lengths:
        context = enc.decode(example[0:c])
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "user", "content": context}
            ],
            temperature=0,
            max_tokens=1,
            logprobs=True,
            top_logprobs=5
        )
        correct_string = enc.decode([example[c]])
        stripped_correct_string = correct_string.strip()

        predictions = response.choices[0].logprobs.content[0].top_logprobs
        probs = [candidate.logprob for candidate in predictions]
        top_tokens = [candidate.token for candidate in predictions]
        # Stored in API order, i.e. before any re-ranking below.
        prediction_log[c] = {"tokens": top_tokens, "probabilities": probs}

        # Rank candidates by descending log-probability if the API did not
        # already return them that way (rebinding leaves the lists stored in
        # prediction_log untouched).
        if probs != sorted(probs, reverse=True):
            ranked = sorted(zip(probs, top_tokens), reverse=True)
            probs, top_tokens = zip(*ranked)

        top_p_predictions = {}
        for rank in range(1, len(top_tokens) + 1):
            candidates = top_tokens[0:rank]
            hit = int(stripped_correct_string in candidates or correct_string in candidates)
            top_p_predictions['token_top_' + str(rank)] = top_tokens[rank - 1]
            top_p_predictions['prob_top_' + str(rank)] = probs[rank - 1]
            top_p_predictions['accuracy_top_' + str(rank)] = hit
            if hit and rank <= max_k:
                hits_at_k[rank - 1] += 1
        top_p_predictions['token_correct_stripped'] = stripped_correct_string
        top_p_predictions['token_correct_not_stripped'] = correct_string
        topk_log[c] = top_p_predictions

        gpt_output = top_tokens[0]
        accuracy_string = 0
        if stripped_correct_string == gpt_output or correct_string == gpt_output:
            accuracy_string = 1

        context_log[c] = {
            "context": context,
            "correct_string_not_stripped": correct_string,
            "correct_token_not_stripped": example[c],
            "correct_string_stripped": stripped_correct_string,
            "gpt_predicted_string": gpt_output,
            "accuracy_string": accuracy_string,
        }

        # save results
        _flush()

    evaluated = len(usable_lengths)
    if evaluated == 0:
        return 0.0, 0.0, 0.0, 0.0, 0.0
    return tuple(hits / evaluated for hits in hits_at_k)

### Measurements

In [7]:
# select which model to use
model_name = "gpt-3.5-turbo-0125"
# model_name = "gpt-4"
# model_name = "gpt-4-0125-preview"

# The tokenizer is looked up from the selected model name, so the encoding
# used to build contexts always matches the model that is queried.
enc = tiktoken.encoding_for_model(model_name)

In [8]:
# One zero-filled vector of 100 entries per top-k level (p1 .. p5); each gets
# its own tensor object.
(
    difficulty_tensor_p1,
    difficulty_tensor_p2,
    difficulty_tensor_p3,
    difficulty_tensor_p4,
    difficulty_tensor_p5,
) = (torch.zeros(100) for _ in range(5))

# change output filenames according to model
for _path, _tensor in (
    ('gpt3-5_100ex_p1_diff.pt', difficulty_tensor_p1),
    ('gpt3-5_100ex_p2_diff.pt', difficulty_tensor_p2),
    ('gpt3-5_100ex_p3_diff.pt', difficulty_tensor_p3),
    ('gpt3-5_100ex_p4_diff.pt', difficulty_tensor_p4),
    ('gpt3-5_100ex_p5_diff.pt', difficulty_tensor_p5),
):
    torch.save(_tensor, _path)

In [9]:
# run the accuracy measurements on all examples
# change filenames according to model
num_examples = 100
tensor_paths = [
    'gpt3-5_100ex_p1_diff.pt',
    'gpt3-5_100ex_p2_diff.pt',
    'gpt3-5_100ex_p3_diff.pt',
    'gpt3-5_100ex_p4_diff.pt',
    'gpt3-5_100ex_p5_diff.pt',
]
(
    difficulty_tensor_p1,
    difficulty_tensor_p2,
    difficulty_tensor_p3,
    difficulty_tensor_p4,
    difficulty_tensor_p5,
) = [torch.load(path) for path in tensor_paths]
difficulty_tensors = [
    difficulty_tensor_p1,
    difficulty_tensor_p2,
    difficulty_tensor_p3,
    difficulty_tensor_p4,
    difficulty_tensor_p5,
]

# The per-example JSON files are written into these folders; create them so
# the cell also works on a fresh checkout.
for out_dir in ('output_context_gpt35', 'output_accuracies_gpt35', 'output_predictions_gpt35'):
    os.makedirs(out_dir, exist_ok=True)

tokens = tokenize(enc, text_clean)
# Fail before any API call is made instead of with an IndexError part-way
# through the run.
assert len(tokens) >= num_examples, (
    "expected at least " + str(num_examples) + " documents, got " + str(len(tokens))
)

for i in range(num_examples):
    ex = tokens[i]
    # change filenames according to model
    accuracies = measure_accuracy(
        ex,
        'output_context_gpt35/gpt35context_ex' + str(i) + '.json',
        'output_accuracies_gpt35/gpt35topp_ex' + str(i) + '.json',
        'output_predictions_gpt35/gpt35pred_ex' + str(i) + '.json',
        enc,
        model_name,
    )
    # accuracies holds the top-1 .. top-5 accuracy of example i, in that order.
    for tensor, accuracy, path in zip(difficulty_tensors, accuracies, tensor_paths):
        tensor[i] = accuracy
        # Checkpoint after every example so an interrupted run (e.g. an API
        # error) keeps the accuracies computed so far.
        torch.save(tensor, path)