In [26]:
import tiktoken
import json
from openai import OpenAI
import setup
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import re

In [27]:
# Shared OpenAI API client; the key and organisation id are read from the
# local `setup` module so that no credential is written in the notebook.
client = OpenAI(
    api_key=setup.openai_api_key,
    organization=setup.openai_organization_id,
)

### Data Preparation

In [67]:
def extract_text(folder_name):
    """Collect the ``"text"`` field of every article stored under ``folder_name``.

    Every entry of ``folder_name`` is treated as an article folder that holds
    a ``text.json`` file. The dataset stores that file double-encoded (a JSON
    string whose content is itself a JSON object); a plain JSON object is
    accepted as well.

    Parameters
    ----------
    folder_name : str
        Directory containing one sub-folder per article.

    Returns
    -------
    list of str
        One text per sub-folder, in the order returned by ``os.listdir``.
        That order is deliberately left untouched because ``preprocess_text``
        locates articles by their position in the same listing.

    Raises
    ------
    OSError
        If an entry has no readable ``text.json``.
    KeyError
        If a decoded record has no ``"text"`` key.
    """
    text = []
    for folder in os.listdir(folder_name):
        path = os.path.join(folder_name, folder, 'text.json')
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(path, encoding='utf-8') as file:
            record = json.load(file)
        # First decoding pass yields a string when the file is double-encoded;
        # decode once more to obtain the actual dict.
        if isinstance(record, str):
            record = json.loads(record)
        text.append(record["text"])
    return text

# Wiki image markup ("...|thumb...", "thumb|...", and the "thumbnail"
# variants) together with the neighbouring caption line. Raw strings keep the
# regex escape `\|` from being parsed as an (invalid) Python string escape.
# The patterns are applied in this exact order.
_THUMB_PATTERNS = (
    r'\n.+\|thumb.*\n?(.*)?',
    r'\n?(.*)?\|thumb.*\n',
    r'\n.+thumb\|.*\n?(.*)?',
    r'\n?(.*)?thumb\|.*\n',
    r'\n.+\|thumbnail.*\n?(.*)?',
    r'\n?(.*)?\|thumbnail.*\n',
    r'\n.+thumbnail\|.*\n?(.*)?',
    r'\n?(.*)?thumbnail\|.*\n',
)

# Caption in the "Agriculture" article that the generic patterns do not match
# (it is written "thumb |" with a space before the pipe).
_AGRICULTURE_CAPTION = '\nthumb |upright=1.35 |Centres of origin, as numbered by Nikolai Vavilov in the 1930s. Area 3 (gray) is no longer recognised as a centre of origin, and Papua New Guinea (area P, orange) was identified more recently.\n'

# Articles removed from the cleaned data set.
_EXCLUDED_ARTICLES = ("AT&T_Plaza", "Elgato", "Engineer_boot")


def preprocess_text(text, folder_name="../../input_data/wikipedia_data_v3"):
    """Strip thumbnail markup from the articles and drop the excluded ones.

    Parameters
    ----------
    text : list of str
        Raw article texts, positionally aligned with ``os.listdir(folder_name)``
        (which is what ``extract_text`` produces for the same folder).
    folder_name : str, optional
        Dataset directory whose listing maps positions to article names.
        Defaults to the path the notebook has always used.

    Returns
    -------
    list of str
        A new list with the cleaned texts, without the entries for
        "AT&T_Plaza", "Elgato" and "Engineer_boot". ``text`` is not modified.

    Raises
    ------
    ValueError
        If "Agriculture" or one of the excluded articles is not present in
        the folder listing.
    """
    cleaned = []
    for ex in text:
        for pattern in _THUMB_PATTERNS:
            ex = re.sub(pattern, '', ex)
        # Leading spaces first, then leading newlines (two separate passes).
        cleaned.append(ex.lstrip(' ').lstrip('\n'))

    folder_names = os.listdir(folder_name)

    ind = folder_names.index("Agriculture")
    cleaned[ind] = cleaned[ind].replace(_AGRICULTURE_CAPTION, '')

    # os.listdir returns entries in arbitrary order, so the positions of the
    # excluded articles cannot be assumed to be increasing. Resolve all of
    # them against the untouched listing and delete from the back, which
    # keeps the remaining indices valid.
    drop_indices = sorted(
        (folder_names.index(name) for name in _EXCLUDED_ARTICLES),
        reverse=True,
    )
    for idx in drop_indices:
        del cleaned[idx]
    return cleaned

In [5]:
# Read the raw article texts from the dataset folder, then remove thumbnail
# markup and the excluded articles.
text = extract_text(
    "../../input_data/wikipedia_data_v3"
)
text_clean = preprocess_text(text)

### Measurement Functions

In [6]:
# Tokenizer associated with the chat model; used to encode the articles and,
# inside measure_accuracy, to decode token slices back into strings.
enc = tiktoken.encoding_for_model(
    "gpt-3.5-turbo"
)
def tokenize(enc, text_clean):
    """Encode every article with the given tokenizer.

    Parameters
    ----------
    enc : object
        Tokenizer exposing ``encode(str)``.
    text_clean : iterable of str
        Cleaned article texts.

    Returns
    -------
    list
        ``enc.encode(article)`` for each article, in input order.
    """
    return [enc.encode(article) for article in text_clean]

In [44]:
# Context sizes (in tokens) probed for every article.
_DEFAULT_CONTEXT_LENGTHS = (
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 50, 75, 100, 150, 200, 250, 300,
    350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000,
)


def _write_json(path, payload):
    """Overwrite ``path`` with ``payload`` serialised as JSON."""
    with open(path, 'w') as file:
        json.dump(payload, file)


def _score_top_predictions(predictions, correct_string):
    """Rank the candidate tokens and record the top-k hits for one target.

    Returns ``(record, top_words)`` where ``top_words`` lists the candidate
    tokens by decreasing logprob and ``record`` holds, for every rank k,
    ``token_top_k``, ``prob_top_k`` and ``accuracy_top_k`` (1 when the target,
    with or without surrounding whitespace, is among the first k candidates).
    """
    stripped = correct_string.strip()
    # Sort the candidates themselves instead of a dict keyed by logprob, so
    # that candidates with identical logprobs are all kept. The sort is
    # stable: ties keep the order in which the API returned them.
    ranked = sorted(predictions, key=lambda pred: pred.logprob, reverse=True)
    top_words = [pred.token for pred in ranked]

    record = {}
    for rank, pred in enumerate(ranked, start=1):
        seen = top_words[:rank]
        record['token_top_' + str(rank)] = pred.token
        record['prob_top_' + str(rank)] = pred.logprob
        record['accuracy_top_' + str(rank)] = (
            1 if (stripped in seen or correct_string in seen) else 0
        )
    record['token_correct_stripped'] = stripped
    record['token_correct_not_stripped'] = correct_string
    return record, top_words


def measure_accuracy(example, filepath1_context, filepath2_topp,
                     context_lengths=None, encoder=None, api_client=None):
    """Measure next-token prediction accuracy on one tokenized article.

    For each context length ``c`` the first ``c`` tokens are decoded and sent
    as a single user message; the model's top-5 candidates for the next token
    are compared with the true token ``example[c]``.

    Parameters
    ----------
    example : sequence of int
        Token ids of one article.
    filepath1_context : str
        JSON file receiving, per context length, the context, the true token
        and the top-1 prediction.
    filepath2_topp : str
        JSON file receiving, per context length, the ranked candidates with
        their logprobs and top-k accuracy flags.
    context_lengths : iterable of int, optional
        Context sizes to probe; defaults to the notebook's standard list.
    encoder : object, optional
        Tokenizer exposing ``decode``; defaults to the notebook-level ``enc``.
    api_client : object, optional
        OpenAI-style client; defaults to the notebook-level ``client``.

    Returns
    -------
    int
        Number of context lengths for which the top-1 candidate equals the
        true token (compared both with and without surrounding whitespace).

    Notes
    -----
    Both files are reset to ``{}`` at the start and rewritten after every
    context length, so partial results survive an interrupted run. Context
    lengths with no following token (``c >= len(example)``) are skipped.
    """
    lengths = _DEFAULT_CONTEXT_LENGTHS if context_lengths is None else context_lengths
    encoder = enc if encoder is None else encoder
    api_client = client if api_client is None else api_client

    context_records = {}
    topp_records = {}
    _write_json(filepath1_context, context_records)
    _write_json(filepath2_topp, topp_records)

    total_acc = 0
    for c in lengths:
        if c >= len(example):
            # The article is too short to provide a target token after c tokens.
            continue

        context = encoder.decode(example[0:c])
        response = api_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = [
                {"role": "user", "content": context}
            ],
            temperature=0,
            max_tokens=1,
            logprobs=True,
            top_logprobs=5
        )
        correct_string = encoder.decode([example[c]])
        stripped_correct_string = correct_string.strip()

        predictions = response.choices[0].logprobs.content[0].top_logprobs
        top_p_predictions, top_words = _score_top_predictions(predictions, correct_string)

        gpt_output = top_words[0]
        accuracy_string = 0
        if stripped_correct_string == gpt_output or correct_string == gpt_output:
            accuracy_string = 1
            total_acc += 1

        context_records[c] = {
            "context": context,
            "correct_string_not_stripped": correct_string,
            "correct_token_not_stripped": example[c],
            "correct_string_stripped": stripped_correct_string,
            "gpt_predicted_string": gpt_output,
            "accuracy_string": accuracy_string,
        }
        topp_records[c] = top_p_predictions

        # Checkpoint after every request; the records are kept in memory so
        # the files never need to be read back.
        _write_json(filepath1_context, context_records)
        _write_json(filepath2_topp, topp_records)

    return total_acc

### Measurements

In [None]:
tokens = tokenize(enc, text_clean)

# measure_accuracy opens its output files with mode 'w', which fails when the
# target directory is missing; create both directories up front.
os.makedirs('output_context', exist_ok=True)
os.makedirs('output_accuracies', exist_ok=True)

# Evaluate the first 100 articles (or all of them if fewer are available).
for i in range(0, min(100, len(tokens))):
    ex = tokens[i]
    accuracy = measure_accuracy(ex, 'output_context/gpt35context_ex' + str(i) + '.json', 'output_accuracies/gpt35topp_ex' + str(i) + '.json')