In [1]:
import tiktoken
import json
import openai
import setup
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import re

In [2]:
# API credentials are read from the project-local `setup` module (imported above)
# rather than being written inline in this notebook.
openai.api_key = setup.openai_api_key
openai.organization = setup.openai_organization_id

### Data Preparation

In [3]:
def extract_text(folder_name):
  """Collect the "text" field of every article stored under ``folder_name``.

  Every entry returned by ``os.listdir(folder_name)`` is treated as a
  sub-folder holding a ``text.json`` file. Exactly one string is produced per
  entry, in ``os.listdir`` order, because ``preprocess_text`` later locates
  articles by their position in that listing.

  Args:
    folder_name: path of the directory that contains one sub-folder per article.

  Returns:
    list of str: the ``"text"`` value of each article.

  Raises:
    OSError: if a ``text.json`` file cannot be opened.
    KeyError: if a decoded document has no ``"text"`` key.
  """
  text = []
  for folder in os.listdir(folder_name):
    path = os.path.join(folder_name, folder, 'text.json')
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(path, encoding='utf-8') as file:
      payload = json.load(file)
    # The files are double-encoded (a JSON string that itself contains JSON),
    # so a second decode is needed; a plain JSON object is accepted as-is.
    if isinstance(payload, str):
      payload = json.loads(payload)
    text.append(payload["text"])
  return text

def preprocess_text(text, folder_name="../../input_data/wikipedia_data_v3"):
  """Strip thumbnail markup from the articles and drop three excluded articles.

  Args:
    text: list of article strings, aligned position-by-position with
      ``os.listdir(folder_name)`` (this is what ``extract_text`` returns).
    folder_name: directory whose listing maps article names to positions in
      ``text``. Defaults to the dataset path used throughout this notebook.

  Returns:
    list of str: cleaned articles with "AT&T_Plaza", "Elgato" and
    "Engineer_boot" removed; the relative order of the rest is unchanged.

  Raises:
    ValueError: if one of the named articles is missing from ``folder_name``.
  """
  # Applied to every article in exactly this order. Raw strings keep the regex
  # identical while avoiding the invalid "\|" string escape.
  thumb_patterns = (
    r'\n.+\|thumb.*\n?(.*)?',
    r'\n?(.*)?\|thumb.*\n',
    r'\n.+thumb\|.*\n?(.*)?',
    r'\n?(.*)?thumb\|.*\n',
    r'\n.+\|thumbnail.*\n?(.*)?',
    r'\n?(.*)?\|thumbnail.*\n',
    r'\n.+thumbnail\|.*\n?(.*)?',
    r'\n?(.*)?thumbnail\|.*\n',
  )

  text_prepros = []
  for ex in text:
    for pattern in thumb_patterns:
      ex = re.sub(pattern, '', ex)
    # Leading spaces are removed first, then leading newlines (order matters).
    text_prepros.append(ex.lstrip(' ').lstrip('\n'))

  folder_names = os.listdir(folder_name)

  # One caption in the Agriculture article is not caught by the patterns above.
  ind = folder_names.index("Agriculture")
  text_prepros[ind] = text_prepros[ind].replace('\nthumb |upright=1.35 |Centres of origin, as numbered by Nikolai Vavilov in the 1930s. Area 3 (gray) is no longer recognised as a centre of origin, and Papua New Guinea (area P, orange) was identified more recently.\n', '')

  # Resolve every position before removing anything: deleting one at a time
  # shifts the later elements, so a stale index would remove a neighbouring
  # article instead of the intended one.
  drop = {folder_names.index(name) for name in ("AT&T_Plaza", "Elgato", "Engineer_boot")}
  return [article for pos, article in enumerate(text_prepros) if pos not in drop]

In [4]:
# Load the raw article texts from disk, then clean them.
# `text` holds one string per entry of the dataset directory; `text_clean` is the
# preprocessed list returned by preprocess_text.
text = extract_text("../../input_data/wikipedia_data_v3")
text_clean = preprocess_text(text)

### Measurement

In [None]:
# Tokenizer for the same model that measure_accuracy queries ("gpt-3.5-turbo").
# It is used as a module-level global by the cells below.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
def tokenize(enc, text_clean):
    """Encode every article with ``enc``.

    Args:
        enc: object exposing ``encode(str)``; the notebook passes a tiktoken encoding.
        text_clean: iterable of article strings.

    Returns:
        list: one ``enc.encode`` result per article, in input order.
    """
    return [enc.encode(article) for article in text_clean]

In [None]:
# One encoded token list per cleaned article, in the same order as text_clean.
tokens = tokenize(enc, text_clean)

In [16]:
def measure_accuracy(example, filename, context_lengths=None):
    """Count how often gpt-3.5-turbo continues a prefix of ``example`` with the true next token.

    For each context length ``c`` the first ``c`` tokens are decoded with the
    module-level ``enc`` and sent as a single user message (``temperature=0``,
    ``max_tokens=1``). The reply, exactly as returned, is compared with the
    whitespace-stripped decoding of ``example[c]``.

    Args:
        example: list of token ids for one article.
        filename: path of the JSON results file. It is created when missing.
            Existing entries are kept, and the entry for each evaluated length
            is overwritten. The file is rewritten after every request so an
            interrupted run keeps its partial results.
        context_lengths: optional iterable of prefix lengths to evaluate.
            Defaults to the notebook's original schedule (1..10, then coarser
            steps up to 1000).

    Returns:
        int: number of evaluated context lengths where the reply matched.
    """
    if context_lengths is None:
        context_lengths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 50, 75, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]

    # Load earlier results once; start from an empty mapping when the file is
    # missing or empty instead of requiring it to be created by hand.
    if os.path.exists(filename) and os.path.getsize(filename) > 0:
        with open(filename, "r") as file:
            results = json.load(file)
    else:
        results = {}

    total_acc = 0
    for c in context_lengths:
        # A prefix of length c needs a token at position c to compare against;
        # shorter articles simply skip the lengths they cannot support.
        if c >= len(example):
            continue

        context = enc.decode(example[0:c])
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": context}
            ],
            temperature=0,
            max_tokens=1
        )
        gpt_output = response['choices'][0]['message']['content']

        correct_string = enc.decode([example[c]])
        stripped_correct_string = correct_string.strip()
        accuracy_string = 1 if stripped_correct_string == gpt_output else 0
        total_acc += accuracy_string

        example_dict = {
            "context": context,
            "correct_string_not_stripped": correct_string,
            "correct_token_not_stripped": example[c],
            "correct_string_stripped": stripped_correct_string,
            "gpt_predicted_string": gpt_output,
            "accuracy_string": accuracy_string,
        }
        print(example_dict)

        # JSON object keys are always strings. Using str(c) keeps re-runs from
        # mixing int and str keys, which would be written as duplicate keys.
        results[str(c)] = example_dict

        # Checkpoint after every request.
        with open(filename, "w") as file:
            json.dump(results, file)
    return total_acc

In [17]:
# Select the tokenized article at position 15 and show it both as token ids
# and as the decoded text.
example = tokens[15]
print(example)
print(enc.decode(example))

[2149, 98106, 374, 264, 220, 2550, 17, 3778, 11625, 18273, 18884, 4632, 9124, 555, 36367, 16795, 20595, 21415, 323, 6004, 555, 36367, 16795, 29485, 13, 578, 4632, 374, 279, 220, 2148, 267, 16795, 11625, 4668, 4632, 11, 323, 574, 279, 11999, 9124, 2391, 279, 16795, 4632, 11639, 3967, 439, 279, 16795, 55383, 13, 1102, 574, 9124, 323, 15910, 555, 14662, 53227, 82, 323, 3842, 5444, 7197, 11, 323, 374, 3196, 389, 279, 35217, 9630, 5964, 1604, 315, 279, 1890, 836, 505, 279, 3861, 75453, 323, 3861, 64643, 13, 578, 7899, 6445, 4519, 10016, 1226, 5248, 11, 17582, 13926, 11, 39162, 445, 93421, 11, 24150, 50664, 11, 9454, 26056, 7197, 11, 46092, 69109, 98625, 323, 31164, 1369, 1604, 13, 578, 4632, 11263, 1708, 98106, 11, 459, 73698, 8761, 220, 2639, 258, 11, 889, 14035, 264, 11204, 29062, 8649, 264, 83857, 13, 763, 2015, 311, 10477, 279, 29062, 505, 279, 10517, 48850, 1291, 11, 568, 41877, 5014, 5678, 439, 264, 28994, 42826, 11, 323, 16696, 311, 10098, 279, 76086, 323, 813, 10003, 382, 48412, 226

In [18]:
# Issues one API request per context length for this article and stores the
# per-length records in the given JSON file; `accuracy` is the number of matches.
accuracy = measure_accuracy(example, 'accuracies_gpt35_ex15.json')

{'context': 'Al', 'correct_string_not_stripped': 'addin', 'correct_token_not_stripped': 98106, 'correct_string_stripped': 'addin', 'gpt_predicted_string': 'I', 'accuracy_string': 0}
{'context': 'Aladdin', 'correct_string_not_stripped': ' is', 'correct_token_not_stripped': 374, 'correct_string_stripped': 'is', 'gpt_predicted_string': 'Al', 'accuracy_string': 0}
{'context': 'Aladdin is', 'correct_string_not_stripped': ' a', 'correct_token_not_stripped': 264, 'correct_string_stripped': 'a', 'gpt_predicted_string': 'a', 'accuracy_string': 1}
{'context': 'Aladdin is a', 'correct_string_not_stripped': ' ', 'correct_token_not_stripped': 220, 'correct_string_stripped': '', 'gpt_predicted_string': 'Disney', 'accuracy_string': 0}
{'context': 'Aladdin is a ', 'correct_string_not_stripped': '199', 'correct_token_not_stripped': 2550, 'correct_string_stripped': '199', 'gpt_predicted_string': 'Disney', 'accuracy_string': 0}
{'context': 'Aladdin is a 199', 'correct_string_not_stripped': '2', 'correct_

In [19]:
# Count of context lengths at which the model's reply equalled the stripped next token.
print(accuracy)

16


In [20]:
# Reload the saved records as a table: one column per context length,
# one row per recorded field.
df = pd.read_json('accuracies_gpt35_ex15.json')
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,550,600,650,700,750,800,850,900,950,1000
context,Al,Aladdin,Aladdin is,Aladdin is a,Aladdin is a,Aladdin is a 199,Aladdin is a 1992,Aladdin is a 1992 American,Aladdin is a 1992 American animated,Aladdin is a 1992 American animated musical,...,Aladdin is a 1992 American animated musical fa...,Aladdin is a 1992 American animated musical fa...,Aladdin is a 1992 American animated musical fa...,Aladdin is a 1992 American animated musical fa...,Aladdin is a 1992 American animated musical fa...,Aladdin is a 1992 American animated musical fa...,Aladdin is a 1992 American animated musical fa...,Aladdin is a 1992 American animated musical fa...,Aladdin is a 1992 American animated musical fa...,Aladdin is a 1992 American animated musical fa...
correct_string_not_stripped,addin,is,a,,199,2,American,animated,musical,fantasy,...,that,a,lies,Al,",",Al,At,the,stresses,to
correct_token_not_stripped,98106,374,264,220,2550,17,3778,11625,18273,18884,...,430,264,15812,1708,11,1708,1688,279,59623,311
correct_string_stripped,addin,is,a,,199,2,American,animated,musical,fantasy,...,that,a,lies,Al,",",Al,At,the,stresses,to
gpt_predicted_string,I,Al,a,Disney,Disney,2,animated,animated,mus,film,...,that,a,ref,Al,but,With,Al,and,restrict,to
accuracy_string,0,0,1,0,0,1,0,1,0,0,...,1,1,0,1,0,0,0,0,0,1


In [24]:
# Repeat the experiment with the tokenized article at position 75.
# NOTE(review): this rebinds the global `example` used by the cells above.
example = tokens[75]
print(example)
print(enc.decode(example))

[791, 21080, 3314, 17283, 374, 264, 220, 4278, 46199, 5277, 3799, 78, 85421, 87382, 304, 14013, 28931, 29890, 11, 1561, 4356, 4409, 13, 1102, 574, 6319, 555, 1443, 92076, 11, 32402, 612, 40759, 323, 8308, 304, 220, 7285, 16, 13, 578, 4857, 706, 264, 15485, 2673, 315, 220, 323, 13656, 264, 2860, 315, 220, 16615, 11, 2737, 1202, 41032, 13, 11699, 836, 374, 14592, 505, 330, 29831, 556, 3314, 498, 279, 30499, 315, 1561, 4356, 11, 902, 374, 315, 9987, 6371, 13, 578, 21080, 3314, 17283, 14980, 439, 279, 1917, 596, 82717, 4857, 369, 7154, 220, 1272, 1667, 3156, 279, 8246, 315, 279, 4435, 17657, 5955, 596, 4892, 22703, 304, 28636, 29890, 304, 3389, 220, 4468, 15, 13, 23548, 279, 6250, 220, 806, 8951, 304, 220, 1049, 16, 11, 433, 574, 1578, 279, 82717, 4857, 304, 1561, 4356, 4409, 3156, 433, 574, 68328, 555, 279, 502, 3861, 4435, 17657, 5955, 304, 220, 679, 17, 13, 1174, 279, 4857, 374, 279, 31487, 2442, 19790, 4857, 304, 1561, 4356, 4409, 11, 279, 26084, 2442, 19790, 8308, 85421, 87382, 304, 2

In [25]:
# Same measurement as for the previous article, written to a separate results file.
accuracy = measure_accuracy(example, 'accuracies_gpt35_ex75.json')

{'context': 'The', 'correct_string_not_stripped': ' Empire', 'correct_token_not_stripped': 21080, 'correct_string_stripped': 'Empire', 'gpt_predicted_string': 'The', 'accuracy_string': 0}
{'context': 'The Empire', 'correct_string_not_stripped': ' State', 'correct_token_not_stripped': 3314, 'correct_string_stripped': 'State', 'gpt_predicted_string': 'The', 'accuracy_string': 0}
{'context': 'The Empire State', 'correct_string_not_stripped': ' Building', 'correct_token_not_stripped': 17283, 'correct_string_stripped': 'Building', 'gpt_predicted_string': 'The', 'accuracy_string': 0}
{'context': 'The Empire State Building', 'correct_string_not_stripped': ' is', 'correct_token_not_stripped': 374, 'correct_string_stripped': 'is', 'gpt_predicted_string': 'The', 'accuracy_string': 0}
{'context': 'The Empire State Building is', 'correct_string_not_stripped': ' a', 'correct_token_not_stripped': 264, 'correct_string_stripped': 'a', 'gpt_predicted_string': 'a', 'accuracy_string': 1}
{'context': 'The

In [26]:
# Count of matching context lengths for the second article.
print(accuracy)

11


In [2]:
# Reload the saved records for the second article.
# NOTE(review): `df` is rebound here (it previously held the ex15 table), and this
# cell's execution count is out of sequence with its neighbours — confirm the
# notebook still runs top to bottom on a fresh kernel.
df = pd.read_json('accuracies_gpt35_ex75.json')
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,550,600,650,700,750,800,850,900,950,1000
context,The,The Empire,The Empire State,The Empire State Building,The Empire State Building is,The Empire State Building is a,The Empire State Building is a,The Empire State Building is a 102,The Empire State Building is a 102-story,The Empire State Building is a 102-story Art,...,The Empire State Building is a 102-story Art D...,The Empire State Building is a 102-story Art D...,The Empire State Building is a 102-story Art D...,The Empire State Building is a 102-story Art D...,The Empire State Building is a 102-story Art D...,The Empire State Building is a 102-story Art D...,The Empire State Building is a 102-story Art D...,The Empire State Building is a 102-story Art D...,The Empire State Building is a 102-story Art D...,The Empire State Building is a 102-story Art D...
correct_string_not_stripped,Empire,State,Building,is,a,,102,-story,Art,Dec,...,was,in,West,",",",",The,is,Madison,are,Avenue
correct_token_not_stripped,21080,3314,17283,374,264,220,4278,46199,5277,3799,...,574,304,4410,1174,11,578,374,31015,527,17569
correct_string_stripped,Empire,State,Building,is,a,,102,-story,Art,Dec,...,was,in,West,",",",",The,is,Madison,are,Avenue
gpt_predicted_string,The,The,The,The,a,sk,sk,-story,sk,Dec,...,is,between,West,this,",",The,are,Lex,are,A
accuracy_string,0,0,0,0,1,0,0,1,0,1,...,0,0,1,0,1,1,0,0,1,0


In [19]:
# Walk through the context lengths (the columns of `df`) and print, for each one,
# the context next to the expected stripped string and the model's output.
for col in df.columns:
    print("Context: ", df.loc["context", col])
    print("Correct answer: ", df.loc["correct_string_stripped", col], "| GPT-output: ", df.loc["gpt_predicted_string", col])
    print("---------")

Context:  The
Correct answer:  Empire | GPT-output:  The
---------
Context:  The Empire
Correct answer:  State | GPT-output:  The
---------
Context:  The Empire State
Correct answer:  Building | GPT-output:  The
---------
Context:  The Empire State Building
Correct answer:  is | GPT-output:  The
---------
Context:  The Empire State Building is
Correct answer:  a | GPT-output:  a
---------
Context:  The Empire State Building is a
Correct answer:   | GPT-output:  sk
---------
Context:  The Empire State Building is a 
Correct answer:  102 | GPT-output:  sk
---------
Context:  The Empire State Building is a 102
Correct answer:  -story | GPT-output:  -story
---------
Context:  The Empire State Building is a 102-story
Correct answer:  Art | GPT-output:  sk
---------
Context:  The Empire State Building is a 102-story Art
Correct answer:  Dec | GPT-output:  Dec
---------
Context:  The Empire State Building is a 102-story Art Deco skyscraper in Midtown Manhattan, New
Correct answer:  York | GPT