# Group Project 1: Detection

## Creation of Dataset

In [5]:
import pandas as pd
header_list = ['word', 'punStatus', 'pronunciation']
ingestionDataFrameHete = pd.read_csv("./data/concatTrainHete.txt", sep=" ", names=header_list)
ingestionDataFrameHom = pd.read_csv("./data/concatTrainHom.txt", sep=" ", names=header_list)
ingestionDataFrameHete

Unnamed: 0,word,punStatus,pronunciation
0,',O,PUNCTUATION_'
1,',O,PUNCTUATION_'
2,I,O,UNKNOWN
3,',O,PUNCTUATION_'
4,m,O,"EH1,M"
...,...,...,...
19723,circular,O,"S,ER1,K,Y,AH0,L,ER0"
19724,sores,P,"S,AO1,R,Z"
19725,!,O,PUNCTUATION_!
19726,<sentEnd>,<sentEnd>,<sentEnd>


Ideally we would have a json with the following format

`{"prompt": "<pun text>", "completion": "<pun word>"}`

Dataframe will look something like this

| prompt | punWord |



In [43]:
gpt3dataFrameHete = pd.DataFrame(columns=["prompt", "punWord"])

prompt = ""
punWord = ""

for index, row in ingestionDataFrameHete.iterrows():
    word = row["word"]
    punStatus = row["punStatus"]

    if word == "<sentEnd>":
        prompt = prompt.replace("\'\'", "\"")
        prompt = prompt.replace("\' ", "\'")
        prompt = prompt.replace("\" ", "\"")
        prompt = prompt.replace("- ", "-")
        prompt = prompt.replace(',"', '," ')
        prompt = prompt.replace(',\'', ',\' ')
        appendDict = {"prompt":prompt, "punWord":punWord}
        tempAddDf = pd.DataFrame([appendDict])
        gpt3dataFrameHete = pd.concat([gpt3dataFrameHete, tempAddDf], ignore_index=True)

        prompt = ""
        punWord = ""
    else:
        if "PUNCTUATION" in row["pronunciation"]:
            prompt = prompt + word
        else:
            prompt = prompt + " " + word
        if punStatus == "P":
            punWord = word

In [44]:
gpt3dataFrameHom = pd.DataFrame(columns=["prompt", "punWord"])

prompt = ""
punWord = ""

for index, row in ingestionDataFrameHom.iterrows():
    word = row["word"]
    punStatus = row["punStatus"]

    if word == "<sentEnd>":
        prompt = prompt.replace("\'\'", "\"")
        prompt = prompt.replace("\' ", "\'")
        prompt = prompt.replace("\" ", "\"")
        prompt = prompt.replace("- ", "-")
        prompt = prompt.replace(',"', '," ')
        prompt = prompt.replace(',\'', ',\' ')
        appendDict = {"prompt":prompt, "punWord":punWord}
        tempAddDf = pd.DataFrame([appendDict])
        gpt3dataFrameHom = pd.concat([gpt3dataFrameHom, tempAddDf], ignore_index=True)

        prompt = ""
        punWord = ""
    else:
        if "PUNCTUATION" in row["pronunciation"]:
            prompt = prompt + word
        else:
            prompt = prompt + " " + word
        if punStatus == "P":
            punWord = word

### Merge two datasets together

In [45]:
gpt3dataFrame = pd.concat([gpt3dataFrameHete, gpt3dataFrameHom], ignore_index=True)
gpt3dataFrame = gpt3dataFrame.drop_duplicates()
gpt3dataFrame = gpt3dataFrame.rename(columns={'punWord':'completion'})

## Untrained Model Evaluation

In [42]:
# Let's start evaluating with openai -- untrained

import openai

openai.api_key = ''
# General, combined
errDataFrame = pd.DataFrame(columns=['errorIndex', 'errorWord', 'realWord', 'sentence'])

from sklearn.utils import shuffle
gpt3dataFrame = shuffle(gpt3dataFrame)

import re

for index, row in gpt3dataFrame[0:100].iterrows():
    prompt = row['prompt']
    response = openai.Completion.create(
      model="text-davinci-002",
      prompt="Find the Pun word in the prompt:\n\n"+ prompt + "\n\nPun word:",
      temperature=0,
      max_tokens=256,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0
    )
    openAiResponse = re.sub(r'[^a-zA-Z0-9]', '', response["choices"][0]["text"].strip().lower())

    if openAiResponse != re.sub(r'[^a-zA-Z0-9]', '', row['completion'].lower()):
        appendDict = {"errorIndex":index, "errorWord":openAiResponse, 'realWord': re.sub(r'[^a-zA-Z0-9]', '', row['completion'].lower()), 'sentence':row['prompt']}
        errDataFrame = pd.concat([errDataFrame, pd.DataFrame([appendDict])], ignore_index=True)

errDataFrame

Unnamed: 0,errorIndex,errorWord,realWord,sentence
0,267,mount,hilariously,"""I was the first to climb Mount Everest,"" said..."
1,1246,mother,abbie,My name is Abbie. I'm a Mother Superior.
2,672,investments,appreciatively,"My investments are worth more every day, said..."
3,95,funeral,mourning,He avoided funerals because he was not a mour...
4,821,injection,vain,"""I need an injection,"" Tom pleaded in vain."
5,839,operatic,placidly,"""Who's your favorite operatic tenor?""Tom asked..."
6,354,joy,junk,Those who find bargain antiques like to junk ...
7,944,memory,wrote,"""This is all from memory,"" Tom wrote."
8,975,death,wife,Shotgun wedding: A case of wife or death.
9,475,against,foe,My friend is very paranoid. He says people ar...


Hyperparameter notes:

*Seems that less capable model = more error
higher temp = more error*

### Creating Training and Evaluation Split

In [21]:
gpt3dataFrame = shuffle(gpt3dataFrame)

print(len(gpt3dataFrame))
gpt3dataFrameTraining = gpt3dataFrame[0:1000]
gpt3dataFrameTesting = gpt3dataFrame[1000:1270]

out = gpt3dataFrameTraining.to_json('./temp.json', orient='records', lines=True)

1269


## Trained Model Evaluation

In [40]:
errDataFrame = pd.DataFrame(columns=['errorIndex', 'errorWord', 'realWord', 'sentence'])

for index, row in gpt3dataFrameTesting[0:100].iterrows():
    prompt = row['prompt']
    response = openai.Completion.create(
      model="davinci:ft-personal-2022-10-02-22-42-09",
      prompt=prompt + "->",
    )
    groundTruth = re.sub(r'[^a-zA-Z0-9]', '', row['completion'].lower())

    openAiResponse = re.sub(r'[^a-zA-Z0-9]', '', response["choices"][0]["text"].strip().lower()[0:len(groundTruth)])

    if openAiResponse != groundTruth:
        appendDict = {"errorIndex":index, "errorWord":openAiResponse, 'realWord': re.sub(r'[^a-zA-Z0-9]', '', row['completion'].lower()), 'sentence':row['prompt']}
        errDataFrame = pd.concat([errDataFrame, pd.DataFrame([appendDict])], ignore_index=True)

errDataFrame

Unnamed: 0,errorIndex,errorWord,realWord,sentence
0,624,caber,whine,The steward refused to swallow the harsh whin...
1,608,held,forth,"""I won't finish in fifth place,"" Tom held forth."
2,244,donuts,greece,Did you know that donuts were first made in G...
3,685,io,serial,Computers at breakfast food companies use ser...
4,168,abs,tone,The man leaned on the printer cartridge becau...
