In [25]:
import re

import numpy as np
import pandas as pd
import tiktoken
from openai import OpenAI

In [26]:
# creating gpt client
# The driver loop further down calls client.chat.completions.create(...), so the client
# has to exist after a fresh "Restart & Run All".
# NOTE(review): OpenAI() is expected to pick up the API key from the OPENAI_API_KEY
# environment variable -- confirm it is set before running this cell.
client = OpenAI()

# Short alias -> model identifier sent to the API.
models = {'gpt-3.5-turbo': 'gpt-3.5-turbo',
          'gpt-4-turbo-1': 'gpt-4-turbo-preview',
          'gpt-4-turbo-2': 'gpt-4-1106-preview'}

# Model used both for token counting and for the completion requests.
model_name = models['gpt-4-turbo-2']

In [27]:
# Directory prefix (trailing slash included) that the dataset file names are appended to.
path = '../Data/new_humor_datasets/reddit_dadjokes/'

# Number of worked examples in the prompt; joke numbering in each request starts right after them.
NUM_OF_EXAMPLES = 5

# Upper bound on the summed joke token counts placed in a single request.
MAX_INPUT_TOKENS = 3_800

In [28]:
# Few-shot examples (delimited by ###) followed by the instruction sent ahead of every batch.
# NOTE: the adjacent literals below are concatenated exactly as written, with no newline
# between them, so in the final prompt each "output:" directly follows its "input:" text
# and example N+1 directly follows example N.
examples_instructions_prompt = (
    "###"
    # example 1
    "1. input: 'A grizzly kept talking to me and annoyed me He was unbearable'"
    "output: 'A grizzly kept talking to me and annoyed me He was intolerable'"
    # example 2
    "2. input: 'For Christmas, I requested my family not to give me duplicates of the same item. Now I anticipate "
    "receiving the missing sock next time.' "
    "output: 'For Christmas, I requested my family not to give me duplicates of the same item. Now I anticipate "
    "receiving the other book next time.' "
    # example 3
    "3. input: 'My son’s fourth birthday was today, but when he came to see me I didn’t recognize him at first. I’d "
    "never seen him be 4.' "
    "output: My son’s fourth birthday was today, but when he came to see me I didn’t recognize him at first. He grew "
    "up so fast. "
    # example 4
    "4. input: 'I asked my friend if he liked Nickleback. He told me that he never gave me any money'"
    "output: 'I asked my friend if he liked Nickleback. He told me that he prefers Kings of Leon.'"
    # example 5
    "5. input: 'I went to a bookstore and asked where the self-help section was The clerk said that if she told me, "
    "it would defeat the purpose.' "
    "output: 'I went to a bookstore and asked where the self-help section was The clerk said it was in the third "
    "aisle .' "
    "###"
    # instruction; the numbered jokes of the batch are appended after the trailing newline
    "Using the examples in ### markers, please change some of the words in the following sentences to make"
    " them non humorous. You can change anything but please change the least you can:\n"
)


In [29]:
# function for calculating num of tokens of sentence
def num_tokens_from_string(string: str, model_name: str) -> int:
    """Count how many tokens `string` becomes under the encoding tiktoken selects for `model_name`."""
    tokens = tiktoken.encoding_for_model(model_name).encode(string)
    return len(tokens)

In [30]:
# Read the Reddit Dad Jokes CSV (the filtered version that carries an id column).
df_dadjokes = pd.read_csv(f'{path}reddit_dadjokes_with_id.csv')

In [31]:
# Keep only the rows whose score equals 1.
df_dadjokes = df_dadjokes.loc[df_dadjokes['score'] == 1]

In [33]:
# Per-joke token count under the selected model; stored alongside each joke as 'num_tokens'.
lambda_num_tokens = lambda s: num_tokens_from_string(s, model_name)
df_dadjokes = df_dadjokes.assign(num_tokens=df_dadjokes['joke'].apply(lambda_num_tokens))

In [69]:
# Drop very long jokes: only rows with fewer than 500 tokens are kept.
df_dadjokes = df_dadjokes.loc[df_dadjokes['num_tokens'] < 500]

In [92]:
def get_next_input(df, index, max_samples=150, step=10, max_tokens=None):
    """Pick the next batch of jokes whose summed token count fits the budget.

    Starting at positional row `index`, a window of `max_samples` rows is taken
    and then shrunk until the sum of its 'num_tokens' column is at most
    `max_tokens`. The window shrinks by `step` rows at a time while it is larger
    than `step`, and by one row at a time below that, and it never shrinks to
    zero rows. (Shrinking straight from `step` to 0 would return an empty batch
    with an unchanged index, which makes the calling loop repeat forever.)

    Args:
        df: DataFrame with a numeric 'num_tokens' column.
        index: positional index of the first row of the batch.
        max_samples: largest number of rows a batch may contain.
        step: shrink granularity while the window is larger than `step`.
        max_tokens: token budget; defaults to the module-level MAX_INPUT_TOKENS.

    Returns:
        (df_for_input, next_index): the selected rows and the position at which
        the following batch starts. `next_index` is `index` plus the final
        window size, so on the last (partial) batch it can point past the end
        of `df`; it always moves forward by at least one.
    """
    if max_tokens is None:
        max_tokens = MAX_INPUT_TOKENS

    samples_count = max_samples
    df_for_input = df.iloc[index:index + samples_count]

    while samples_count > 1 and df_for_input['num_tokens'].sum() > max_tokens:
        # Coarse steps first, single rows once the window is at most `step` wide.
        samples_count -= step if samples_count > step else 1
        df_for_input = df_for_input.iloc[:samples_count]

    return df_for_input, index + samples_count

In [71]:
# print(f'average num of tokens per sample = {np.mean(df_dadjokes.num_tokens)}')
# print(f'overall samples tokens per 10K samples = {10000 * np.mean(df_dadjokes.num_tokens)}')

In [84]:
# Resume from a previously saved output file when there is one; otherwise start empty.
from os.path import exists
not_jokes_filename = 'reddit_dadjokes_not_jokes.csv'
if not exists(path + not_jokes_filename):
    # Empty frame with the source columns plus an 'edited_joke' column for the rewrites.
    df_not_jokes = pd.DataFrame(columns=df_dadjokes.columns)
    df_not_jokes['edited_joke'] = None
else:
    df_not_jokes = pd.read_csv(path + not_jokes_filename)

In [73]:
# Start position for batching: the number of rows already present in df_not_jokes.
curr_index = df_not_jokes.shape[0]

In [77]:
# create series from completion output
def parse_response(response, num_jokes=None, first_number=None):
    """Split a numbered completion into one edited joke per input joke.

    The completion is expected to contain entries of the form "<number>. <text>"
    for the consecutive numbers first_number, first_number + 1, ...; any text
    before the first entry (such as an introductory sentence) is ignored.

    Each number marker is looked up at the start of a line first, so a number
    followed by a period inside a joke is not mistaken for the next entry.
    If no line-start marker exists, the first "<number>. " found after the
    previous marker is used. The text of an entry is taken from the end of its
    own marker, whatever the number of digits in the number.

    Args:
        response: text content of the completion.
        num_jokes: how many entries to extract; defaults to len(df_for_input),
            the batch held in the module-level variable of that name.
        first_number: number of the first entry; defaults to NUM_OF_EXAMPLES + 1.

    Returns:
        pd.Series named 'edited_joke' with the stripped entry texts, in order.

    Raises:
        ValueError: if an expected number marker is missing from `response`.
    """
    if first_number is None:
        first_number = NUM_OF_EXAMPLES + 1
    if num_jokes is None:
        num_jokes = len(df_for_input)

    # (marker_start, marker_end) for every expected entry, located in order.
    marker_spans = []
    search_from = 0
    for number in range(first_number, first_number + num_jokes):
        # (?!\d) keeps "6." from matching the beginning of something like "6.5".
        line_marker = re.compile(rf'^[ \t]*{number}\.(?!\d)', re.MULTILINE)
        found = line_marker.search(response, search_from)
        if found is not None:
            marker_start, marker_end = found.span()
        else:
            # Fallback for completions that do not put each entry on its own line.
            inline_marker = f'{number}. '
            marker_start = response.find(inline_marker, search_from)
            if marker_start == -1:
                raise ValueError(f'joke number {number} not found in response')
            marker_end = marker_start + len(inline_marker)
        marker_spans.append((marker_start, marker_end))
        search_from = marker_end

    # An entry's text runs from the end of its marker to the start of the next marker.
    edited_jokes = []
    for position, (_, text_start) in enumerate(marker_spans):
        if position + 1 < len(marker_spans):
            text_end = marker_spans[position + 1][0]
        else:
            text_end = len(response)
        edited_jokes.append(response[text_start:text_end].strip())

    for i, joke in enumerate(edited_jokes):
        print(i, joke)
    edited_jokes_series = pd.Series(edited_jokes, name='edited_joke')

    return edited_jokes_series

0 What did the first ant say to the second ant after he farted? It's not me, it's a coincidence.
1 What time did Sean Connery normally arrive at Wimbledon? Around ten.
2 Tell your young children that you bought them wet wipes. This way it will help to clean them whenever they use the toilet.
3 What does Forrest Gump suggest we should name the next generation? A new name.
4 What sort of music do people listen to while wrapping presents? Instrumental.
5 Someone threw a bunch of herbs in my face. Now I’m lightly dusted with spices.
6 How do you change the number one into a word? Just add a g to make it gone.
7 What do you say when someone is relaxing in bed? They are resting.
8 Christmas present: I asked my family not to give me two of the same kind. So now I look forward to getting a different item next time.
9 Why did the old man fall into the well? Because he wasn't looking where he was going.
10 What did Yoda say when he saw a clear picture? This is high definition.
11 Where do people

In [91]:
# add series as column to df_for_input
def save_samples(df_for_input, edited_jokes_series, df_not_jokes):
    """Attach the edited jokes to their batch, append the batch to the results and save them.

    The batch index is reset so its rows pair up positionally with
    `edited_jokes_series`; the combined columns are then relabelled with the
    columns of `df_not_jokes`. The grown results frame is written to
    `path + not_jokes_filename` (both module-level names) and returned.
    """
    batch = df_for_input.reset_index(drop=True)
    batch_with_edits = pd.concat([batch, edited_jokes_series], axis=1, ignore_index=True)
    batch_with_edits.columns = df_not_jokes.columns

    # Append to the accumulated results and persist the whole frame.
    accumulated = pd.concat([df_not_jokes, batch_with_edits], axis=0, ignore_index=True)
    accumulated.to_csv(path + not_jokes_filename, index=False)

    return accumulated

In [74]:
# Stop once this many source jokes have been processed.
TARGET_SAMPLES = 11000

while curr_index < TARGET_SAMPLES:  # until we get to the final count
    df_for_input, curr_index = get_next_input(df_dadjokes, curr_index)
    if df_for_input.empty:
        # Nothing left to send (no rows at this position): stop instead of
        # issuing a request that contains no jokes.
        break

    # Number the jokes right after the prompt's examples so the reply can be split by number.
    input_str = '\n'.join(
        f'{i}. {joke.strip()}'
        for i, joke in enumerate(df_for_input['joke'], NUM_OF_EXAMPLES + 1)
    )
    # print(input_str)

    # send to completion
    completion = client.chat.completions.create(
        model=model_name,
        messages=[
            {'role': 'user',
             'content': examples_instructions_prompt + input_str}
        ]
    )

    # Parse the reply and persist after every batch, so an interrupted run can resume.
    edited_jokes_series = parse_response(completion.choices[0].message.content)
    df_not_jokes = save_samples(df_for_input, edited_jokes_series, df_not_jokes)

6. What did the first ant say to the second ant after he farted? It's not me, it's *deodorant*
7. What time did Sean Connery normally make it to Wimbledon? Tennish
8. Tell your young children that you bought them edible toilet paper. This way it will wipe their butt for them whenever they poop. I told this to my kids years ago when we (parents) were having to wipe their butts for them after they pooped. They thought it was very funny, they were about 6 &amp; 7 years old at the time.
9. What does Forrest Gump think we should name the next generation? Gen A
10. What sort of music does cellophane play? Wrap
11. Someone threw a bunch of herbs in my face Now I’m parsley sighted.
12. How do you make the number one disappear? Just add a g and it's gone.
13. What do you say when a Mexican is in bed? Taco-Stado
14. Christmas present I Asked my family not to give me two of the same kind for christmas. So now I look forward to get the other sock next time.
15. Why did the old man fall in the well

In [76]:
# print(completion.choices[0].message.content)

Based on the instruction to make the sentences non-humorous by changing as little as possible:

6. What did the first ant say to the second ant after he farted? It's not me, it's a coincidence.
7. What time did Sean Connery normally arrive at Wimbledon? Around ten.
8. Tell your young children that you bought them wet wipes. This way it will help to clean them whenever they use the toilet.
9. What does Forrest Gump suggest we should name the next generation? A new name.
10. What sort of music do people listen to while wrapping presents? Instrumental.
11. Someone threw a bunch of herbs in my face. Now I’m lightly dusted with spices.
12. How do you change the number one into a word? Just add a g to make it gone.
13. What do you say when someone is relaxing in bed? They are resting.
14. Christmas present: I asked my family not to give me two of the same kind. So now I look forward to getting a different item next time.
15. Why did the old man fall into the well? Because he wasn't looking w

In [37]:
# print(examples_instructions_prompt + input_str)
# print(num_tokens_from_string(examples_instructions_prompt + input_str, model_name))

886


In [41]:
# Token usage reported by the API for the most recent completion.
print(
    f'input tokens = {completion.usage.prompt_tokens}',
    f'output tokens = {completion.usage.completion_tokens}',
    sep='\n',
)

input tokens = 893
output tokens = 460
