In [9]:
from openai import OpenAI
import tiktoken
import pandas as pd
import numpy as np

In [36]:
# creating gpt client
models = {'gpt-3.5-turbo': 'gpt-3.5-turbo',
          'gpt-4-turbo-1': 'gpt-4-turbo-preview',
          'gpt-4-turbo-2': 'gpt-4-1106-preview'}

model_name = models['gpt-4-turbo-2']

In [3]:
# function for calculating num of tokens of sentence
def num_tokens_from_string(string: str, model_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [17]:
# load Reddit Dad Jokes dataset
df_dadjokes = pd.read_csv('../Data/new_humor_datasets/reddit_dadjokes/reddit_dadjokes.csv')
# get only 1 score samples
df_dadjokes = df_dadjokes[df_dadjokes.score == 1]

In [18]:
# clean from duplicates and reposts
df_dadjokes = df_dadjokes[df_dadjokes['joke'].apply(lambda joke: 'reposted' not in joke.lower())]
df_dadjokes.drop_duplicates(inplace=True)

In [19]:
lambda_num_tokens = lambda s: num_tokens_from_string(s, model_name)
df_dadjokes['num_tokens'] = df_dadjokes['joke'].apply(lambda_num_tokens)

In [20]:
print(f'average num of tokens per sample = {np.mean(df_dadjokes.num_tokens)}')
print(f'overall samples tokens per 10K samples = {10000 * np.mean(df_dadjokes.num_tokens)}')

average num of tokens per sample = 25.840883317895756
overall samples tokens per 10K samples = 258408.83317895755


In [21]:
df_for_input = df_dadjokes['joke'].iloc[100:120]

In [30]:
num_of_examples = 5
input_str = '\n'.join([f'{i}. {joke}' for i, joke in enumerate(df_for_input, num_of_examples+1)])
print(input_str)

In [32]:
# create examples and instruction to gpt request
examples_instructions_prompt = \
    "###" \
    "1. input: 'A grizzly kept talking to me and annoyed me He was unbearable'" \
    "output: 'A grizzly kept talking to me and annoyed me He was intolerable'" \
    "2. input: 'For Christmas, I requested my family not to give me duplicates of the same item. Now I anticipate " \
    "receiving the missing sock next time.' " \
    "output: 'For Christmas, I requested my family not to give me duplicates of the same item. Now I anticipate " \
    "receiving the other book next time.' " \
    "3. input: 'My son’s fourth birthday was today, but when he came to see me I didn’t recognize him at first. I’d " \
    "never seen him be 4.' " \
    "output: My son’s fourth birthday was today, but when he came to see me I didn’t recognize him at first. He grew " \
    "up so fast. " \
    "4. input: 'I asked my friend if he liked Nickleback. He told me that he never gave me any money'" \
    "output: 'I asked my friend if he liked Nickleback. He told me that he prefers Kings of Leon.'" \
    "5. input: 'I went to a bookstore and asked where the self-help section was The clerk said that if she told me, " \
    "it would defeat the purpose.' " \
    "output: 'I went to a bookstore and asked where the self-help section was The clerk said it was in the third " \
    "aisle .' " \
    "###" \
    "Using the examples in ### markers, please change some of the words in the following sentences to make" \
    " them non humorous. You can change anything but please change the least you can:\n"


In [37]:
# print(examples_instructions_prompt + input_str)
print(num_tokens_from_string(examples_instructions_prompt + input_str, model_name))

886


In [38]:
# send to completion
completion = client.chat.completions.create(
    model=model_name,
    messages=[
        {'role': 'user',
         'content': examples_instructions_prompt + input_str}
    ]
)

In [39]:
print(completion.choices[0].message.content)

6. Did I tell you about my friend with a boat making business in his attic? Business is going well for him.
7. This year my resolution is to learn new skills that combine my interests in poetry and animals.
8. I value my family time and enjoy watching movies together.
9. Did you hear of the Librarian who became unwell while reading a book? She had to take a sick leave.
10. For some reason, I always think there are only 25 letters in the alphabet... ...I must be forgetting one.
11. What's the difference between a goldfish and a clownfish? They belong to different species and habitats.
12. Have you heard about the health concerns going on in France? It's a serious issue.
13. Did you hear Bill Gates was in a dance competition? It turns out dancing isn't one of his talents.
14. What can you expect when you ask a teenager a question? Sometimes you might get a cheeky response.
15. I used to have a job handling bulk agricultural goods before the company changed its packaging process.
16. Why 

In [41]:
print(f'input tokens = {completion.usage.prompt_tokens}')
print(f'output tokens = {completion.usage.completion_tokens}')

input tokens = 893
output tokens = 460
