# Caption Analysis

In [40]:
import tiktoken
import pandas as pd
import openai
from time import sleep
import os
# Path of the text file whose contents are used as the OpenAI API key.
file_path = "../../openai_key.txt"

with open(file_path, "r") as file:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise become part of the key.
    key = file.read().strip()
openai.api_key = key
import numpy as np
import json
import time

In [14]:
# Tokenizer that tiktoken associates with the gpt-3.5-turbo model.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [15]:
# Token Counting Function
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens ``string`` produces under the named tiktoken encoding."""
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

In [16]:
def get_questions_anwers(promt):
    """Send ``promt`` as a single user message to gpt-3.5-turbo.

    Returns the message content of the first choice in the completion.
    """
    chat_messages = [{"role": "user", "content": promt}]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [79]:
def chatgpt(promt, idx, start_time):
    """Request QA pairs for one prompt, retrying until the call succeeds.

    Args:
        promt: Prompt text handed to ``get_questions_anwers``.
        idx: Zero-based position of the caption being processed; a progress
            report is printed after every fifth caption.
        start_time: ``time.time()`` value from which the elapsed running
            time is measured.

    Returns:
        Whatever ``get_questions_anwers`` returns for ``promt``.
    """
    while True:
        try:
            qa_pairs = get_questions_anwers(promt)
            break
        except Exception:
            # ``except Exception`` instead of a bare ``except`` so that
            # KeyboardInterrupt / SystemExit can still stop the retry loop.
            print("retrying")
            sleep(5)
    if (idx + 1) % 5 == 0:
        print("-"*50)
        print(f"Processing {(idx + 1)} captions")
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"running time: {round(elapsed_time, 0)} seconds")
    return qa_pairs
    

In [80]:
def caption_processing(caption):
    """Normalise a caption before it is embedded in the prompt.

    Boilerplate lead-ins and show-specific names are removed, and mentions
    of characters/animals are rewritten as generic people. Matching is
    plain, case-sensitive substring replacement.

    Args:
        caption: The caption text.

    Returns:
        The cleaned caption with ". people skin is yellow." appended.
    """
    # Plural forms are handled first: "simpson" is removed and " character"
    # is rewritten further down, either of which would otherwise break the
    # plural match.
    for plural in ("simpsons", "characters"):
        caption = caption.replace(plural, "people")

    remove_words =["in this image i can see ", "in this image we can see",  "this is an image of, ", "a cartoon of ", "simpson", "homer", "a cartoon of ", "cartoon ", "this is an image of "]
    for remove_word in remove_words:
        caption = caption.replace(remove_word, "")

    replaced_by_person_words = ["duck", "bear", " character"]
    for word in replaced_by_person_words:
        # str.replace returns a new string, so the result must be assigned.
        # Any leading space in the pattern is kept so that the neighbouring
        # word is not glued onto "person".
        leading = word[:len(word) - len(word.lstrip())]
        caption = caption.replace(word, leading + "person")
    return caption + ". people skin is yellow."

In [81]:
def combined_promt(caption_1, caption_2, requirement):
    """Build the full prompt: the requirement, both contexts, then "output:"."""
    sections = [
        f"{requirement}",
        f"context 1: {caption_1} ",
        f"context 2: {caption_2}",
        "output:",
    ]
    return "\n".join(sections)

def qa_generator(csv_path, demo = True):
    """Generate QA text for every caption pair listed in a CSV file.

    The CSV is read with pandas and must provide the columns ``id``,
    ``img``, ``caption_1`` and ``caption_2``. The original ``id`` column is
    renamed to ``img_id`` and a new ``id`` column is filled from the row
    index. Both captions are cleaned with ``caption_processing``, combined
    into a prompt, and sent through ``chatgpt``.

    Args:
        csv_path: Path of the CSV file to read.
        demo: When true, only the first 10 rows are processed.

    Returns:
        The DataFrame with the added ``promt`` and ``qa pairs`` columns.
    """
    start_time = time.time()

    frame = pd.read_csv(csv_path)
    if demo:
        frame = frame[:10]
    frame = frame[["id", "img", "caption_1", "caption_2"]].rename(columns={'id': 'img_id'})
    frame['id'] = frame.index
    for column in ("caption_1", "caption_2"):
        frame[column] = frame[column].map(caption_processing)

    # Task description followed by a short worked example for the model.
    requirement = (
        "You are required to ask about 10 questions about 2 given contexts. "
        "The question topics are: yes/no, counting, object, activity, "
        "color, posistion. Answer within 3 words.\n"
        "context 1: a man is jumping to catch a ball.\n"
        "context 2: a white ball is being catch by a man.\n"
        "output:1. What is he doing?\nJumping.\n2. What sport is he playing?\nFootball.\n3. What color is the ball?\nwhite."
    )

    def build_promt(row):
        return combined_promt(row["caption_1"], row["caption_2"], requirement)

    def ask_model(row):
        return chatgpt(row["promt"], row["id"], start_time)

    frame['promt'] = frame[["caption_1", "caption_2"]].apply(build_promt, axis=1)
    frame['qa pairs'] = frame[['promt', "id"]].apply(ask_model, axis=1)

    return frame

# Run the pipeline on the caption CSV; with the default demo=True only the
# first 10 rows are processed.
result = qa_generator("clean_cartoon.csv")

Processing 5 captions
Processing 10 captions


In [82]:
# Keep only the image id, image name and generated QA text columns.
result2 = result[["img_id", "img", "qa pairs"]]

In [83]:
def qa_processing(string):
    """Split numbered question/answer text into two parallel lists.

    The expected layout alternates a question line (optionally prefixed
    with "<number>. ") and an answer line. Blank lines are ignored and a
    trailing question without an answer is dropped, so both lists always
    have the same length.

    Args:
        string: Text with questions and answers on alternating lines.

    Returns:
        A ``(questions, answers)`` tuple of equal-length lists of strings.
    """
    # Drop blank lines first so spacing between pairs cannot shift the
    # question/answer alternation.
    lines = [line.strip() for line in string.split('\n') if line.strip()]

    questions = []
    answers = []
    # Questions sit at even positions and answers at odd positions; zip
    # stops at the shorter slice, discarding an unpaired final question.
    for question_line, answer_line in zip(lines[::2], lines[1::2]):
        number, separator, text = question_line.partition('. ')
        # Remove only a leading "<number>. " prefix; a later ". " inside the
        # question (e.g. "Mr. Burns") is kept intact.
        if separator and number.isdigit():
            question_line = text
        questions.append(question_line)
        answers.append(answer_line)
    return questions, answers

In [84]:
# Flatten the per-image QA text into question records, annotation records
# and parallel column lists. Question and answer ids both start at 1 and
# advance by one per question/answer pair.
annotation_data = []
question_data = []
answer_id = 1
question_id = 1
img_ids = []
imgs = []
questions = []
question_ids = []
answers = []
answer_ids = []
for img_id, img, qa_pairs in result2.itertuples(index=False, name=None):
    ques, ans = qa_processing(qa_pairs)
    # zip stops at the shorter list, so a question without a matching answer
    # is skipped instead of raising IndexError.
    for q, a in zip(ques, ans):
        ques_item = {
            'image_name': img,
            'image_id': int(img_id),
            'question': q,
            'question_id': int(question_id)
        }
        question_data.append(ques_item)

        ann_item = {
            # First space-separated word of the question.
            'question_type': q.split(" ")[0],
            'answers': a,
            'image_id': int(img_id),
            'question_id': int(question_id),
            'answer_id': int(answer_id)
        }
        annotation_data.append(ann_item)

        img_ids.append(img_id)
        imgs.append(img)
        questions.append(q)
        question_ids.append(question_id)
        answers.append(a)
        answer_ids.append(answer_id)
        answer_id += 1
        question_id += 1
        


In [85]:
# Dataset metadata embedded in the exported JSON files.
info = dict(
    description='This is v0.1 of the Cartoon-VQA dataset.',
    url='http://cartoonvqa.org',
    version='0.1',
    year=2023,
    contributor='Deakin Team',
    date_created='2023-06-10',
)

In [86]:
# Question file payload: dataset metadata plus the list of question records.
json_question_data ={
    "info": info,
    'data_type': "The Simpsons",
    "task_type": "Open-End",
    "question": question_data
}
# The with-block closes the file on exit, so no explicit close() is needed.
with open("v0.1_cartoon_question.json", "w", encoding="utf-8") as json_file:
    # Write the dictionary to the JSON file
    json.dump(json_question_data, json_file)


In [87]:
# Annotation file payload: dataset metadata plus the list of answer records.
json_annotation_data ={
    "info": info,
    'data_type': "The Simpsons",
    "task_type": "Open-End",
    "annotations": annotation_data
}
# The with-block closes the file on exit, so no explicit close() is needed.
with open("v0.1_cartoon_annotations.json", "w", encoding="utf-8") as json_file:
    # Write the dictionary to the JSON file
    json.dump(json_annotation_data, json_file)

In [88]:
# One row per question/answer pair, saved as CSV without the index column.
data = pd.DataFrame({
    "id": img_ids,
    "img": imgs,
    "question id": question_ids,
    "question": questions,
    "answer id": answer_ids,
    "answer": answers,
})
data.to_csv("v0.1_cartoon_triples.csv", index=False)


In [89]:
# Writes the same DataFrame to the same CSV path again (index column omitted).
data.to_csv("v0.1_cartoon_triples.csv", index=False)