# OpenAI API

In [1]:
import os

import openai
from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv() into the environment.
_ = load_dotenv(find_dotenv())

# Pass the key from the environment to the OpenAI client (None when the variable is unset).
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [2]:
# JSON Schema passed as the `parameters` of the `classify_prompts` function call:
# an object with a single required `result` array, where every element pairs an
# NLP task name with the number of prompts assigned to that task.
schema = {
    "title": "Template",
    "description": "The properties of the result.",
    "required": ["result"],
    "type": "object",
    "properties": {
        "result": {
            "type": "array",
            "description": "The result of the prompt",
            "items": {
                "type": "object",
                "required": ["nlptask", "numberOfPrompts"],
                "properties": {
                    "nlptask": {
                        "type": "string",
                        "description": "The name of the NLP task",
                    },
                    "numberOfPrompts": {
                        "type": "integer",
                        "description": "The number of prompts that were classified as the NLP task",
                    },
                },
            },
        },
    },
}

In [3]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Ask the chat model to classify `prompt` and return the function-call arguments.

    The request forces the model to call the ``classify_prompts`` function,
    whose parameters are described by the module-level ``schema``.

    Args:
        prompt: Content of the user message sent after the fixed system message.
        model: Name of the chat model to query.

    Returns:
        The ``arguments`` attribute of the function call found in the first
        choice of the response.
    """
    function_name = "classify_prompts"
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Classify the given prompts according to NLP tasks."},
            {"role": "user", "content": prompt},
        ],
        functions=[{"name": function_name, "parameters": schema}],
        function_call={"name": function_name},
        temperature=0,  # degree of randomness of the model's output
    )
    first_choice = response.choices[0]
    return first_choice.message.function_call.arguments

# P3

In [3]:
import json
from promptsource.templates import TemplateCollection, get_templates_data_frame

In [5]:
# Load the promptsource templates as a DataFrame; the last expression displays it.
df = get_templates_data_frame()
df

Unnamed: 0,id,dataset,subset,name,reference,original_task,choices_in_prompt,metrics,answer_choices,jinja
0,94577b75-2eac-4eae-b367-3b413c4188c6,super_glue,record,Add sentence after (continuation choices),,True,False,[Accuracy],{% for entity in entities[:-1] %} {{ query | r...,"After reading the article, write another sente..."
1,24c267d4-359e-40a9-83d2-bff904d63b09,super_glue,record,Add sentence after after (continuation choices),,True,False,[Accuracy],{% for entity in entities[:-1] %} {{ query | r...,"Summary:\n\n- {{ passage.split(""@highlight"")[1..."
2,e68d13c5-df75-4de0-b59e-f2eaf4af6ce7,super_glue,record,Can you figure out…,,True,False,[Squad],"{{ entities | join(""|||"") }}",{{ passage }} \n{{ query }} \nCan you figure o...
3,df8d0822-2cad-42de-8191-687ae47f6098,super_glue,record,GPT-3 style (continuation choices),Brown et al. 2020,True,False,[Accuracy],{% for entity in entities[:-1] %} - {{ query |...,"{{ passage | replace(""@highlight"", ""\n- "") }} ..."
4,64013fb3-1afd-4e5a-8777-b164ca3b8e18,super_glue,record,GPT-3 style summary only (continuation choices),Brown et al. 2020,True,False,[Accuracy],{% for entity in entities[:-1] %} - {{ query |...,"{{ passage.split(""@highlight"")[0] }}\n\nSummar..."
...,...,...,...,...,...,...,...,...,...,...
2080,eed32ee4-ebc3-499f-ba61-e91461f56ccb,acronym_identification,,find_acronym,"Given the tokens, find the abbreviation for an...",True,False,[Other],,{% set random_exp = '' %}{% set _dummy = none ...
2081,64f438f2-9968-459f-82d2-24bad632b358,acronym_identification,,find_acronym_meaning,"Given the tokens, find the expansion of an abb...",True,False,[Other],,{% set random_abbr = '' %}\n{% set _dummy = no...
2082,e4e42433-0e37-4aa5-bbce-7f336ecac6a3,acronym_identification,,find_acronyms_and_expansions,"Given the tokens, find the abbreviation mappin...",True,False,[Other],,{% set _dummy = none %}\n{% set abbr_exp_dict ...
2083,cae58242-cde9-472d-ae9e-56fc7e79c0d1,acronym_identification,,list_abbreviations,"Given the tokens, list the abbreviations. Metr...",True,False,[Other],,List all the acryonyms in the following space-...


In [6]:
templates = TemplateCollection()

# Every key of `datasets_templates` is unpacked below as a (dataset, subset) pair.
datasets = list(templates.datasets_templates.keys())

# Map "<dataset>_<subset>" to the list of jinja prompt templates for that pair.
prompts = {}
for dataset, subset in datasets:
    same_dataset = df["dataset"] == dataset
    prompts_for_dataset = df.loc[same_dataset & (df["subset"] == subset), "jinja"].to_list()
    if not prompts_for_dataset:
        # No row matched the subset value: fall back to rows whose subset is null.
        prompts_for_dataset = df.loc[same_dataset & df["subset"].isnull(), "jinja"].to_list()

    prompts[f"{dataset}_{subset}"] = prompts_for_dataset

In [7]:
# Number of prompt templates collected for every "<dataset>_<subset>" key.
prompt_sum = {key: len(collected) for key, collected in prompts.items()}
# Peek at a few of the keys.
list(prompt_sum)[62:66]

['wiki_qa_None',
 'scitail_snli_format',
 'scitail_tsv_format',
 'crows_pairs_None']

In [8]:
# Largest number of prompt templates attached to any single "<dataset>_<subset>" key.
max(prompt_sum.values())

20

In [9]:
# Count distinct values of the `dataset` column (subsets are not distinguished here).
num_of_datasets = len(set(df['dataset'].to_list()))
print(f"number of datasets {num_of_datasets}")

number of datasets 180


In [26]:
# Total number of prompt templates, summed over every "<dataset>_<subset>" key.
num_of_prompts = sum(prompt_sum.values())
print(f"number of prompts: {num_of_prompts}")

number of prompts: 2085


In [11]:
# Total prompt count divided by the number of distinct dataset names.
print(f"average prompt per dataset: {num_of_prompts / num_of_datasets}")

average prompt per dataset: 11.583333333333334


In [34]:
# Result containers, keyed by "<dataset>_<subset>":
# `valid` holds responses whose task counts add up to the number of prompts sent,
# `invalid` holds the ones that do not and need a manual look.
valid = {}
invalid = {}

In [35]:
# Classify the prompt templates of every dataset key with the model and sort each
# response into `valid` (task counts add up to the number of prompts sent) or
# `invalid` (counts differ, or the response could not be interpreted).
for key in prompt_sum.keys():
    # Number the prompts and separate them visibly so each template is delimited.
    formatted_list = "\n\n=============\n".join(
        f"prompt_{index+1}: {value}" for index, value in enumerate(prompts[key])
    )
    prompt_str = f"""The prompts are delimited by triple backticks.

```
{formatted_list}
```
"""

    raw_response = get_completion(prompt_str, model="gpt-4")
    try:
        res = json.loads(raw_response)
        tasks_sum = sum(r["numberOfPrompts"] for r in res["result"])
    except (json.JSONDecodeError, KeyError, TypeError) as err:
        # The response text comes from the model, so it may be malformed JSON or
        # lack the expected keys. Record it for manual review instead of aborting
        # the whole run and discarding the results gathered so far.
        invalid[key] = {
            "num_prompts": prompt_sum[key],
            "task_sum": None,
            "response": raw_response,
            "error": repr(err),
        }
        continue

    if prompt_sum[key] == tasks_sum:
        valid[key] = {
            "task_sum": tasks_sum,
            "response": res
        }
    else:
        invalid[key] = {
            "num_prompts": prompt_sum[key],
            "task_sum": tasks_sum,
            "response": res
        }

In [36]:
# Report how many dataset keys ended up in each result container.
print(f"Valid: \n{len(valid)}")
print(f"Invalid: \n{len(invalid)}")

Valid: 
273
Invalid: 
6


In [37]:
# Persist the accepted classifications as JSON indented by 4 spaces.
valid_json = json.dumps(valid, indent=4)
with open("p3_classified_prompts.json", "w") as f:
    f.write(valid_json)

In [38]:
# Persist the rejected classifications separately so they can be reviewed by hand.
invalid_json = json.dumps(invalid, indent=4)
with open("p3_not_classified_prompts.json", "w") as f:
    f.write(invalid_json)

### Manually classify invalid results...

In [13]:
def get_num_of_prompts_for_task(nlp_tasks_per_dataset) -> dict:
    """Sum the number of prompts per NLP task over all known dataset keys.

    Only keys of the module-level ``prompt_sum`` are visited. A dataset whose
    entry is missing from ``nlp_tasks_per_dataset``, or whose entry lacks one
    of the expected keys, stops contributing at the point of the lookup failure.

    Args:
        nlp_tasks_per_dataset: Mapping of dataset key to an entry shaped like
            ``{"response": {"result": [{"nlptask": ..., "numberOfPrompts": ...}]}}``.

    Returns:
        Dict mapping each task name to its accumulated prompt count.
    """
    totals = {}
    for dataset_key in prompt_sum:
        try:
            entries = nlp_tasks_per_dataset[dataset_key]["response"]["result"]
            for entry in entries:
                task_name = entry["nlptask"]
                totals[task_name] = totals.get(task_name, 0) + entry["numberOfPrompts"]
        except KeyError:
            # Best effort: skip datasets without a usable classification.
            continue
    return totals

In [18]:
# Read both classification files inside `with` blocks so the file handles are
# closed as soon as parsing finishes.
with open("p3_classified_prompts.json") as f:
    gpt_classified = json.load(f)
with open("p3_manually_classified_prompts.json") as f:
    manually_classified = json.load(f)

# In the merged dict, a manually classified entry replaces a GPT entry with the same key.
tasks_per_dataset = get_num_of_prompts_for_task({**gpt_classified, **manually_classified})

<generator object <genexpr> at 0x7f8b61030c10>


In [19]:
# Show the per-task prompt totals ordered from the largest count to the smallest.
dict(sorted(tasks_per_dataset.items(), key=lambda item: item[1], reverse=True))

{'Question Answering': 395,
 'Text Generation': 263,
 'Text Classification': 215,
 'Natural Language Inference': 185,
 'Sentiment Analysis': 145,
 'Multiple Choice Question Answering': 121,
 'Paraphrase Identification': 56,
 'Named Entity Recognition': 45,
 'Text Summarization': 44,
 'Question Generation': 38,
 'Text Completion': 38,
 'Coreference Resolution': 28,
 'Extractive Question Answering': 27,
 'Pronoun Disambiguation': 26,
 'Textual Entailment': 25,
 'Commonsense Question Answering': 22,
 'Summarization': 20,
 'Cloze Test': 18,
 'Cloze Task': 18,
 'Emotion Detection': 16,
 'Semantic Textual Similarity': 14,
 'Text Simplification': 13,
 'Duplicate Question Detection': 12,
 'Intent Classification': 12,
 'Natural Language Generation': 11,
 'Word Sense Disambiguation': 10,
 'Dialogue Systems': 10,
 'Semantic Similarity': 10,
 'Next Word Prediction': 10,
 'Subjectivity Analysis': 10,
 'Masked Language Modeling': 9,
 'Topic Classification': 9,
 'Entity Recognition': 8,
 'Information

In [35]:
# Number of distinct task labels present in the aggregated totals.
len(tasks_per_dataset)

85