In [8]:
import os

import openai
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())  # read local .env file

openai.api_key = os.getenv('OPENAI_API_KEY')

schema = {
    "title": "Template",
    "description": "The properties of the result.",
    "required": ["result"],
    "type": "object",
    "properties": {
        "result": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["nlptask", "numberOfPrompts"],
                "properties": {
                    "nlptask": {
                        "type": "string",
                        "description": "The category of the NLP task",
                    },
                    "numberOfPrompts": {
                        "type": "integer",
                        "description": "The number of instructions that were classified as the NLP task category",
                    },
                }
            }
        }
    }
}

def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [
        {"role": "system", "content": "Classify the given instructions according to a NLP tasks category."},
        {"role": "user", "content": prompt}
    ]
    functions = [
        {"name": "classify_prompts", "parameters": schema}
    ]
    function_call = {
        "name": "classify_prompts",
    }
    response = openai.chat.completions.create(
        model=model,
        messages=messages,
        functions=functions,
        function_call=function_call,
        temperature=0,  # this is the degree of randomness of the model's output\n",
    )
    return response.choices[0].message.function_call.arguments


## Read json files

In [24]:
import json

self_instruct = open("../data/self_instruct/all_instances_82K.json")
dynosaur = open("../data/dynosaur/instructions-full.json")

self_instruct_all_instructions = json.load(self_instruct)
dynosaur_all_instructions = json.load(dynosaur)

In [25]:
print(f"""
Self Instruct: {len(self_instruct_all_instructions)}
Dynosaur: {len(dynosaur_all_instructions)}
""")


Self Instruct: 82439
Dynosaur: 5740


In [26]:
import random

self_instruct_random_instructions = random.choices(self_instruct_all_instructions, k=200)
dynosaur_random_instructions = random.choices(dynosaur_all_instructions, k=200)

In [29]:
self_instruct_instructions = [instruction["instruction"] for instruction in self_instruct_random_instructions]
dynosaur_instructions = [instruction["instruction"] for instruction in dynosaur_random_instructions]

print(f"""
Self Instruct: {len(self_instruct_instructions)}\n{self_instruct_instructions[0:2]}
Dynosaur: {len(dynosaur_instructions)}\n{dynosaur_instructions[0:2]}
""")


Self Instruct: 200
['Read in a number of words, and output their frequency. output one word per line, with the most frequent word first.', 'List three benefits of running.']
Dynosaur: 200
['Given a code snippet, identify the size of the code.', 'Given a woodworking machine, determine if it can be made to run on a single-phase 240V, 20A circuit and what is involved in doing so.']


## Categorize

In [30]:
# Self Instruct
self_instruct_results = []
for instruction in self_instruct_instructions:
    prompt = f"""The instruction is delimited by triple backticks.

```
{instruction}
```
"""

    self_instruct_results.append(json.loads(get_completion(prompt, model="gpt-4")))

self_instruct_results[0:2]

[{'result': [{'nlptask': 'Text Classification', 'numberOfPrompts': 1}]},
 {'result': [{'nlptask': 'Text Generation', 'numberOfPrompts': 1}]}]

In [None]:
# Dynosaur
dynosaur_results = []
for instruction in dynosaur_instructions:
    prompt = f"""The instruction is delimited by triple backticks.

```
{instruction}
```
"""
    
    dynosaur_results.append(json.loads(get_completion(prompt, model="gpt-4")))
    
dynosaur_results[0:2]

## Sort

In [32]:
self_instruct_categories = [item["result"][0]["nlptask"] for item in self_instruct_results]
dynosaur_categories = [item["result"][0]["nlptask"] for item in dynosaur_results]

print(f"""
Self Instruct: {self_instruct_categories[0:2]}
Dynosaur: {dynosaur_categories[0:2]}
""")


Self Instruct: ['Text Classification', 'Text Generation']
Dynosaur: ['Question Answering', 'Question Answering']


In [33]:
from collections import Counter

self_instruct_sorted = Counter(self_instruct_categories)
dynosaur_sorted = Counter(dynosaur_categories)

print(f"""
Self Instruct: {self_instruct_sorted.most_common()[0:8]}
Dynosaur: {dynosaur_sorted.most_common()[0:8]}
""")



Self Instruct: [('Text Generation', 42), ('Question Answering', 34), ('Text Classification', 31), ('Sentiment Analysis', 22), ('Information Extraction', 20), ('Text Analysis', 6), ('Not an NLP task', 4), ('Not a NLP task', 4)]
Dynosaur: [('Information Extraction', 39), ('Question Answering', 37), ('Text Classification', 27), ('Text Generation', 24), ('Question Generation', 18), ('Natural Language Inference', 6), ('Sentiment Analysis', 6), ('Code Understanding', 5)]


In [34]:
self_instruct_sorted.most_common()

[('Text Generation', 42),
 ('Question Answering', 34),
 ('Text Classification', 31),
 ('Sentiment Analysis', 22),
 ('Information Extraction', 20),
 ('Text Analysis', 6),
 ('Not an NLP task', 4),
 ('Not a NLP task', 4),
 ('Semantic Similarity', 2),
 ('Machine Translation', 2),
 ('Text Transformation', 2),
 ('Non-NLP Task', 2),
 ('Language Identification', 2),
 ('Text Summarization', 2),
 ('Text Completion', 2),
 ('Sentence Classification', 1),
 ('Mathematical Reasoning', 1),
 ('Programming', 1),
 ('Grammar Check', 1),
 ('None', 1),
 ('Not applicable', 1),
 ('Tokenization', 1),
 ('Fact Checking', 1),
 ('Text Cleaning', 1),
 ('Task-oriented dialogue systems', 1),
 ('Machine Translation Evaluation', 1),
 ('Text Similarity', 1),
 ('Text Processing', 1),
 ('Spell Checking', 1),
 ('string matching', 1),
 ('Question Generation', 1),
 ('Math Word Problems', 1),
 ('Information Retrieval', 1),
 ('Code Understanding', 1),
 ('Named Entity Recognition', 1),
 ('Speech Recognition', 1),
 ('Word Genera