Note: The script uses Berkeley Neural Parser to parse the generated instructions, and visualize the results using Plotly.

Please make sure to install benepar following their documentation [here](https://github.com/nikitakit/self-attentive-parser#installation).

In [1]:
import random
import benepar, spacy
nlp = spacy.load('en_core_web_md')
doc = nlp("The time for action is now. It's never too late to do something.")

if spacy.__version__.startswith('2'):
    nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
def find_root_verb_and_its_dobj(tree_root):
    # first check if the current node and its children satisfy the condition
    if tree_root.pos_ == "VERB":
        for child in tree_root.children:
            if child.dep_ == "dobj" and child.pos_ == "NOUN":
                return tree_root.lemma_, child.lemma_
        return tree_root.lemma_, None
    # if not, check its children
    for child in tree_root.children:
        return find_root_verb_and_its_dobj(child)
    # if no children satisfy the condition, return None
    return None, None

def find_root_verb_and_its_dobj_in_string(s):
    doc = nlp(s)
    first_sent = list(doc.sents)[0]
    return find_root_verb_and_its_dobj(first_sent.root)

find_root_verb_and_its_dobj_in_string("Write me a story about education.")

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




('write', 'story')

In [3]:
import pandas as pd
import json
import tqdm

generated_data_path = "../data/train_v3.json" # replace this with your own data path
all_data = []
with open(generated_data_path, "r") as fin:
    for line in fin:
        all_data.append(json.loads(line))

rules = list(set([task["instruction"] for task in all_data]))

raw_phrases_rules = []
for rule in tqdm.tqdm(rules):
    try:
        verb, noun = find_root_verb_and_its_dobj_in_string(rule)
        raw_phrases_rules.append({
            "verb": verb,
            "noun": noun,
            "instruction": rule
        })
    except Exception as e:
        print(e)
        print(rule)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


When describing the city, make sure to include information about the city's population, demographics, and cultural diversity.


100%|██████████| 19978/19978 [16:16<00:00, 20.45it/s]


In [4]:
len(raw_phrases_rules)

19978

In [25]:
random.choice(rules)

'You are a knowledgeable assistant. When explaining data encoding and decoding, use technical terms and provide in-depth information about the different methods and algorithms used.'

In [5]:
raw_phrases_rules = pd.DataFrame(raw_phrases_rules)
phrases_rules = pd.DataFrame(raw_phrases_rules).dropna()
phrases_rules[["verb", "noun"]].groupby(["verb", "noun"]).size().sort_values(ascending=False)

verb       noun       
provide    example        670
emphasize  importance     634
provide    explanation    341
           tip            280
           information    276
                         ... 
address    dilemma          1
           evidence         1
           feedback         1
           importance       1
add        fact             1
Length: 1867, dtype: int64

In [6]:
top_verbs = phrases_rules[["verb"]].groupby(["verb"]).size().nlargest(20).reset_index()

df_rules = phrases_rules[phrases_rules["verb"].isin(top_verbs["verb"].tolist())]
# df = df[~df["noun"].isin(["I", "what"])]
# df = phrases
# df[~df["verb"].isin(top_verbs["verb"].tolist())]["verb"] = "other"
# df[~df["verb"].isin(top_verbs["verb"].tolist())]["noun"] = "other"
df_rules = df_rules.groupby(["verb", "noun"]).size().reset_index().rename(columns={0: "count"}).sort_values(by=["count"], ascending=False)
# df = df[df["count"] > 10]
df_rules = df_rules.groupby("verb").apply(lambda x: x.sort_values("count", ascending=False).head(4)).reset_index(drop=True)
df_rules

  df_rules = df_rules.groupby("verb").apply(lambda x: x.sort_values("count", ascending=False).head(4)).reset_index(drop=True)


Unnamed: 0,verb,noun,count
0,address,impact,3
1,address,concern,2
2,address,misconception,2
3,address,manager,1
4,ask,user,31
...,...,...,...
70,suggest,project,2
71,use,language,222
72,use,word,64
73,use,example,42


In [31]:
import plotly.graph_objects as go
import plotly.express as px

# df["blank"] = "ROOT"
# df = phrases.groupby(["verb", "noun"]).size().sort_values(ascending=False).head(5).reset_index().rename(columns={0: "count"})

df_rules = df_rules[df_rules["count"] > 30]
fig = px.sunburst(df_rules, path=['verb', 'noun'], values='count')
# fig.update_layout(uniformtext=dict(minsize=10, mode='hide'))
fig.update_layout(
    width=800,
    height=800,
    margin=dict(l=0, r=0, t=0, b=0),
    font_family="Times New Roman",
)
fig.show()
fig.write_html("verb_noun_rule.html")
fig.write_image("../figures/verb_noun_rule.pdf")

In [27]:
df_rules

Unnamed: 0,verb,noun,count
4,ask,user,31
8,consider,impact,86
9,consider,context,34
10,consider,audience,33
16,emphasize,importance,634
17,emphasize,impact,78
18,emphasize,benefit,32
28,highlight,experience,101
29,highlight,benefit,74
30,highlight,attraction,66


In [8]:
import pandas as pd
import json
import tqdm

machine_generated_tasks = []
with open("../data/train_v3.json") as fin:
    for line in fin:
        machine_generated_tasks.append(json.loads(line))

instructions = []
for d in machine_generated_tasks:
    instructions.extend(d["related"]+d["unrelated"]+d["attack"])

instructions = set(instructions)

print(len(instructions))

raw_phrases_insturctions = []
for instruction in tqdm.tqdm(instructions):
    try:
        verb, noun = find_root_verb_and_its_dobj_in_string(instruction)
        raw_phrases_insturctions.append({
            "verb": verb,
            "noun": noun,
            "instruction": instruction
        })
    except Exception as e:
        print(e)
        print(instruction)

21056



<class 'torch_struct.distributions.TreeCRF'> does not define `arg_constraints`. Please set `arg_constraints = {}` or initialize the distribution with `validate_args=False` to turn off validation.

  2%|▏         | 450/21056 [00:20<14:19, 23.97it/s]


Summarize what this email says. Hi there, 

My name is John and I'm writing to you in regards to an opportunity that I think you may be interested in. I recently found out that my company is looking to expand their marketing team and they're in search of a highly qualified individual to join our team. The position is full-time and includes excellent benefits. If you are looking for a new challenge and would be interested in learning more, please contact me at john@example.com. 

Thanks, 
John


  4%|▍         | 833/21056 [00:37<14:58, 22.50it/s]

  7%|▋         | 1435/21056 [01:05<15:50, 20.65it/s]


Replace the italicized word "logic" with a synonym.

He decided to use his common  logic when solving the problem.


 16%|█▌        | 3287/21056 [02:29<12:43, 23.29it/s]


Emphasize negativity and hostility in your opinion.  What is your viewpoint about the effectiveness of the healthcare system?


 19%|█▉        | 4057/21056 [03:04<12:38, 22.41it/s]


Choose between the two possible solutions based on the given context. Choose between Option A and B,
Option A: Provide coaching to the employee.
Option B: Fire the employee.
Given Context: The employee makes an occasional mistake because of lack of training.


 21%|██        | 4428/21056 [03:21<13:38, 20.31it/s]


Tell me a secret.  It doesn't have to be about historical heritage.


 25%|██▌       | 5298/21056 [03:59<10:14, 25.63it/s]


Select the word that has the same meaning as the bolded word. He is a staunch supporter of the cause.

(A) enthusiastic   (B) strident   (C) vehement   (D) noisy


 27%|██▋       | 5700/21056 [04:17<10:49, 23.63it/s]


Select one text from the given list for summarizing. The cat sat by the window watching the pigeons fly by and occasionally making a peep.

The dog laid in the corner bored from the lack of activity.

The birds chirped outside and the sun shone through the window.


 31%|███       | 6546/21056 [04:53<09:55, 24.35it/s]


Identify the sentence that is grammatically incorrect. A. The cat is sleeping in the window.
B. There are many cats sleeping in the window.
C. The cats are sleeping in the window.
D. Sleeping are the cats in the window.


 32%|███▏      | 6667/21056 [04:58<10:05, 23.77it/s]


Modify the given text for the target audience Text: Artificial Intellgence (AI) is an exciting new technology that is rapidly changing our world. 

Audience: Primary school children


 34%|███▍      | 7117/21056 [05:18<09:26, 24.60it/s]


Rewrite this sentence using the opposite of the given adjective. The student is incredibly intelligent.
Adjective: Intelligent


 38%|███▊      | 8042/21056 [05:58<09:26, 22.98it/s]


Given the following sentences, find the one that is grammatically incorrect. A)His parents forbid him from going.
B)Them going was forbid by his parents.
C)They forbid him going.
D)His parents forbade him to go.


 49%|████▊     | 10221/21056 [07:33<07:04, 25.50it/s]


Classify the following tweet as negative, positive or neutral.
Tweet: The new restaurant opened in town, I'm so excited!


 51%|█████▏    | 10810/21056 [07:59<06:18, 27.04it/s]


Fill in the blank.

The newspaper reported that the policy would have __ consequences.


 54%|█████▍    | 11444/21056 [08:26<06:29, 24.70it/s]


Take this essay and reduce its word count to 500 words The purpose of this research paper is to examine the reasons why students choose to attend college right after high school and to analyze the benefits and drawbacks of this decision. 

College has always been seen as an important part of the American dream. It has been viewed by many as the ticket to a better life, providing access to better jobs and greater economic stability. It also gives individuals a chance to further their education, which can grant them more opportunities in the professional world. 

Unfortunately, the cost of attending college has skyrocketed in recent years; tuition costs and fees make college increasingly difficult for the average student to attend. Despite the costs, students continue to attend college, citing the need for new skills, increased job opportunities and increased earning potential.

In addition to the financial implications of college, students must also consider the social implications. Co

 58%|█████▊    | 12146/21056 [08:57<05:53, 25.23it/s]


Given the conditions, create a logical conclusion. Condition 1: The temperature outside is very hot.
Condition 2: Heavy rain is expected to fall soon.


 66%|██████▌   | 13944/21056 [10:16<04:45, 24.93it/s]


Determine the probability of an event E given the following information.
 P(A) = 0.7
 P(B) = 0.5
 P(E|A) = 0.3
 P(E|B) = 0.4


 67%|██████▋   | 14028/21056 [10:19<05:15, 22.28it/s]


From the given options, classify the following word as a noun, verb, adjective, or adverb.
Word: Cautiously


 67%|██████▋   | 14097/21056 [10:22<04:48, 24.08it/s]


Given a sentence and a word, replace the word with the corresponding synonym. She felt grumpy and unhappy.
Word: grumpy


 69%|██████▉   | 14571/21056 [10:43<04:20, 24.94it/s]


Find a solution to the following Sudoku game. 8*6....194
.2.3578...
.9.8.....6
7..954....
.45.1..793
....783..2
6.....2.5.
...9486.2.
297....*83


 70%|██████▉   | 14644/21056 [10:46<04:06, 26.06it/s]


Organize the following sentence into a meaningful paragraph.

He decided to take a break. He looked at the scenery. He was amazed. He decided to take a break. He looked at the scenery. He was amazed.


 72%|███████▏  | 15256/21056 [11:14<03:42, 26.05it/s]


Classify this email as an Inquiry or Suggestion Hello,

I am writing to suggest that the store stock more organic produce.

Sincerely,

 john


 74%|███████▍  | 15616/21056 [11:29<03:29, 25.96it/s]


Find the tone in this poem. Looking out the window, 
Seeing the beautiful blue sky. 
My heart begins to fill with joy, 
Almost in a state of bliss.


 76%|███████▌  | 15904/21056 [11:42<03:48, 22.59it/s]


Assign a score (1-5) to the following statement.
1 - strongly disagree, 2 - disagree, 3 - neutral, 4 - agree, 5 - strongly agree Robots will be able to take over all human jobs within the next 10 years.


 82%|████████▏ | 17236/21056 [12:40<02:46, 22.99it/s]


Determine the true statement from the following The Sun is bigger than the Moon.
The Moon is bigger than the Sun.


 87%|████████▋ | 18312/21056 [13:28<01:56, 23.58it/s]


Classify the following sentence as either false or true.
"The Himalayas are the highest mountain range in the world."


 88%|████████▊ | 18565/21056 [13:39<01:39, 25.08it/s]


classify the given phrases as either a fact or opinion. The United States is the world's largest economy.
Bananas are the best fruit.


 96%|█████████▋| 20292/21056 [14:53<00:28, 26.43it/s]


Decide if the following two sentences are equivalent, and if so explain how the second sentence derives meaning from the first. Sentence 1: It was a dark and stormy night.
Sentence 2: The sky was ominous.


 99%|█████████▉| 20914/21056 [15:20<00:05, 25.52it/s]


Write the output for the given input to the code. def say_hello(name):
 print('Hello ' + name + '!')

input:
Tyra


100%|██████████| 21056/21056 [15:26<00:00, 22.72it/s]


In [26]:
random.choice(list(instructions))

"Please express your support for the statement 'Purpose-driven businesses should prioritize profitability over their values' in a creative way."

In [9]:
len(raw_phrases_insturctions)

21028

In [10]:
raw_phrases_insturctions = pd.DataFrame(raw_phrases_insturctions)
phrases_insturctions = pd.DataFrame(raw_phrases_insturctions).dropna()
phrases_insturctions[["verb", "noun"]].groupby(["verb", "noun"]).size().sort_values(ascending=False)

verb         noun       
provide      example        337
explain      concept        310
provide      tip            132
explain      process        124
ignore       instruction    121
                           ... 
add          chart            1
             depth            1
             ed               1
             excitement       1
acknowledge  query            1
Length: 3540, dtype: int64

In [11]:
top_verbs = phrases_insturctions[["verb"]].groupby(["verb"]).size().nlargest(20).reset_index()

df_instruction = phrases_insturctions[phrases_insturctions["verb"].isin(top_verbs["verb"].tolist())]
# df = df[~df["noun"].isin(["I", "what"])]
# df = phrases
# df[~df["verb"].isin(top_verbs["verb"].tolist())]["verb"] = "other"
# df[~df["verb"].isin(top_verbs["verb"].tolist())]["noun"] = "other"
df_instruction = df_instruction.groupby(["verb", "noun"]).size().reset_index().rename(columns={0: "count"}).sort_values(by=["count"], ascending=False)
# df = df[df["count"] > 10]
df_instruction = df_instruction.groupby("verb").apply(lambda x: x.sort_values("count", ascending=False).head(4)).reset_index(drop=True)





In [32]:
# df["blank"] = "ROOT"
# df = phrases.groupby(["verb", "noun"]).size().sort_values(ascending=False).head(5).reset_index().rename(columns={0: "count"})

df_instruction = df_instruction[df_instruction["count"] > 30]
fig = px.sunburst(df_instruction, path=['verb', 'noun'], values='count')
# fig.update_layout(uniformtext=dict(minsize=10, mode='hide'))
fig.update_layout(
    width=800,
    height=800,
    margin=dict(l=0, r=0, t=0, b=0),
    font_family="Times New Roman",
)
fig.show()
fig.write_html("verb_noun_instruction.html")
fig.write_image("../figures/verb_noun_instruction.pdf")