<a href="https://colab.research.google.com/github/meghorikawa/ULLM/blob/main/ULLMs_Groq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install groq

Collecting groq
  Downloading groq-0.11.0-py3-none-any.whl.metadata (13 kB)
Collecting httpx<1,>=0.23.0 (from groq)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->groq)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->groq)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading groq-0.11.0-py3-none-any.whl (106 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.5/106.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB

In [None]:
import pandas as pd
from google.colab import userdata
import os
from groq import Groq
import json
import time


# Read the key from Colab's secret store: add a secret named 'groq_api_key'
# holding your own Groq API key (key icon in the Colab sidebar).
api_key = userdata.get('groq_api_key')

# Export it through os.environ instead of the `%env` magic: `%env` echoes the
# value into the cell output, which leaks the key whenever the notebook is
# saved or shared.
os.environ["GROQ_API_KEY"] = api_key

env: GROQ_API_KEY=<redacted>


In [None]:
# Path of the CSV holding the few-shot prompt examples; example_maker() reads it.
google_sheet = "/content/prompt_examples.csv"

# Annotated dialogue turns, loaded from the Colab runtime's /content directory.
df = pd.read_csv("/content/AnnotatedData.csv")
df.head()

In [None]:
# Split df into one dataframe per task: "Comparing Athletes" and
# "Buying a Birthday Present" (every row that is not "Comparing Athletes").
mask = df['task'] == "Comparing Athletes"

# Take explicit copies: result columns are written into these frames later on,
# and assigning into a boolean-mask slice of `df` raises SettingWithCopyWarning.
athletes_df = df[mask].copy()
bday_df = df[~mask].copy()

In [None]:
class IterDialog:
  """
  An iterator over the turns stored in a dialog dataframe.

  Every step returns the history of the dialog the current row belongs to as a
  flat list of "BOT: ..." / "STUDENT: ..." strings. The bot elicitation and the
  student input of the current row are the last two items of that list.

  The dataframe must provide the columns 'dialog_id', 'bot_elicitation',
  'user_input' and 'target_constr'.

  Attributes:
    df: the dataframe being iterated.
    pointer: index label of the current row, or None before the first step.
  """

  def __init__(self, df):
    """
    Initialize the iterator with the dataframe; no row is selected until the
    first call to next().
    """
    self.df = df
    self.pointer = None   # index label of the current row
    self._position = -1   # positional offset of the current row (-1 = not started)

  def __next__(self):
    """
    Advances to the next row and returns the history of its dialog.
    The turn in question is at the end of the returned list.

    Raises:
      StopIteration: when every row has already been visited.
    """
    next_position = self._position + 1
    if next_position >= len(self.df):
      raise StopIteration

    # Step by position and derive the label from this iterator's own frame, so
    # frames whose index is not a contiguous integer range work as well.
    self._position = next_position
    self.pointer = self.df.index[next_position]

    row = self.df.iloc[next_position]
    cur_id = row['dialog_id']
    # a smaller df containing only the current conversation
    cur_dialog_df = self.df[self.df['dialog_id'] == cur_id]

    # rows with a lower index and the same dialog_id came first: they form the history
    history = []
    for _, prev_row in cur_dialog_df[cur_dialog_df.index < self.pointer].iterrows():
      history.append(f"BOT: {prev_row['bot_elicitation']}")
      history.append(f"STUDENT: {prev_row['user_input']}")

    # add bot_elicitation and user input at the current point
    history.append(f"BOT: {row['bot_elicitation']}")
    history.append(f"STUDENT: {row['user_input']}")

    return history

  def __len__(self):
    """
    Number of rows (turns) in the dataframe.
    """
    return len(self.df)

  def __iter__(self):
    return self

  def get_target_constr(self):
    """
    For getting the target construct associated with the current turn.
    Returns None when there is no target construct being elicited.

    Raises:
      KeyError: when called before the first next().
    """
    if self._position < 0:
      raise KeyError("no current row: call next() before get_target_constr()")
    target_constr = self.df.iloc[self._position]["target_constr"]
    # pd.isna covers both NaN and None
    return None if pd.isna(target_constr) else target_constr

  def has_next(self):
    """
    Whether the iterator has a next message (False for an empty dataframe)
    """
    return self._position + 1 < len(self.df)

  def restart(self):
    """
    Moves pointer to beginning
    """
    self.pointer = None
    self._position = -1



In [None]:
def example_maker(task,  prompt_type, num, examples_source=None):
  """
  Method to compile examples based on the task and type of prompting
  ______________

  Parameters:
    task: either 'Birthday Present' or 'Comparing Athletes'; only example rows
      whose 'Task' column equals this value are used.
    prompt_type: 'FS' for few-shot without chain of thought. Any other value
      (e.g. 'COT') produces Chain-of-Thought examples.
    num: maximum number of examples; fewer are returned when the task has fewer rows.
    examples_source: optional CSV path or an already loaded DataFrame holding
      the examples. Defaults to the module-level `google_sheet` path.
  returns a string of the formatted examples to be included in the prompt
  (an empty string when num is 0 or no row matches the task).

  NOTE(review): the examples emit the key "construct", while the schemas shown
  to the model name it "construct_present" — confirm which one is intended.
  """
  if examples_source is None:
    examples_source = google_sheet
  if isinstance(examples_source, pd.DataFrame):
    examples_df = examples_source
  else:
    examples_df = pd.read_csv(examples_source)
  examples_df = examples_df[examples_df['Task'] == task].reset_index(drop=True)

  prompt_examples = ''

  for i in range(min(num, len(examples_df))):

    context = examples_df.loc[i, 'dialogue_context']
    response = examples_df.loc[i, 'student_input']

    # Fill in the COT JSON example output first.
    json_cot = {}
    json_cot["relevance_thought"] = examples_df.loc[i, "relevance_cot"]
    json_cot["relevance"] = str(examples_df.loc[i, 'relevant']) # Needs to be cast as string to use json.dumps

    # missing construct annotations are spelled out as the string "None"
    construct_thought = examples_df.loc[i, "target_construct_cot"]
    construct = examples_df.loc[i, 'target_construct']
    json_cot["construct_thought"] = "None" if pd.isna(construct_thought) else construct_thought
    json_cot["construct"] = "None" if pd.isna(construct) else construct

    # "Birthday Present" requires no factuality check
    if task == "Comparing Athletes":
      json_cot["factual_thought"] = examples_df.loc[i, "factual_cot"]
      json_cot["factual"] = str(examples_df.loc[i, "factual"])

    if prompt_type == "FS":
      # Few-shot JSON: drop the chain-of-thought keys, keep the rest as strings
      json_example = {key: str(value) for key, value in json_cot.items() if "thought" not in key}
    else:
      json_example = json_cot

    # default=str is only consulted for values json cannot encode natively
    # (e.g. NumPy scalars read from a typed column), which used to raise TypeError.
    json_example = json.dumps(json_example, default=str)

    prompt_examples += f"\nGiven the following CONTEXT:\n\n{context}\n{response}\n\n The OUTPUT should be: \n{json_example}\n\n"

  return prompt_examples

In [None]:
### Task instructions
# Instruction texts shown to the students; prompt_builder() inserts one of them
# into the prompt under "TASK INSTRUCTIONS".

#"Birthday Gift"
bg_task_instructions = '''Task Instructions: Your friend Max is having a birthday soon! Let's go shopping to buy him a present. Ask the shop assistant to help you find a gift he will like.
Remember that Max likes video games, music, and movies.
Here is some language that may help you:
You can use the present progressive form when you are searching for something
I am looking for a birthday gift for my friend.
I am searching for a birthday present for my friend.
You can use a comparative adjective to ask for more options
Do you have something cheaper?'''

#"Comparing athletes"
# The results table is embedded as JSON and is the reference for the factuality
# check. It must be valid JSON, so no trailing commas inside the objects.
# NOTE(review): the "High jump" values are given in metres ("130 m" / "150 m")
# while "Long jump" uses cm — confirm the unit against the task material.
ca_task_instructions = '''
You and Max are discussing the results of the school Decathlon, a competition with 10 events. You watched Day 1, but Max missed it. Explain what happened on Day 1.
Here is some language that may help you:
Peter did better than Andy in the Day 1 events.
Peter ran faster than Andy in the 100 meters.

{"events": [
    {
      "event": "100 metres",
      "peter": "13 seconds",
      "andy": "13.5 seconds"
    },
    {
      "event": "Long jump",
      "peter": "168 cm",
      "andy": "170 cm"
    },
    {
      "event": "Shot put",
      "peter": "9.1 m",
      "andy": "8.9 m"
    },
    {
      "event": "High jump",
      "peter": "130 m",
      "andy": "150 m"
    },
    {
      "event": "400 metres",
      "peter": "85 seconds",
      "andy": "100 seconds"
    }
  ]
}
'''

In [None]:
# JSON formats for the prompt: prompt_builder() shows one of these to the model
# as the schema its answer has to follow.

# Chain-of-Thought schema: every verdict is preceded by a reasoning field.
COT_JSON = '''{
  "relevance_thought": <Step-by-step reasoning for whether the student's answer was relevant to the question being asked>,
  "relevance": <Boolean for whether the answer is relevant or not>,
  "factual_thought": <Step-by-step reasoning for whether the student's last answer is factual, comparing the information given in the task instructions and the one privided by the student.>,
  "factual": <Boolean for whether the answer is factual or not>,
  "construct_thought": <Step-by-step reasoning for whether the student's last answer contains the specified target construct>,
  "construct_present": <Boolean for whether the answer contains the target construct or not, or "None" if the target construct is None>
}'''

# Few-shot schema: verdicts only, no reasoning fields.
FS_JSON = '''{
  "relevance": Boolean,
  "factual": Boolean,
  "construct_present": True/False or None if the target construct is None
}'''

def prompt_builder(examples, task, context, student_input, target_construct, prompt_type):

  """
  Prompt-builder method
  parameters:
  examples: the formatted examples to be included in the prompt
  task: the name of the task; "Birthday Present" selects the birthday
    instructions, any other value selects the "Comparing Athletes" instructions
    together with the request for a factuality verdict
  context: the previous dialogue context to be included in the prompt
  student_input: the student input to be included in the prompt
  target_construct: the target construct to be included in the prompt
    (converted with str(), so None is shown as "None")
  prompt_type: the type of prompting, either 'COT' or 'FS'
  returns: the prompt to be sent to the LLM
  raises: ValueError if prompt_type is neither 'COT' nor 'FS'

  NOTE(review): `context` is interpolated as-is, so a list of turns shows up in
  the prompt as a Python list literal — confirm whether a newline-joined string
  was intended.
  """
  # Pick the schema shown to the model; fail loudly on an unknown prompt type
  # instead of hitting an unbound local further down.
  if prompt_type == 'COT':
    schema = COT_JSON
  elif prompt_type == 'FS':
    schema = FS_JSON
  else:
    raise ValueError(f"prompt_type must be 'COT' or 'FS', got {prompt_type!r}")

  factual_prompt = ""
  if task == "Birthday Present":
    instructions = bg_task_instructions
  else:
    instructions = ca_task_instructions
    # only "Comparing Athletes" requires JSON with factuality information. Add this to the prompt if it's this task
    factual_prompt = "Finally, also inlcude information about whether the student's input is factually correct based on the task instructions."


  prompt = f'''
[INST]
You are a helpful teaching assistant for English as a second language.
ESL students are talking to a chatbot that works with string-matching to complete a task.
The chatbot does not always recognize when a user's answer is valid. When it does not, it asks the question again or tries to elicit an answer in a different way.


Based on the conversation history and the instructions of the chatbot task, your job is to output a JSON object, stating whether the last student response is relevant.
Importantly, we define "relevant" as providing a plausible answer to the question with all necessary information and which will not interrupt the flow of the conversation.
The JSON object should also state whether the message at hand contains a given grammatical construct. If the target construct is "None", the value of this property should be the string "None".
One property in a JSON object must never have more than one value.  The JSON must be valid, so all strings MUST be surrounded by double quotes (") and not by single quotes. Double quotes in strings should be escaped.
{factual_prompt}

Ignore typos and grammar issues. Answer based on what you understand the student's answer to be meant to represent.

The JSON object should follow this schema:

{schema}
______
Here are some examples for how to do this:

{examples}
______

Now it is your turn. Here is all the information you need.
______
TASK INSTRUCTIONS:
{instructions}
______
CONVERSATION HISTORY:
{context}
______
STUDENT RESPONSE:
{student_input}
______
CONSTRUCT:
{str(target_construct)}

[/INST]
  '''

  return prompt

In [None]:
# Groq API client shared by the cells below; the key is taken from the
# GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [None]:
def get_completion(prompt, model="mixtral-8x7b-32768", temperature=0):
  """
  Sends the prompt as a single user message to the Groq chat completions
  endpoint and returns the text of the first choice.

  Parameters:
    prompt: the full prompt text
    model: id of the Groq-hosted model to query
      (NOTE(review): check that the default id is still served by Groq)
    temperature: sampling temperature; the default of 0 is the original setting
  returns: the message content of the first choice. JSON mode is requested via
    response_format, so this is expected to be a JSON object serialized as a string.
  """
  completion = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": prompt}],
    temperature=temperature,
    top_p=1,
    stream=False,
    response_format={"type": "json_object"},
    stop=None,
  )

  return completion.choices[0].message.content


In [None]:
athletes_iter = IterDialog(athletes_df)
bday_iter = IterDialog(bday_df)

prompt_type = "FS" # NOTE: change to COT once FS is done
MAX_SHOTS = 5 # prompts are built with k = 0 .. MAX_SHOTS examples
REQUEST_PAUSE_SECONDS = 0.25 # pause after each request to prevent issues with rate limits

# Loop over tasks and number of examples
for iterator in [bday_iter, athletes_iter]:
  # `df` aliases the per-task frame, so completions are written straight into
  # athletes_df / bday_df (the frames saved to CSV afterwards).
  if iterator is athletes_iter:
    task = "Comparing Athletes"
    df = athletes_df
  else:
    task = "Birthday Present"
    df = bday_df

  # iterate through different numbers for the k-shot examples
  for k in range(0, MAX_SHOTS + 1):
    examples = example_maker(task, prompt_type, k)

    # column that stores the completions of this condition: task_prompttype_k (e.g. BirthdayPresent_FS_3)
    condition = task.replace(" ", "") + "_" + prompt_type + "_" + str(k)

    # if it's the first time this condition is used, add the corresponding column to the dataframe
    if condition not in df:
      print(condition)
      df[condition] = ""

    while iterator.has_next():
      history = next(iterator)
      row_label = iterator.pointer

      # Only get a completion if it has not been done before, so the cell can
      # be re-run after an interruption without repeating finished requests.
      stored = df.loc[row_label, condition]
      if not (stored == "" or pd.isna(stored)):
        continue

      # NOTE(review): `context` is a list of turns and is inserted into the
      # prompt as such — confirm whether it should be joined into one string.
      context = history[:-1]
      student_input = history[-1]
      target_constr = iterator.get_target_constr()
      prompt = prompt_builder(examples, task, context, student_input, target_constr, prompt_type)

      print(row_label)
      start = time.time()
      completion = get_completion(prompt)
      end = time.time() # stop the clock before the pause so only the request is timed

      df.loc[row_label, condition] = completion
      print("The completion took " + str(end - start))

      time.sleep(REQUEST_PAUSE_SECONDS)

    # rewind so the next k walks over the same rows again
    iterator.restart()

24
The completion took 0.7326431274414062
25
The completion took 0.7105712890625
26
The completion took 0.6947355270385742
27
The completion took 0.9924561977386475
28
The completion took 13.791840076446533
29
The completion took 15.92470669746399
30
The completion took 18.91845726966858
31
The completion took 18.870135068893433
32
The completion took 13.789597034454346
33
The completion took 12.812562465667725
34
The completion took 13.843945980072021
35
The completion took 13.838682651519775
36
The completion took 14.918617010116577
37
The completion took 15.877822399139404
38
The completion took 16.097659587860107
39
The completion took 15.820312976837158
40
The completion took 16.426499366760254
41
The completion took 14.952352523803711
42
The completion took 16.78404927253723
43
The completion took 16.91515588760376
44
The completion took 17.840951204299927
45
The completion took 13.797117233276367
46
The completion took 14.162178993225098
47
The completion took 13.926829099655151

In [None]:
# Write the "Birthday Present" frame, including the completion columns, to a
# comma-separated UTF-8 file in the working directory.
bday_df.to_csv(path_or_buf="bday_df.csv", encoding="utf-8", sep=",")

In [None]:
# Write the "Comparing Athletes" frame, including the completion columns, to a
# comma-separated UTF-8 file in the working directory.
athletes_df.to_csv(path_or_buf="athletes_df.csv", encoding="utf-8", sep=",")