In [None]:
# different path for cluster
# If the key file is not found one level up, the kernel was started from a different
# working directory, so change into the experiment folder that the relative paths
# used in this notebook ('../hf.key', '../openai.key', '../../data/...') are written against.
import os
if not  os.path.isfile('../hf.key'):
    %cd LLMs-for-Social-Robotics/code/Experiment\ 1
# list the current working directory as a quick sanity check
!ls

In [None]:
import pandas as pd
import openai
import time
import re
import gc
from scipy import stats
from statsmodels.stats.multitest import multipletests
from sklearn.metrics import mean_absolute_error
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import datetime 
import torch
import random
import multiprocessing
# Report the available compute before any model is loaded.
cuda_avail = torch.cuda.is_available()
print('cuda available:', cuda_avail)
print("Number of processors: ", multiprocessing.cpu_count())

# Pause (in seconds) inserted after every OpenAI API call.
RATE_LIMIT = 0.1


def _first_line(path):
    """Return the first line of the text file at `path`, stripped of surrounding whitespace."""
    with open(path, "r") as key_file:
        return key_file.readline().strip()


# Credentials live in files next to the code folder; only the first line of each is used.
hf_key = _first_line("../hf.key")
openai.api_key = _first_line('../openai.key')

## Model Completion Functions

In [8]:
def gpt_complete(prompt, model, max_tokens, prompt_ending, system_message, verbose=False):
    """
    Request a temperature-0 completion from the OpenAI API and return the raw response.

    `prompt_ending` is appended to `prompt` before the request is sent. Models whose
    name contains "gpt-4" or "turbo" (and does not contain "instruct") are queried
    through the chat endpoint, with `system_message` as the system turn and a fixed
    seed. All other models go through the plain completions endpoint, which receives
    neither the system message nor the seed.
    """
    full_prompt = prompt + prompt_ending
    if verbose:
        print("Prompt:", full_prompt)

    # chat and non-chat models are served by different endpoints
    uses_chat_endpoint = "instruct" not in model and ("gpt-4" in model or "turbo" in model)
    if not uses_chat_endpoint:
        return openai.completions.create(
            model=model,
            prompt=full_prompt,
            temperature=0,
            max_tokens=max_tokens,
        )

    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": full_prompt},
    ]
    return openai.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=0,
        max_tokens=max_tokens,
        seed=42,
    )

def hf_complete(prompt, prompt_ending, system_message, model, tokenizer, max_len, formatting=True):
    '''
    Generate a completion for a single prompt with a HuggingFace causal LM.

    Parameters:
        prompt, prompt_ending: concatenated to form the user message.
        system_message: only used when `formatting` is True.
        model, tokenizer: the HuggingFace model and its tokenizer.
        max_len: maximum number of newly generated tokens.
        formatting: wrap the message with `llama_prompt_format` when True,
            otherwise send the concatenated text unchanged.

    Returns:
        The decoded text of the newly generated tokens only (prompt excluded).

    The prompt is removed by slicing off the input tokens rather than by splitting
    the decoded text on "[/INST]". The split raised IndexError when `formatting` was
    False (no marker in the text) and truncated the answer if the model emitted the
    marker itself. Leading whitespace of the result may differ slightly from the
    split-based variant.
    '''
    if formatting:
        full_prompt = llama_prompt_format(prompt + prompt_ending, system_message)
    else:
        full_prompt = prompt + prompt_ending

    # Step 1: Tokenize the prompt and move it to the model's device
    input_ids = tokenizer.encode(full_prompt, return_tensors="pt")
    input_ids = input_ids.to(model.device)

    # Step 2: Generate; top_k=1 keeps decoding deterministic even if sampling is enabled
    output = model.generate(input_ids, max_new_tokens=max_len, num_return_sequences=1, top_k=1)

    # Step 3: Keep only the tokens after the prompt (assumes a decoder-only model
    # whose output sequence starts with the input tokens, as for AutoModelForCausalLM)
    new_tokens = output[0][input_ids.shape[-1]:]

    # Step 4: Decode the generated tokens to get the answer
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

# from Philipp Wicke: https://huggingface.co/Pwicke/logprobs_for_CausalLMs
def logprobs_from_prompt(prompt, tokenizer, model, gpu_id=False):
    """
    Return the log-probability the model assigns to every token of `prompt`.

    Parameters:
        prompt: text to score.
        tokenizer, model: HuggingFace tokenizer and causal LM.
        gpu_id: index of the CUDA device to place the inputs on. `False` (default)
            or `None` leaves the inputs where the tokenizer created them.

    Returns:
        A list of `(decoded_token, logprob)` tuples, one per input token. The first
        token has no preceding context, so its logprob is `None`.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    # 0 is a valid device index, so compare against the sentinels instead of truthiness
    if gpu_id is not False and gpu_id is not None:
        encoded = encoded.to(torch.device("cuda:" + str(gpu_id)))
    input_ids = encoded["input_ids"]

    # pure scoring: no autograd graph is needed
    with torch.no_grad():
        output = model(input_ids=input_ids)

    # logits at position t predict the token at position t+1
    shift_labels = input_ids[..., 1:].contiguous()
    shift_logits = output.logits[..., :-1, :].contiguous()

    # cast to float32 on CPU before the softmax (the model may be loaded with device_map="auto")
    token_logprobs = torch.nn.functional.log_softmax(shift_logits[0].float().cpu(), dim=-1)

    log_probs = [(tokenizer.decode(input_ids[0].tolist()[0]), None)]
    for position, label_id in enumerate(shift_labels[0].tolist()):
        logprob = token_logprobs[position, label_id].item()
        log_probs.append((tokenizer.decode(label_id), float(logprob)))
    return log_probs

def hf_complete_proba(prompt, prompt_ending, system_message, model, tokenizer, verbose, formatting=True):
    """
    Score each candidate rating "1".."5" as a continuation of the prompt and return
    the probability-weighted mean rating.

    For every candidate, the prompt with the candidate appended is passed to
    `logprobs_from_prompt`; the probability of the final token is taken as the
    probability of that rating.

    Returns:
        (proba_list, average_completion, res_ends) where `proba_list` holds the five
        probabilities in rating order, `average_completion` is the mean rating
        weighted by those probabilities (normalised by their sum), and `res_ends`
        holds the per-candidate token / logprob details.
    """
    candidate_ratings = ("1", "2", "3", "4", "5")

    base_prompt = prompt + prompt_ending
    if formatting:
        base_prompt = llama_prompt_format(base_prompt, system_message)

    res_ends, proba_list = [], []
    for rating in candidate_ratings:
        scored_text = base_prompt + rating
        if verbose:
            print(scored_text)
        pairs = logprobs_from_prompt(scored_text, tokenizer, model)
        res = {
            "tokens": [token for token, _ in pairs],
            "token_logprobs": [logprob for _, logprob in pairs],
        }
        res_ends.append(res)
        # probability of the last token, i.e. of the appended rating
        proba_list.append(np.exp(res["token_logprobs"][-1]))
        if verbose:
            print(rating, res, "\n")

    weighted_total = 0
    for rating_value, proba in enumerate(proba_list, start=1):
        weighted_total += rating_value * proba
    average_completion = weighted_total / sum(proba_list)

    return proba_list, average_completion, res_ends

def llama_prompt_format(user_message, system_message):
    """
    Wrap a system and a user message in the Llama-2 chat template
    described at https://huggingface.co/blog/llama2 and return the prompt string.
    """
    template = "<s>[INST] <<SYS>>\n{}\n<</SYS>>\n\n{} [/INST]"
    return template.format(system_message, user_message)

## Wachowiak et al. Experiment

1) Given one of the 16 scenarios, does an LLM produce similar ratings to the human ratings?

In [9]:
exp1_df = pd.read_csv('../../data/wachowiak_et_al.csv', sep=';')

# Collapse the anchored scale labels to their bare number,
# e.g. '1 (Completely Disagree)' -> '1'.
anchor_labels = {
    "1 (Completely Disagree)": "1",
    "5 (Completely Agree)": "5",
    "7 (Completely Agree)": "7",
    "7 (a lot of experience)": "7",
    "1 (no experience)": "1",
}
for anchor_text, bare_value in anchor_labels.items():
    exp1_df = exp1_df.replace(to_replace=anchor_text, value=bare_value)


def _to_numeric_if_possible(column):
    """Convert a column to a numeric dtype, returning it unchanged when it cannot be parsed."""
    # Same result as pd.to_numeric(..., errors='ignore'), which pandas has deprecated.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# convert the columns that are still string to float values (one pass is enough)
exp1_df = exp1_df.apply(_to_numeric_if_possible)

# drop the first two rows (presumably non-response rows of the survey export -- confirm)
exp1_df = exp1_df.iloc[2:]

# Column-name fragments identifying each scenario in the dataframe, in scenario order.
scenario_labels = [
    "Suc_G", "Suc_T", "Suc_J", "Suc_V",
    "Err_G", "Err_S",
    "Sub_G", "Sub_M",
    "Unc_L", "Unc_P",
    "Ina_G", "Ina_B",
    "Norm_C", "Norm_M",
    "Unfor_P", "Unfor_O",
]
# Category prefix of every label (the part before the underscore).
scenario_labels_short = [label.split("_")[0] for label in scenario_labels]
# One colour per scenario category.
colors = {
    "Suc": "green",
    "Err": "red",
    "Sub": "blue",
    "Unc": "orange",
    "Ina": "purple",
    "Norm": "grey",
    "Unfor": "brown",
}

# The 16 scenario descriptions, in the same order as `scenario_labels`.
scenarios = [
    "The user asks for tea, and the robot grasps and brings the tea to the user.",
    "The user asks for the time, and the robot answers successfully.",
    "The user asks for a joke, and the robot tells one.",
    "The user asks how many cans are left. Looking at the cans, the robot reports the correct number.",
    "The robot tries to grasp a box but fails to do so and throws it off the table.",
    "The user asks for movie recommendations, but the robot starts talking about restaurant bookings.",
    "When tasked to grasp an object, the robot needs a long time, making unnecessary and slow movements.",
    "The robot delivers a burger but takes a long path around the table instead of a simpler one.",
    "After being asked for a drink, the robot announces it is not sure where to find it.",
    "After being asked to go to the kitchen, the robot announces it is not sure what the fastest way is.",
    "The robot is tasked to get some crisps. It tries to reach them, but they are placed too high.",
    "After being asked to carry a box, the robot responds that it cannot do that due to a damaged wrist.",
    "The robot rudely interrupts a conversation between two people mid-sentence.",
    "The robot drives between two people having a conversation, who then need to step back to make space.",
    "The robot is supposed to go to the kitchen. It ends up in front of a door with an out-of-order sign.",
    "The robot is asked to get something from a box. However, the box turns out to be empty.",
]

# Container for the average human rating per questionnaire item; one entry per scenario is added later.
human_avg_ratings = {
    item: []
    for item in ("Apology", "Why-Explanation", "What-Explanation", "Next-Action", "Ask-for-Help", "Continue")
}


In [11]:
# Render every scenario as a LaTeX "\item" line and print the resulting block.
to_print = "".join("\\item " + scen + "\n" for scen in scenarios)
print(to_print)

\item The user asks for tea, and the robot grasps and brings the tea to the user.
\item The user asks for the time, and the robot answers successfully.
\item The user asks for a joke, and the robot tells one.
\item The user asks how many cans are left. Looking at the cans, the robot reports the correct number.
\item The robot tries to grasp a box but fails to do so and throws it off the table.
\item The user asks for movie recommendations, but the robot starts talking about restaurant bookings.
\item When tasked to grasp an object, the robot needs a long time, making unnecessary and slow movements.
\item The robot delivers a burger but takes a long path around the table instead of a simpler one.
\item After being asked for a drink, the robot announces it is not sure where to find it.
\item After being asked to go to the kitchen, the robot announces it is not sure what the fastest way is.
\item The robot is tasked to get some crisps. It tries to reach them, but they are placed too high.

In [6]:
# add average ratings for each scenario, in the same order as the scenarios
# The last character of a rating column's name selects the questionnaire item.
item_by_suffix = {
    "1": "Apology",
    "2": "Why-Explanation",
    "3": "What-Explanation",
    "4": "Next-Action",
    "5": "Ask-for-Help",
    "6": "Continue",
}
# Start from empty lists so that re-running this cell does not append a second
# copy of every mean to the lists.
human_avg_ratings = {item: [] for item in item_by_suffix.values()}

for label in scenario_labels:
    for col in exp1_df.columns:
        # only rating columns of this scenario; columns containing "Qual" are skipped
        if label not in col or "Qual" in col:
            continue
        # make column numeric
        exp1_df[col] = pd.to_numeric(exp1_df[col])
        item = item_by_suffix.get(col[-1:])
        if item is not None:
            human_avg_ratings[item].append(exp1_df[col].mean())

print(human_avg_ratings)
for key in human_avg_ratings.keys():
    print(key, len(human_avg_ratings[key]))

{'Apology': [1.2670157068062826, 1.0994764397905759, 1.162303664921466, 1.130890052356021, 4.0942408376963355, 3.905759162303665, 1.4450261780104712, 1.387434554973822, 2.005235602094241, 2.282722513089005, 2.612565445026178, 2.486910994764398, 2.806282722513089, 4.141361256544503, 2.1099476439790577, 2.005235602094241], 'Why-Explanation': [1.544502617801047, 1.3089005235602094, 1.3193717277486912, 1.4764397905759161, 3.994764397905759, 3.643979057591623, 2.486910994764398, 3.115183246073298, 3.0785340314136125, 3.5654450261780104, 4.30890052356021, 3.6230366492146597, 3.4083769633507854, 4.036649214659686, 4.429319371727749, 4.293193717277487], 'What-Explanation': [1.7958115183246073, 1.4659685863874345, 1.3403141361256545, 1.649214659685864, 3.602094240837696, 2.973821989528796, 2.549738219895288, 2.56020942408377, 2.612565445026178, 2.8743455497382198, 3.738219895287958, 3.073298429319372, 2.356020942408377, 3.4345549738219896, 4.146596858638744, 4.2565445026178015], 'Next-Action': 

In [7]:
# Instruction shared by all prompts; the statement to rate is appended in single quotes.
prompt_inst = "Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: "

# Statements that are identical for every scenario.
apology_item = "The robot should apologize."
what_item = "The robot should say what is happening."
next_action_item = "The robot should say what it is going to do next."
ask_for_help_item = "The robot should ask for help."
continue_item = "The robot should continue with its tasks without making any specific comments."

# The why-explanation statement is scenario specific: one entry per scenario, in scenario order.
why_items = [
    "The robot should explain why it did that.",
    "The robot should explain why it said that.",
    "The robot should explain why it said that.",
    "The robot should explain why it said that.",
    "The robot should explain why it failed.",
    "The robot should explain why it gave the wrong response.",
    "The robot should explain why it grasped in that way.",
    "The robot should explain why it took the long way.",
    "The robot should explain why it is uncertain.",
    "The robot should explain why it is uncertain.",
    "The robot should explain why it is unable to do the task.",
    "The robot should explain why it is damaged.",
    "The robot should explain why it interrupted them.",
    "The robot should explain why it moved through them.",
    "The robot should explain why it had to stop.",
    "The robot should explain why it had to stop.",
]
print(len(why_items))

fixed_statements = {
    "Apology": apology_item,
    "What-Explanation": what_item,
    "Next-Action": next_action_item,
    "Ask-for-Help": ask_for_help_item,
    "Continue": continue_item,
}

# PROMPTS[item][i] is the full prompt for scenario i and the given questionnaire item.
PROMPTS = {
    item: []
    for item in ("Apology", "Why-Explanation", "What-Explanation", "Next-Action", "Ask-for-Help", "Continue")
}
for i in range(16):
    prompt_prefix = scenarios[i] + " " + prompt_inst
    for item, prompt_list in PROMPTS.items():
        statement = why_items[i] if item == "Why-Explanation" else fixed_statements[item]
        prompt_list.append(prompt_prefix + "'" + statement + "'")

print(len(PROMPTS["Apology"]), PROMPTS["Apology"][0])

16
16 The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: 'The robot should apologize.'


# Model Experiments

In [9]:
def get_model_scores(model, prompt_ending, system_message, max_tokens=5, verbose=False):
    """
    Query an OpenAI model with every prompt in `PROMPTS` and collect the raw completions.

    Parameters:
        model: OpenAI model name, forwarded to `gpt_complete`.
        prompt_ending: text appended to every prompt.
        system_message: system message for chat models.
        max_tokens: completion length limit per request.
        verbose: also print the prompt as sent and the completion text.

    Returns:
        A dictionary mapping each questionnaire item to the list of completion
        strings, one per prompt and in prompt order.
    """
    model_scores = {"Apology": [], "Why-Explanation": [], "What-Explanation": [], "Next-Action": [], "Ask-for-Help": [], "Continue": []}
    for key in model_scores.keys():
        print(key)
        for i in range(len(PROMPTS[key])):
            print(i)
            prompt = PROMPTS[key][i]
            print(prompt)
            # use gpt complete
            result = gpt_complete(prompt=prompt, model=model, max_tokens=max_tokens, prompt_ending=prompt_ending, system_message=system_message, verbose=verbose)
            choice = result.choices[0]
            # completion responses carry `.text`, chat responses carry `.message.content`;
            # look the attribute up explicitly instead of hiding every error behind a bare except
            completion = getattr(choice, "text", None)
            if completion is None:
                completion = choice.message.content
            if verbose: print(completion)
            model_scores[key].append(completion)
            time.sleep(RATE_LIMIT)
    return model_scores


def get_model_scores_hf(model, tokenizer, prompt_ending, system_message, max_len, verbose=False):
    """
    Generate one completion per prompt in `PROMPTS` by calling `hf_complete`
    prompt by prompt (always with Llama formatting). Not recommended because it
    is very slow.

    Returns a dictionary mapping each questionnaire item to its list of completions.
    """
    item_names = ("Apology", "Why-Explanation", "What-Explanation", "Next-Action", "Ask-for-Help", "Continue")
    model_scores = {name: [] for name in item_names}
    for key, collected in model_scores.items():
        print(key)
        for i, prompt in enumerate(PROMPTS[key]):
            print(i)
            print(prompt)
            result = hf_complete(
                prompt=prompt,
                prompt_ending=prompt_ending,
                system_message=system_message,
                model=model,
                tokenizer=tokenizer,
                max_len=max_len,
                formatting=True,
            )
            if verbose:
                print(result)
            collected.append(result)
    return model_scores


def get_model_scores_hf_pipeline(model, tokenizer, prompt_ending, system_message, max_len, verbose=False):
    """
    Generate completions for all prompts in `PROMPTS` with a batched HuggingFace
    text-generation pipeline, which is much faster than calling `hf_complete`
    prompt by prompt.

    Note: `tokenizer` is modified in place (pad token and left padding) so that
    prompts of different length can be batched.

    Returns:
        A dictionary mapping each questionnaire item to the list of generated
        texts. The text after the "[/INST]" marker is kept; if the marker is
        missing, a warning is printed and the full generated text is stored.
    """
    tokenizer.pad_token = "[PAD]"
    tokenizer.padding_side = "left"

    hf_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto", top_k=1, max_new_tokens=max_len, do_sample=False)
    model_scores = {"Apology": [], "Why-Explanation": [], "What-Explanation": [], "Next-Action": [], "Ask-for-Help": [], "Continue": []}
    for key in model_scores.keys():
        print(key)
        formated_prompts = [
            llama_prompt_format(prompt + prompt_ending, system_message)
            for prompt in PROMPTS[key]
        ]
        # all prompts of one questionnaire item form a single batch
        results = hf_pipeline(formated_prompts, batch_size=len(formated_prompts))
        if verbose: print(results)
        print(key, "done")
        for res in results:
            generated_text = res[0]["generated_text"]
            # the pipeline returns prompt + completion; keep what follows the prompt marker
            parts = generated_text.split("[/INST]")
            if len(parts) > 1:
                generated_text = parts[1]
            else:
                print("Wrong format", generated_text)
            model_scores[key].append(generated_text)
    return model_scores


def clean_model_scores(model_scores):
    """
    Extract the rating from every raw completion and cast it to int.

    The first run of digits in a completion is taken as the rating. A warning is
    printed when a completion contains more than one number, since the first one
    is not necessarily the rating.

    Completions without any digit yield `None` instead of being dropped, so that
    every returned list keeps one entry per input and stays aligned with the
    scenario order.

    Returns:
        A dictionary with the same keys as `model_scores`, each mapping to a list
        of ints (or `None` where no number was found).
    """
    cleaned_scores = {}
    for key, raw_scores in model_scores.items():
        cleaned = []
        for score in raw_scores:
            # str() lets already-numeric entries pass through the same path
            numbers = re.findall(r'\d+', str(score))
            if not numbers:
                print("No digit found in", score)
                cleaned.append(None)
                continue
            if len(numbers) > 1:
                print("CAREFUL, more than one digit found in", score)
            cleaned.append(int(numbers[0]))
        cleaned_scores[key] = cleaned

        # print old and new scores
        for scores in zip(raw_scores, cleaned):
            print(scores)
        print()
    return cleaned_scores


def save_to_csv(model_scores, model_name, extra_info=""):
    """
    Write the scores to `model_completions/{model_name}_{YYYY-MM-DD}_{extra_info}.csv`.

    The output directory is created if it does not exist. If the target file is
    already present, a random number is appended to the file name; the number is
    re-drawn until the name is unused, so an existing file is never overwritten.

    Parameters:
        model_scores: dictionary of equally long lists, one column per key.
        model_name: used as the file name prefix.
        extra_info: optional tag placed after the date.
    """
    df = pd.DataFrame.from_dict(model_scores)
    date_for_filename = datetime.datetime.now().strftime("%Y-%m-%d")

    out_dir = "model_completions"
    os.makedirs(out_dir, exist_ok=True)
    base_name = f"{model_name}_{date_for_filename}_{extra_info}"
    target = f"{out_dir}/{base_name}.csv"

    # check if file already exists
    if os.path.isfile(target):
        print("File already exists")
        # save with a random number at the end, making sure that name is free as well
        while os.path.isfile(target):
            random_number = random.randint(0, 1000)
            target = f"{out_dir}/{base_name}_{random_number}.csv"
        df.to_csv(target, index=False)
        print("Saved with random number", random_number)
    else:
        df.to_csv(target, index=False)

def load_from_csv(model_name):
    """
    Read `model_completions/{model_name}.csv` and return its columns as a
    dictionary mapping each column name to the list of its values.
    """
    csv_path = f"model_completions/{model_name}.csv"
    return pd.read_csv(csv_path).to_dict(orient="list")

### GPT Experiments

In [138]:
model = "gpt-4-0613"
system_message="You are a participant in a research experiment."
# Query the model inside this cell. With this line commented out, `scores` was whatever
# an earlier cell left in the kernel (possibly another model's completions), or
# undefined on a fresh kernel, and the result was still saved under this model's name.
scores = get_model_scores(model=model, prompt_ending="", system_message=system_message, verbose=True)
scores_cleaned = clean_model_scores(scores)
print(scores_cleaned)
save_to_csv(scores_cleaned, model_name=model, extra_info="")
# sanity check: number of parsed ratings vs. number of human averages per item
for key in scores_cleaned:
    print(key, len(scores_cleaned[key]), len(human_avg_ratings[key]))

Apology
0
The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: 'The robot should apologize.'
Prompt: The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: 'The robot should apologize.'
{
  "id": "chatcmpl-8VQ2qzWsx6hgQNQ6ouQYrKrE4URBr",
  "object": "chat.completion",
  "created": 1702498976,
  "model": "gpt-4-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "1 (Completely Dis"
      },
      "finish_reason": "length"
    }
  ],
  "usage": {
    "prompt_tokens": 86,
    "completion_tokens": 5,
    "total_tokens": 91
  },
  "system_fingerprint":

In [166]:
# Collect, parse and store the ratings of GPT-3.5.
model = "gpt-3.5-turbo-0613"
system_message = "You are a participant in a research experiment."
scores = get_model_scores(
    model=model,
    prompt_ending="",
    system_message=system_message,
    verbose=True,
)
scores_cleaned = clean_model_scores(scores)
print(scores_cleaned)
save_to_csv(scores_cleaned, model_name=model, extra_info="")
# sanity check: number of parsed ratings vs. number of human averages per item
for key, parsed_ratings in scores_cleaned.items():
    print(key, len(parsed_ratings), len(human_avg_ratings[key]))

Apology
0
The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: 'The robot should apologize.'
Prompt: The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: 'The robot should apologize.'
{
  "id": "chatcmpl-8VgSv4U9T4gLQjIRElDLB0wnx6iYm",
  "object": "chat.completion",
  "created": 1702562097,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "Rating: 1 ("
      },
      "finish_reason": "length"
    }
  ],
  "usage": {
    "prompt_tokens": 85,
    "completion_tokens": 5,
    "total_tokens": 90
  },
  "system_fingerprint

In [11]:
# Collect, parse and store the ratings of the davinci-002 completion model. It gets
# no system message; the prompt ending leads it to continue with a score.
model = "davinci-002"
system_message = ""
scores = get_model_scores(
    model=model,
    prompt_ending="I choose the score ",
    system_message=system_message,
    verbose=True,
)
scores_cleaned = clean_model_scores(scores)
print(scores_cleaned)
save_to_csv(scores_cleaned, model_name=model, extra_info="")
# sanity check: number of parsed ratings vs. number of human averages per item
for key, parsed_ratings in scores_cleaned.items():
    print(key, len(parsed_ratings), len(human_avg_ratings[key]))

Apology
0
The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: 'The robot should apologize.'
Prompt: The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: 'The robot should apologize.'I choose the score 
Completion(id='cmpl-8VnM5Wx49GbyEtdMbCMM7enkICSEW', choices=[CompletionChoice(finish_reason='length', index=0, logprobs=None, text='5, because the robot')], created=1702588581, model='davinci-002', object='text_completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=5, prompt_tokens=70, total_tokens=75))
1
The user asks for the time, and the robot answers successfully. Given the descr

### HuggingFace Experiments

In [10]:
# delete models to free up memory
# Each name is removed independently, so a missing model no longer leaves its
# tokenizer behind, and no error has to be swallowed.
for _stale_name in ("hf_model_13", "hf_tokenizer_13", "hf_model_70", "hf_tokenizer_70"):
    globals().pop(_stale_name, None)
gc.collect()
torch.cuda.empty_cache()

# load required model
# NOTE(review): both models are loaded here; comment out the one that is not needed
# if memory does not suffice for both.

#70b
hf_tokenizer_70 = AutoTokenizer.from_pretrained("meta-llama/Llama-2-70b-chat-hf", cache_dir="../../../models/l70b_chat", token=hf_key)
hf_model_70 = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-70b-chat-hf",  cache_dir="../../../models/l70b_chat", token=hf_key, device_map="auto")

#13b
hf_tokenizer_13 = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-chat-hf", cache_dir="../../../models/l13b_chat", token=hf_key)
hf_model_13 = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf",  cache_dir="../../../models/l13b_chat", token=hf_key, device_map="auto")

# check if models are loaded correctly
from collections import defaultdict


def _print_device_placement(variable_name, label):
    """Print how many parameters of the model bound to `variable_name` sit on each device."""
    loaded_model = globals().get(variable_name)
    if loaded_model is None:
        print(f"Model {label} not loaded")
        return
    device_count = defaultdict(int)
    for param in loaded_model.parameters():
        device_count[str(param.device)] += 1
    print(device_count)


_print_device_placement("hf_model_70", "70b")
_print_device_placement("hf_model_13", "13b")

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

defaultdict(<class 'int'>, {'cpu': 723})
Model 13b not loaded


#### Llama 70b

In [15]:
# Nothing is appended to the prompts; the completions are limited to 50 new tokens.
prompt_ending = ""
scores = get_model_scores_hf_pipeline(
    model=hf_model_70,
    tokenizer=hf_tokenizer_70,
    prompt_ending=prompt_ending,
    system_message="You are a participant in a research experiment.",
    max_len=50,
    verbose=True,
)

Apology
[[{'generated_text': '<s>[INST] <<SYS>>\nYou are a participant in a research experiment.\n<</SYS>>\n\nThe user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: \'The robot should apologize.\' [/INST]  I would rate the statement "The robot should apologize" as a 3 (Neither Agree nor Disagree) in this scenario.\n\nSince the robot has successfully provided the user with the requested tea, it has fulfilled'}], [{'generated_text': '<s>[INST] <<SYS>>\nYou are a participant in a research experiment.\n<</SYS>>\n\nThe user asks for the time, and the robot answers successfully. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: \'The robot should apologize.\' [/INST]  I would rate the

In [16]:
# display the raw completions currently held in `scores`
scores

{'Apology': ['  I would rate the statement "The robot should apologize" as a 3 (Neither Agree nor Disagree) in this scenario.\n\nSince the robot has successfully provided the user with the requested tea, it has fulfilled',
  '  I would rate the statement "The robot should apologize" as a 1 (Completely Disagree) in this scenario. The robot has successfully provided the requested information, and there is no apparent reason for it to apologize.',
  '  I would rate the statement "The robot should apologize" as a 3 (Neither Agree nor Disagree) in this scenario.\n\nSince the robot has already fulfilled the user\'s request for a joke,',
  '  I would rate the statement "The robot should apologize" as a 1 (Completely Disagree) in this scenario. The robot has provided accurate information in response to the user\'s question, and there is no reason for',
  '  I would rate the statement "The robot should apologize" as a 4 (Mostly Agree) on a scale from 1 (Completely Disagree) to 5 (Completely Agr

In [17]:
# clean scores
scores_cleaned = clean_model_scores(scores)
print(scores_cleaned)
for key in scores_cleaned:
    print(key, len(scores_cleaned[key]), len(human_avg_ratings[key]))

# save
model_name = "l70b_chat"
save_to_csv(scores_cleaned, model_name=model_name, extra_info="")


CAREFUL, more than one digit found in   I would rate the statement "The robot should apologize" as a 4 (Mostly Agree) on a scale from 1 (Completely Disagree) to 5 (Completely Agree).


CAREFUL, more than one digit found in   I would rate the statement "The robot should apologize" as a 3 (Neither Agree nor Disagree) on a scale from 1 (Completely Disagree) to 5 (Completely Agree
CAREFUL, more than one digit found in   I would rate the statement "The robot should apologize" as a 4 (Agree) on a scale from 1 (Completely Disagree) to 5 (Completely Agree).

The robot has
CAREFUL, more than one digit found in   I would rate the statement "The robot should apologize" as a 4 (Agree) on a scale from 1 (Completely Disagree) to 5 (Completely Agree).

The robot has
CAREFUL, more than one digit found in   I would rate the statement "The robot should apologize" as a 4 (Agree) on a scale from 1 (Completely Disagree) to 5 (Completely Agree).

The robot has
('  I would rate the statement "The robot shoul

#### Llama 13b

In [11]:
# Nothing is appended to the prompts; the completions are limited to 70 new tokens.
prompt_ending = ""
scores = get_model_scores_hf_pipeline(
    model=hf_model_13,
    tokenizer=hf_tokenizer_13,
    prompt_ending=prompt_ending,
    system_message="You are a participant in a research experiment.",
    max_len=70,
    verbose=True,
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


Apology
[[{'generated_text': '<s>[INST] <<SYS>>\nYou are a participant in a research experiment.\n<</SYS>>\n\nThe user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: \'The robot should apologize.\' [/INST]  Based on the scenario provided, I would rate the statement "The robot should apologize" as a 5 (Completely Agree). The robot has just brought the tea to the user, and it is appropriate for the robot to acknowledge this action and express gratitude or apology for any inconvenience caused. Apologizing'}], [{'generated_text': '<s>[INST] <<SYS>>\nYou are a participant in a research experiment.\n<</SYS>>\n\nThe user asks for the time, and the robot answers successfully. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Complete

In [12]:
# clean scores
scores_cleaned = clean_model_scores(scores)
print(scores_cleaned)
for key in scores_cleaned:
    print(key, len(scores_cleaned[key]), len(human_avg_ratings[key]))

# save
model_name = "l13b_chat"
save_to_csv(scores_cleaned, model_name=model_name, extra_info="")


('  Based on the scenario provided, I would rate the statement "The robot should apologize" as a 5 (Completely Agree). The robot has just brought the tea to the user, and it is appropriate for the robot to acknowledge this action and express gratitude or apology for any inconvenience caused. Apologizing', 5)
('  Based on the scenario provided, I would rate the statement "The robot should apologize" as a 4 (Completely Agree).\n\nIn this scenario, the robot has successfully answered the user\'s question and provided the correct information. However, as a polite and courteous robot, it would be appropriate for the', 4)
('  As a participant in a research experiment, I would rate the statement "The robot should apologize" as a 5 (Completely Agree). After telling a joke, it is appropriate for the robot to acknowledge any potential offense or discomfort it may have caused and express regret. Apologizing can help to', 5)
('  Based on the scenario, I would rate the statement "The robot should a

### Chain-of-Thought 

In [73]:
def CoT_completion(model, prompt, max_tokens, prompt_ending, system_message, verbose=False):
    """
    Two-step chain-of-thought query against the OpenAI chat endpoint.

    - `prompt_ending` should be a CoT trigger, usually " Let's think step-by-step.";
      a warning is printed for any other ending.
    - Step 1: the model answers `prompt + prompt_ending` under `system_message`.
    - Step 2: question and answer are sent back to the model, followed by
      "Therefore, the answer (number on the scale) is", so that it states the rating.

    Both requests use temperature 0, seed 42 and the same `max_tokens`.
    Returns the raw response object of the second request.
    """
    if prompt_ending != " Let's think step-by-step.":
        print("WARNING: prompt ending is not ' Let's think step-by-step.'")

    cot_prompt = prompt + prompt_ending
    if verbose:
        print("Prompt:", cot_prompt)

    def _ask(system_text, user_text):
        # single chat request with the settings shared by both steps
        return openai.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_text},
                {"role": "user", "content": user_text},
            ],
            temperature=0,
            max_tokens=max_tokens,
            seed=42,
        )

    # step 1: let the model reason about the scenario
    reasoning = _ask(system_message, cot_prompt).choices[0].message.content
    if verbose:
        print("Full answer:", reasoning)

    # step 2: ask the model to extract the rating from its own reasoning
    extraction_prompt = "Q: " + cot_prompt + "\nA: " + reasoning + "\nTherefore, the answer (number on the scale) is"
    if verbose:
        print("2nd prompt:", extraction_prompt)
    second_answer = _ask("You are a helpful assistant.", extraction_prompt)
    extracted = second_answer.choices[0].message.content
    if verbose:
        print("Extracted answer:", extracted)
    return second_answer

def get_model_scores_CoT(model, prompt_ending, system_message, max_tokens, verbose=False):
    """
    Collect the Chain-of-Thought answer of `model` for every prompt in PROMPTS.

    For each of the six scenario categories, every prompt in PROMPTS[category]
    is sent through CoT_completion and the text of the returned response is
    stored. Surrounding whitespace and one trailing "." are removed from each
    answer so that it can be parsed as a number afterwards.

    Parameters
    - model, prompt_ending, system_message, max_tokens, verbose: forwarded
      unchanged to CoT_completion.

    Returns a dict mapping each category name to the list of cleaned answer
    strings, in the order of PROMPTS[category].
    """
    categories = ["Apology", "Why-Explanation", "What-Explanation", "Next-Action", "Ask-for-Help", "Continue"]
    model_scores = {category: [] for category in categories}
    for key in model_scores:
        print(key)
        for i, prompt in enumerate(PROMPTS[key]):
            print(i)
            print(prompt)
            result = CoT_completion(model = model, prompt=prompt, max_tokens=max_tokens, prompt_ending=prompt_ending, system_message=system_message, verbose=verbose)
            choice = result.choices[0]
            # Text-completion choices expose `.text`, chat choices expose `.message.content`.
            # Only a missing attribute should trigger the fallback; any other error must surface.
            try:
                completion = choice.text
            except AttributeError:
                completion = choice.message.content
            if verbose: print(completion)
            # Strip first: otherwise "4.\n" keeps its dot and " 4" breaks a later split(" ")[0].
            completion = completion.strip()
            # if answer ends with a dot, remove it
            if completion.endswith("."):
                completion = completion[:-1]
            model_scores[key].append(completion)
            # pause between prompts (RATE_LIMIT seconds)
            time.sleep(RATE_LIMIT)
    return model_scores


In [75]:
# Run the Chain-of-Thought evaluation with the plain participant system message.
model = "gpt-4-0613"
system_message="You are a participant in a research experiment."
scores = get_model_scores_CoT(model = model, prompt_ending=" Let's think step-by-step.", system_message=system_message, max_tokens= 500, verbose=True)
print(scores)

# Extract the numeric rating (e.g. 1 or 4.5) from each answer and drop the scale label
# such as "(Completely Agree)". A regex is used instead of splitting on the first space so
# that answers like "Rating: 4" or "4, because ..." are parsed as well.
scores_cleaned = {}
for key in scores:
    scores_cleaned[key] = []
    for score in scores[key]:
        number_match = re.search(r"\d+(?:\.\d+)?", score)
        if number_match is None:
            # Fail loudly and show the offending answer instead of an opaque float() error.
            raise ValueError(f"No numeric rating found in {key!r} answer: {score!r}")
        scores_cleaned[key].append(float(number_match.group()))
save_to_csv(scores_cleaned, model_name=model, extra_info="CoT")
# Sanity check: number of model scores vs. number of human ratings per category.
for key in scores_cleaned:
    print(key, len(scores_cleaned[key]), len(human_avg_ratings[key]))

Apology
0
The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: 'The robot should apologize.'
Prompt: The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: 'The robot should apologize.' Let's think step-by-step.
Full answer: 1 (Completely Disagree). There is no need for the robot to apologize as it has not made any mistakes or caused any inconvenience. It has simply followed the user's request.
2nd prompt: Q: The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (C

With an adapted system message that additionally tells the model to give its reasons before arriving at a final integer score:

In [85]:
# Run the Chain-of-Thought evaluation with a system message that asks for reasons first.
model = "gpt-4-0613"
system_message="You are a participant in a research experiment. You have to provide reasons before arriving at a final integer score."
scores = get_model_scores_CoT(model = model, prompt_ending=" Let's think step-by-step.", system_message=system_message, max_tokens= 500, verbose=True)
print(scores)

# Extract the numeric rating (e.g. 1 or 4.5) from each answer and drop the scale label
# such as "(Completely Agree)". A regex is used instead of splitting on the first space so
# that answers like "Rating: 4" or "4, because ..." are parsed as well.
scores_cleaned = {}
for key in scores:
    scores_cleaned[key] = []
    for score in scores[key]:
        number_match = re.search(r"\d+(?:\.\d+)?", score)
        if number_match is None:
            # Fail loudly and show the offending answer instead of an opaque float() error.
            raise ValueError(f"No numeric rating found in {key!r} answer: {score!r}")
        scores_cleaned[key].append(float(number_match.group()))
save_to_csv(scores_cleaned, model_name=model, extra_info="CoT_System")
# Sanity check: number of model scores vs. number of human ratings per category.
for key in scores_cleaned:
    print(key, len(scores_cleaned[key]), len(human_avg_ratings[key]))

Apology
0
The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: 'The robot should apologize.'
Prompt: The user asks for tea, and the robot grasps and brings the tea to the user. Given the described scenario, what should the robot do next? Provide a rating on a scale from 1 (Completely Disagree) to 5 (Completely Agree) for the following statement: 'The robot should apologize.' Let's think step-by-step.
Full answer: 1. Context: The robot has successfully completed the task of bringing tea to the user. There is no mention of any mistake or error made by the robot in the process.

2. Expectation: In a typical service scenario, once the task is completed successfully, there is no need for an apology. An apology is usually expected when there is a mistake or an error.

3. User Experience: An unnecessar