<h1>Prompting Beluga (a Llama 2 model fine-tuned on the Orca dataset) for Response Generation in English</h1>

<i>vers. 10/2023</i>

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import os

In [None]:

# Restrict CUDA to the GPU with index 4; it must be set before CUDA is first
# initialised in this process to take effect.
os.environ.update({"CUDA_VISIBLE_DEVICES": "4"})

# Number of consecutive utterances that make up one dialogue context.
window_size = 3

In [None]:
# LOAD MODEL
# NOTE(review): the notebook title and the output file names refer to "Beluga",
# but the checkpoint loaded here is Mistral-7B-Instruct -- confirm which model
# is intended.

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# The slow (pure-Python) tokenizer implementation is requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Half-precision weights, reduced CPU memory use while loading, automatic
# device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)

# Device index 0; tokenized prompts are moved to this device before generation.
# NOTE(review): the model was already placed by device_map="auto"; the explicit
# .to(device) below looks redundant -- confirm it is still wanted.
device = torch.device(0)
model.to(device)

In [None]:
#GET DATA

import pandas as pd
from datasets import load_dataset
import ast

# DailyDialog corpus; only the dialogues of the test split are used.
data = load_dataset('daily_dialog')
verbatims = data['test']['dialog']

# NOTE(review): output_path has no trailing separator, so plain string
# concatenation with a file name does not place the file inside this directory.
output_path = '/Response Generation/Beluga'
labels_path = '/Filter Rerank/en/results_filter_reference_labels_filter_rerank_window3.csv'

# The 'ground_truth' and 'predicted' columns are stored as text; each cell is
# parsed back into a Python object with ast.literal_eval.
labels_ref = pd.read_csv(labels_path, encoding='UTF-8')
labels_expected = list(map(ast.literal_eval, labels_ref['ground_truth']))
labels_predicted = list(map(ast.literal_eval, labels_ref['predicted']))

<h2>Beluga F&R</h2>

In [None]:
# MAKE PROMPT
# NOTE: a second `make_prompt` with a different signature is defined later in
# this notebook and replaces this one once its cell has been executed.

def make_prompt(element, k = None):
    """Build the prompt text for one dialogue context.

    Args:
        element: the dialogue context as a single string; it is appended to
            the instruction text.
        k: expected number of candidate responses to generate (CD1 / CD2).
            When None, a single response is requested (NO_CD) using a prompt
            that contains two worked examples.

    Returns:
        The prompt string.
    """
    if k is not None:
        count = str(k)
        return (
            "Generate " + count + " responses following this dialogue: " + element
            + '\nNumber the generated sequences from 1 to ' + count
            + " Generated sequences: 1: "
        )

    # NO_CD: instruction, two example exchanges, then the dialogue to continue.
    instruction = "Generate the response following the given context. For example:\n"
    example_soup = "A: Do you like some soup ? B: Yes , but I don't know what soup you have A: We have beef soup and tomato soup  Response: Good . I prefer beef soup .\n"
    example_order = "A: Can I take your order now , Madam ? B: Yes , what would you recommend ? A: I'm happy to recommend the fish , It testes delicious , and it is today's special . Our chef is from the coast , and love seafood . Today special is actually his favorite dish . so I'm sure it is a Response: It does sound wonderful , maybe I'll try it .\n\n"
    request = " Generate the response following the following dialogue: "
    return instruction + example_soup + example_order + request + element

In [None]:
from tqdm import tqdm

contexts = []      # one entry per window: the list of utterances in that window
responses = []     # NO-CD: one sampled response per context
responses_k = []   # CD1 / CD2: raw text of the numbered 10-candidate generation

# NOTE: this cell needs the `make_prompt(element, k=None)` version; a later cell
# in this notebook redefines `make_prompt` with a different signature.
for dialog in tqdm(verbatims):
    # Windows start at every second utterance. Stopping before
    # len(dialog) - window_size guarantees at least one utterance exists after
    # each window.
    for i in range(0, len(dialog) - window_size, 2):
        window = dialog[i:i+window_size]
        contexts.append(window)

        # Speakers alternate starting with A:
        # "SPEAKER A: ... SPEAKER B: ... SPEAKER A: ..."
        # (named context_text so the builtin `input` is not shadowed).
        context_text = ' '.join(
            'SPEAKER ' + 'AB'[turn % 2] + ': ' + utterance
            for turn, utterance in enumerate(window)
        )

        # NO-CD (single response) first, then the 10-candidate prompt for CD1 / CD2.
        for prompt, sink in (
            (make_prompt(context_text), responses),
            (make_prompt(context_text, k=10), responses_k),
        ):
            encoded = tokenizer(prompt, return_tensors="pt").to(device)
            output = model.generate(**encoded, do_sample=True, top_p=0.95, top_k=0, max_new_tokens=256)

            # The output sequence starts with the prompt tokens. Drop them at
            # token level: slicing the decoded string by len(prompt) is only
            # correct if decoding reproduces the prompt character for
            # character, which tokenizer normalisation / clean-up does not
            # guarantee.
            prompt_len = encoded["input_ids"].shape[1]
            sink.append(tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True))

# Show a short preview rather than dumping every generation into the cell output.
print(len(responses_k), "candidate generations; first 3:")
print(responses_k[:3])

In [None]:
#SAVE RESULTS

# Build the file paths with os.path.join: output_path has no trailing
# separator, so concatenating with `+` fused the directory name and the file
# name ('.../Belugabeluga_labels_fr.csv').
os.makedirs(output_path, exist_ok=True)

# NO-CD: one response per context.
pd.DataFrame({'input': contexts, 'hypothese': responses}).to_csv(
    os.path.join(output_path, 'beluga_labels_fr.csv'), encoding='UTF-8', index=False
)

# CD1 / CD2: raw 10-candidate generation per context.
pd.DataFrame({'input': contexts, 'hypothese': responses_k}).to_csv(
    os.path.join(output_path, 'beluga_labels_fr_k_10.csv'), encoding='UTF-8', index=False
)


<h2>Beluga PB</h2> 

In [None]:
def make_prompt(element, labels):
    """Build a prompt asking for a response with a given tone.

    This definition replaces the earlier `make_prompt(element, k=None)` from
    this notebook once the cell is executed.

    Args:
        element: the dialogue context as a single string.
        labels: string describing the tone(s) the response must have.

    Returns:
        The prompt string, ending with the "Response: " cue.
    """
    pieces = [
        "Generate the response following the given context :",
        element,
        "\n The tone of the response must be ",
        labels,
        "\nResponse: ",
    ]
    return "".join(pieces)

In [None]:
from tqdm import tqdm

contexts = []        # one entry per window: the list of utterances in that window
responses_exp = []   # CD1: responses conditioned on the dataset's expected labels
responses_pred = []  # CD2: responses conditioned on the predicted labels

# NOTE: this cell needs the `make_prompt(element, labels)` version defined in
# the preceding cell of this section.
for dialog in tqdm(verbatims):
    # Windows start at every second utterance. Stopping before
    # len(dialog) - window_size guarantees at least one utterance exists after
    # each window.
    for i in range(0, len(dialog) - window_size, 2):
        window = dialog[i:i+window_size]

        # Running index of this window over the whole test set. The labels were
        # previously looked up with `i`, the offset inside the current
        # dialogue, which restarts at 0 for every dialogue and so reused the
        # first few label rows for all dialogues.
        # Assumes the labels CSV holds one row per window, in the same order as
        # the windows are produced here -- TODO confirm against the notebook
        # that wrote the CSV.
        label_idx = len(contexts)
        contexts.append(window)

        # Speakers alternate starting with A:
        # "SPEAKER A: ... SPEAKER B: ... SPEAKER A: ..."
        # (named context_text so the builtin `input` is not shadowed).
        context_text = ' '.join(
            'SPEAKER ' + 'AB'[turn % 2] + ': ' + utterance
            for turn, utterance in enumerate(window)
        )

        # CD1: dataset expected labels, then CD2: predicted labels
        # (BART-generated according to the original comment).
        for labels, sink in (
            (labels_expected[label_idx], responses_exp),
            (labels_predicted[label_idx], responses_pred),
        ):
            prompt = make_prompt(context_text, ', '.join(labels))
            encoded = tokenizer(prompt, return_tensors="pt").to(device)
            output = model.generate(**encoded, do_sample=True, top_p=0.95, top_k=0, max_new_tokens=256)

            # The output sequence starts with the prompt tokens. Drop them at
            # token level: slicing the decoded string by len(prompt) is only
            # correct if decoding reproduces the prompt character for
            # character, which tokenizer normalisation / clean-up does not
            # guarantee.
            prompt_len = encoded["input_ids"].shape[1]
            sink.append(tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True))

In [None]:
#SAVE RESULTS

# Build the file paths with os.path.join: output_path has no trailing
# separator, so concatenating with `+` fused the directory name and the file
# name ('.../Belugabeluga_pb_expected_window3.csv').
os.makedirs(output_path, exist_ok=True)

# CD1: responses generated with the dataset's expected labels.
pd.DataFrame({'input': contexts, 'hypothese': responses_exp}).to_csv(
    os.path.join(output_path, 'beluga_pb_expected_window3.csv'), encoding='UTF-8', index=False
)

# CD2: responses generated with the predicted labels.
pd.DataFrame({'input': contexts, 'hypothese': responses_pred}).to_csv(
    os.path.join(output_path, 'beluga_pb_pred_window3.csv'), encoding='UTF-8', index=False
)