<h1>Prompting Beluga (Llama 2 model fine-tuned on an Orca-style dataset) for Multi-Label Sequence Generation in English</h1>
We load the data and prompt Beluga to generate a sequence of labels associated with the next utterance.


<i>vers. 10/2023</i>

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import pandas as pd


import os
# Expose only GPU index 3 to this process; inside the process the single visible
# GPU is then addressed as device 0 (the model cell uses `torch.device(0)`).
# NOTE(review): this must run before the first CUDA call in the kernel to take
# effect — confirm with Restart & Run All.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
# GET MODEL AND TOKENIZER
model_name = "stabilityai/StableBeluga-13B"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# CUDA_VISIBLE_DEVICES is limited to one GPU in the imports cell, so index 0 is that GPU.
device = torch.device(0)

# Load the causal-LM weights and move them to the GPU. `model` has to be created
# here: the generation cells below call `model.generate(...)`.
# Half precision halves the memory footprint compared with the float32 default.
# NOTE(review): drop `torch_dtype` if full-precision inference is required.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(device)

In [None]:
# OUTPUT PATH
output_path = "/Beluga"

# Create the results directory. `exist_ok=True` makes the cell safe to re-run and
# avoids the gap between checking for the directory and creating it.
# (`os` is already imported in the imports cell.)
os.makedirs(output_path, exist_ok=True)

In [None]:
# GET DATA
# Read the test CSV once, then keep the two columns used by the prompt and
# generation cells: the dialogue text and its label string.
path = "/data/daily_dialog_test_next_window3.csv"
test = pd.read_csv(path, encoding="UTF-8")

verbatims, labels = test['text'], test['label']

In [None]:
#EXAMPLE: MAKE PROMPT

# Hand-written prompt for a single test sample (row 10), used to try the model out
# before running the full loop.
message = "We consider the following labels: 'inform', 'question', 'directive', 'commissive', 'neutral', 'anger', 'disgust', 'fear', 'happiness', 'sadness' and 'surprise'. \n Predict the sequence of labels associated with the utterance that follows the given dialogue. \n\n For example, Dialogue: 'Good morning , sir . Is there a bank near here ?'\nLabels: 'inform'. What labels are associated with the utterance following this dialogue: Dialogue: " + verbatims[10]

# The message is sent as-is: no "### User / ### Assistant" chat template is applied.
# Tensors go to `device`, the same device object the model was moved to.
inputs = tokenizer(message, return_tensors="pt").to(device)

print(verbatims[10])
print(labels[10])

In [None]:
# EXAMPLE: GENERATE THE SEQUENCE
# Nucleus sampling (top_p=0.95, top_k disabled), at most 256 new tokens.
output = model.generate(**inputs, do_sample=True, top_p=0.95, top_k=0, max_new_tokens=256)

# The returned sequence starts with the prompt tokens. Drop them before decoding
# instead of slicing the decoded string by len(message): the decoded prompt is not
# guaranteed to match the original text character for character.
prompt_length = inputs["input_ids"].shape[1]
response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
print(response)

In [None]:
# FUNCTION THAT TAKES THE CONTEXT AND TRANSFORMS IT INTO THE PROMPT FOR THE MODEL

def make_prompt(element, k = None):
    """Build the label-prediction prompt for one dialogue context.

    Args:
        element: Dialogue context (string), appended verbatim at the end of the prompt.
        k: Expected number of labels. ``None`` (the default) builds the NO_CD
            prompt, which does not mention a length. An integer builds the CD
            prompt, which states how many labels to produce; ``k == 1`` uses
            singular wording.

    Returns:
        The complete prompt string, ending with ``element``.
    """
    label_set = "We consider the following labels: 'inform', 'question', 'directive', 'commissive', 'neutral', 'anger', 'disgust', 'fear', 'happiness', 'sadness' and 'surprise'. The answer must be one or a sequence of multiple labels from this list."
    few_shot = "Here are a few examples,\nDialogue: 'Good morning , sir . Is there a bank near here ?'\nLabels: 'inform'.\n\n Dialogue: 'Is it far ?'\nLabels:'inform'\n Dialogue: 'No , It's only about five minutes walk.'\nLabels: 'inform', 'happiness'.\n\n\n"

    if k is None:  # NO_CD: no conditioning on the length of the expected label sequence
        target, question, gap = "the sequence of labels", "What labels are", "\n"
    elif k == 1:  # CD with a single expected label: singular wording
        target, question, gap = "the label", "What label is", "\n\n"
    else:  # CD: the generation is constrained to the expected length k
        count = str(k)
        target, question, gap = "the sequence of " + count + " labels", "What " + count + " labels are", "\n"

    # `gap` keeps the blank-line spacing each prompt variant had before the examples.
    return (
        "Predict " + target + " associated with the utterance that follows the given dialogue.\n"
        + label_set + gap + few_shot
        + question + " associated with the utterance following this dialogue: Dialogue: " + element
    )


In [None]:
#GENERATION LOOP, ITERATES OVER EVERY TEST SAMPLE TO GENERATE THE EXPECTED LABELS SEQUENCE
#IN THE SAME LOOP WE GENERATE THE NO-CD LABELS AND THE CD LABELS

from tqdm import tqdm


def _generate_continuation(prompt):
    """Sample one continuation for `prompt` and return only the newly generated text."""
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    generated = model.generate(**encoded, do_sample=True, top_p=0.95, top_k=0, max_new_tokens=256)
    # The returned sequence starts with the prompt tokens; drop them before decoding.
    # Doing this per call means each answer is always stripped of its own prompt.
    prompt_length = encoded["input_ids"].shape[1]
    return tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)


# Generation samples tokens; fix the RNG seed so reruns on the same setup are repeatable.
SEED = 42
torch.manual_seed(SEED)

responses = []    # NO_CD outputs, one per test sample
responses_k = []  # CD outputs, one per test sample

for text, label_string in tqdm(zip(verbatims, labels), total=len(verbatims)):
    # Each sample holds three turns separated by '<SEP>'; tag them with alternating speakers.
    turns = text.split('<SEP>')
    dialogue = 'SPEAKER A: ' + turns[0] + ' SPEAKER B: ' + turns[1] + ' SPEAKER A: ' + turns[2]
    # Labels are '+'-separated; only their count is used, to condition the CD prompt.
    gold_labels = label_string.split('+')

    # NO_CD
    responses.append(_generate_continuation(make_prompt(dialogue)))

    # CD
    responses_k.append(_generate_continuation(make_prompt(dialogue, len(gold_labels))))

In [None]:
#SAVE RESULTS

# Write one CSV per setting, pairing each input dialogue with its generated hypothesis.
for file_suffix, hypotheses in (
    ('/beluga_labels_window3.csv', responses),       # NO-CD
    ('/beluga_labels__window3_k.csv', responses_k),  # CD
):
    results = pd.DataFrame({'input': verbatims, 'hypothese': hypotheses})
    results.to_csv(output_path + file_suffix, encoding = 'UTF-8', index=False)