<H1>Parse Beluga Results</h1>
Extract the actual responses from the raw Beluga outputs.


<i>vers. 10/2023</i>

In [2]:
import pandas as pd
from tqdm import tqdm
import re
import os 
import torch

# Restrict the CUDA devices visible to this process to the one with id "4";
# `torch.device(0)` then refers to index 0 among the visible devices.
# NOTE(review): `device` is not used anywhere else in the visible notebook — confirm it is needed.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
device = torch.device(0)

# "solo"  = BELUGA NO-CD: a single response is generated per input.
# "multi" = BELUGA CD1/2: N=10 candidate responses are generated per input.

beluga_solo = 'Response Generation/beluga/beluga_labels.csv'
beluga_multi = 'Response Generation/beluga/beluga_labels_k_10.csv'

# Raw model outputs; the cells below read the 'input' and 'hypothese' columns.
df_solo = pd.read_csv(beluga_solo, encoding = 'UTF-8')
df_multi = pd.read_csv(beluga_multi, encoding = 'UTF-8')

# References: only the 'actual_responses' column of the BART results file is kept.
references= 'Response Generation/bart/bart_epochs_10_generated_responses_daily_dialog_window3_new.csv'
refs = pd.read_csv(references, encoding = 'UTF-8')['actual_responses']

<h2>Solo Results</h2>

In [2]:
# Preview the first rows of the single-response (solo) outputs.
df_solo.head()

Unnamed: 0,input,hypothese
0,"['Hey man , you wanna buy some weed ? ', ' Som...","SPEAKER B: No thanks, man , I don't do that ..."
1,"[' Weed ! You know ? Pot , Ganja , Mary Jane s...","\n\n Response: I appreciate your offer, but I ..."
2,[' I also have blow if you prefer to do a few ...,"SPEAKER B: I am good thank you , I don't do ..."
3,[' Come on man ! I even got dope and acid ! Tr...,"\nResponse: I'm sorry , but I don't do drugs.\..."
4,[' I got my connections ! Just tell me what yo...,\n\nResponse: Just a minute. While we're waiti...


In [None]:
# Parse the solo outputs: keep only the text that follows the LAST marker
# (e.g. "Response: ", "SPEAKER B: ") found in each raw generation.
empty = 0  # number of rows for which no usable response could be extracted
indicators = ['Response: ', 'response: ', 'SPEAKER A: ', 'SPEAKER B: ', 'A: ', 'B: ']
responses = []  # one entry per row of df_solo, 'None' (string) when empty

for _, row in tqdm(df_solo.iterrows(), total=len(df_solo)):
    hypothese = row['hypothese']

    # Missing generations are read back as NaN (a float), not as a string.
    if not isinstance(hypothese, str):
        empty += 1
        responses.append('None')
        continue

    hypothese = hypothese.strip()

    # (offset, marker) of the last occurrence of every marker that is present.
    # Plain substring search is used so markers are never interpreted as regexes.
    hits = [(hypothese.rfind(marker), marker) for marker in indicators if marker in hypothese]

    if hits:
        # The marker starting furthest to the right delimits the final answer.
        start, marker = max(hits, key=lambda hit: hit[0])
        responses.append(hypothese[start + len(marker):].strip())

    elif len(hypothese) > 3:
        # No marker at all: the whole generation is taken as the response.
        responses.append(hypothese)

    else:
        # Too short to be a meaningful response.
        responses.append('None')
        empty += 1


print('EMPTY PREDS : ', empty, ' OUT OF ', len(df_solo))

In [None]:
# Show the first ten parsed solo responses.
responses[:10]

In [5]:
# Destination (without extension) for the parsed solo responses and their metrics.
output_path = 'Response Generation/beluga/'

file_generated = output_path + "beluga_generated_responses_daily_dialog_window_3_N1"

# Pair every input with its parsed response.
# The CSV export below is currently disabled (commented out).
df = pd.DataFrame({'input': df_solo['input'], 'hypothese': responses})
#df.to_csv(file_generated+'.csv', index=False, encoding='UTF-8')

In [6]:
# Names used by the metric cells: predictions and references.
generated_responses = responses
actual_responses = refs

# Predictions and references must have the same number of entries.
assert(len(generated_responses)==len(actual_responses))

In [None]:
#IMPORT METRICS
# Load the four metrics through the `evaluate` package; the resulting objects
# are used via `.compute(predictions=..., references=...)` in the metric cells.

import evaluate
sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
chrf = evaluate.load("chrf")

In [None]:
# PEEK AT THE RESULTS
# Refs must be in a list of list of str (one list of references per prediction).
# The wrapping is rebuilt from `refs` rather than from `actual_responses` itself,
# so re-running this cell does not nest the references one level deeper each time.
actual_responses = [[res] for res in refs]

print(df_solo['input'][:5])
print(generated_responses[:5])
print(actual_responses[:5])


In [None]:
#COMPUTE METRICS
# Every metric is scored on the same predictions/references pair.
metric_inputs = dict(predictions=generated_responses, references=actual_responses)

bleu_score = sacrebleu.compute(**metric_inputs)

rouge_score = rouge.compute(**metric_inputs)

bert_score = bertscore.compute(lang='en', **metric_inputs)

# The BERTScore result is indexed by 'precision' / 'recall' / 'f1'; each entry
# is a sequence of scores that is reduced here to its arithmetic mean.
precision, recall, f1 = (bert_score[key] for key in ('precision', 'recall', 'f1'))
avg_precision_bert, avg_recall_bert, avg_f1_bert = (
    sum(scores) / len(scores) for scores in (precision, recall, f1)
)

chrf_score = chrf.compute(**metric_inputs)

In [None]:
#WRITE METRICS FOR SOLO
import sys

# (label, value) pairs in the order they are reported.
metric_report = [
    ('Bleu score: \n', bleu_score),  # Range from 0 to 100
    ('Rouge score: \n', rouge_score),
    ('Bert score: \n', bert_score),
    ('Avg precision Bert score: ', avg_precision_bert),
    ('Avg recall Bert score: ', avg_recall_bert),
    ('Avg f1 Bert score: ', avg_f1_bert),
    ('chrf score: \n', chrf_score),
]

# Previously the file was opened and closed but every print went to stdout,
# leaving an empty .txt file. Each line is now echoed in the notebook AND
# written to the file; the context manager closes it even if a print fails.
with open(file_generated+".txt", "w", encoding="UTF-8") as fout:
    for label, value in metric_report:
        print(label, value)
        print(label, value, file=fout)

<h2>MULTI Results</h2>

In [33]:
# Preview the first rows of the multi-candidate (N=10) outputs.
df_multi.head()

Unnamed: 0,input,hypothese
0,"['Hey man , you wanna buy some weed ? ', ' Som...","Hey man, you wanna buy some weed? 2: No, I'm ..."
1,"[' Weed ! You know ? Pot , Ganja , Mary Jane s...","Weed ! You know ? Pot , Ganja , Mary Jane som..."
2,[' I also have blow if you prefer to do a few ...,2: 3: 4: 5: 6: 7: 8: 9: 10:
3,[' Come on man ! I even got dope and acid ! Tr...,1. SPEAKER A: Come on man! I even got dope and...
4,[' I got my connections ! Just tell me what yo...,I want some juicy gossip! 2: I want the late...


In [None]:
def find_sentences(hypothese, indexes, indicators, n_candidates=10):
    """Split one multi-candidate generation into exactly ``n_candidates`` responses.

    Parameters
    ----------
    hypothese : str
        Raw model output containing numbered candidates, e.g. ``"1: foo 2: bar"``.
    indexes : list of int
        ``indexes[k]`` is the character offset in ``hypothese`` of the marker
        for candidate number ``k + 1``, or ``-1`` when no marker was found.
    indicators : list
        Parallel to ``indexes``: the marker text that was found (e.g. ``"2:"``),
        or ``-1`` when no marker was found.
    n_candidates : int, optional
        Length of the returned list (default 10).

    Returns
    -------
    list of str
        Extracted candidates in numbering order, right-padded with the string
        ``'None'`` up to ``n_candidates`` entries.

    Raises
    ------
    AssertionError
        If more than ``n_candidates`` fragments were extracted.

    Notes
    -----
    Fragments of two characters or fewer (after stripping) are discarded.
    """
    clean_responses = []

    def _keep(fragment):
        # Store a fragment only if it is long enough to be a real response.
        fragment = fragment.strip()
        if len(fragment) > 2:
            clean_responses.append(fragment)

    # Slots (in numbering order) whose marker was actually located.
    found = [k for k, idx in enumerate(indexes) if idx != -1]

    # Marker #1 missing but #2 present: the text before #2 is the unnumbered
    # first candidate.
    if len(indexes) > 1 and indexes[0] == -1 and indexes[1] != -1:
        _keep(hypothese[:indexes[1]])

    # Text between each found marker and the next found marker.
    for cur, nxt in zip(found, found[1:]):
        _keep(hypothese[indexes[cur] + len(indicators[cur]): indexes[nxt]])

    # Text after the LAST found marker. Previously this was only kept when the
    # final slot (#10) was found, so shorter outputs lost their last candidate.
    if found:
        last = found[-1]
        _keep(hypothese[indexes[last] + len(indicators[last]):])

    # Pad so every row yields the same number of columns downstream.
    clean_responses += ['None'] * (n_candidates - len(clean_responses))

    assert len(clean_responses) == n_candidates
    return clean_responses




# Parse the multi-candidate outputs: locate the numbered markers "1".."10"
# in each generation and split the text into ten candidate responses.
empty = 0  # rows for which no candidate at all could be extracted
# Separators that may follow a candidate number, tried in this priority order
# (so "1:" is preferred over "1)", "1." and "1 ").
indicators = [':', ')', '.', ' ']


responses = []  # one list of ten strings per row of df_multi

for i, row in tqdm(df_multi.iterrows(), total=len(df_multi)):
    hypothese = row['hypothese']

    if not isinstance(hypothese, str):
        # Missing generation: still emit a placeholder row so that `responses`
        # stays aligned with df_multi (it was silently skipped before).
        empty += 1
        responses.append(['None']*10)
        continue

    hypothese = hypothese.strip()
    idxs = []   # offset of the marker of candidate x, or -1
    indic = []  # marker text of candidate x (e.g. "3:"), or -1
    for x in range(1, 11):
        # First separator (in priority order) for which "<x><sep>" occurs.
        marker = next(
            (str(x) + sep for sep in indicators if str(x) + sep in hypothese),
            None,
        )
        if marker is None:
            idxs.append(-1)
            indic.append(-1)
        else:
            idxs.append(hypothese.index(marker))
            indic.append(marker)

    if idxs == [-1]*10:
        responses.append(['None']*10)
        empty += 1

    else:
        response = find_sentences(hypothese, idxs, indic)
        responses.append(response)

# Report against the frame that was actually parsed.
print('EMPTY PREDS : ', empty, ' OUT OF ', len(df_multi))

In [None]:
from statistics import mean

# Average number of entries per row, 'None' placeholders included.
lens = [len(x) for x in responses]
mean(lens)

In [None]:
# For every row keep only the candidates that are not the 'None' placeholder,
# then report how many real candidates a row has on average.
not_null = [[r for r in res if r != 'None'] for res in responses]

lens_not_null = list(map(len, not_null))
mean(lens_not_null)

In [None]:
import pandas as pd

# Destination (without extension) of the table of N candidate responses per input.
file_generated = 'Response Generation/responses/beluga_generated_multiple_responses_daily_dialog_window3_N10'
print(file_generated)
N=10

# NOTE(review): `actual_responses` is whatever an earlier cell left behind; if the
# solo cells ran in order it holds one-element lists ([[ref], ...]), so this
# column would contain lists rather than plain strings — confirm this is intended.
df = pd.DataFrame({'inputs': df_multi['input'],'actual responses':actual_responses})

# One column per candidate rank: generated_responses_0 ... generated_responses_{N-1}.
# Requires `responses` to have one N-long list per row of df_multi.
for res in range(N):
    df['generated_responses_'+ str(res)] = [x[res] for x in responses]

df.head()

In [66]:
# Persist the multi-candidate table as CSV (no index column).
df.to_csv(file_generated +'.csv', index = False, encoding = 'UTF-8')

<h2>Prompt Based Approach: Expected vs Predicted Labels</h2>

In [14]:
import pandas as pd

# Outputs of the prompt-based approach, one file per label source
# (expected labels vs. predicted labels).
beluga_expected = 'Response Generation/beluga/beluga_labels_expected.csv'
beluga_predicted = 'Response Generation/beluga/beluga_labels_pred.csv'

# Same reference file as in the first cell; reloaded here.
references= 'Response Generation/bart/bart_epochs_10_generated_responses_daily_dialog_window3_new.csv'

df_expected = pd.read_csv(beluga_expected, encoding = 'UTF-8')
df_predicted = pd.read_csv(beluga_predicted, encoding = 'UTF-8')
refs = pd.read_csv(references, encoding = 'UTF-8')['actual_responses']

In [None]:
# Collect the prompt-based responses as they are (no marker stripping):
# anything that is missing or at most two characters long becomes 'None'.
empty = 0  # rows without a usable response
responses = []  # one entry per row of the selected frame

mode = 'predicted' #'expected' #

# Any value other than 'predicted' selects the expected-label outputs.
if mode == 'predicted':
    df = df_predicted

else:
    df = df_expected


for i, row in tqdm(df.iterrows(), total=len(df)):
    hypothese = row['hypothese']
    if isinstance(hypothese, str):
        hypothese = hypothese.strip()

    if isinstance(hypothese, str) and len(hypothese) > 2:
        responses.append(hypothese)

    else:
        # Missing (non-string) or too-short generation: log the row and
        # keep a placeholder so `responses` stays aligned with `df`.
        print(i, hypothese)
        empty += 1
        responses.append('None')


# Report against the frame that was actually parsed (not df_solo).
print('EMPTY PREDS : ', empty, ' OUT OF ', len(df))

In [35]:
# Names used by the metric cells: predictions and references.
generated_responses = responses
actual_responses = refs

# Same sanity check as in the solo section: fail early, with a clear message,
# if predictions and references do not line up.
assert len(generated_responses) == len(actual_responses), \
    'number of generated responses differs from number of references'

In [17]:
# Destination (without extension); the suffix is the `mode` chosen above.
file_generated = "Filter Rerank/en/results_filter/beluga_final_responses_"+mode

# NOTE(review): the inputs are taken from df_solo, not from the frame selected
# by `mode`; this assumes both files list the same inputs in the same order — confirm.
df = pd.DataFrame({'input': df_solo['input'], 'hypothese': responses})

df.head(n=10)

Unnamed: 0,input,hypothese
0,"['Hey man , you wanna buy some weed ? ', ' Som...",What are you talking about? I don't understand...
1,"[' Weed ! You know ? Pot , Ganja , Mary Jane s...","Uh, no, I'm good on that front. Thanks though."
2,[' I also have blow if you prefer to do a few ...,"No thanks, I am good with what I have."
3,[' Come on man ! I even got dope and acid ! Tr...,Can you give me some information about where y...
4,[' I got my connections ! Just tell me what yo...,"Alright, what are you after? Let me know what ..."
5,"['The taxi drivers are on strike again . ', ' ...",What is the reason behind the taxi drivers' st...
6,"[""We've managed to reduce our energy consumpti...",How have you invested in a heat recovery syste...
7,"["" Mainly because we've invested in a heat rec...",Investing in a heat recovery system has allowe...
8,"['Believe it or not , tea is the most popular ...",Do you know which type of tea is most popular ...
9,[' Right . And China is the homeland of tea . ...,"I have actually never tasted any Chinese tea, ..."


In [18]:
# Persist the final prompt-based responses as CSV (no index column).
df.to_csv(file_generated+'.csv', index=False, encoding='UTF-8')

In [19]:
# Score the prompt-based responses with the same four metrics as the solo run.
scoring_kwargs = {'predictions': generated_responses, 'references': actual_responses}

bleu_score = sacrebleu.compute(**scoring_kwargs)

rouge_score = rouge.compute(**scoring_kwargs)

bert_score = bertscore.compute(**scoring_kwargs, lang='en')
precision = bert_score['precision']
recall = bert_score['recall']
f1 = bert_score['f1']

# Arithmetic mean of each BERTScore component over all scored pairs.
avg_precision_bert, avg_recall_bert, avg_f1_bert = [
    sum(component) / len(component) for component in (precision, recall, f1)
]

chrf_score = chrf.compute(**scoring_kwargs)

In [20]:
import sys

# (label, value) pairs in the order they are reported.
metric_report = [
    ('Bleu score: \n', bleu_score),  # Range from 0 to 100
    ('Rouge score: \n', rouge_score),
    ('Bert score: \n', bert_score),
    ('Avg precision Bert score: ', avg_precision_bert),
    ('Avg recall Bert score: ', avg_recall_bert),
    ('Avg f1 Bert score: ', avg_f1_bert),
    ('chrf score: \n', chrf_score),
]

# Previously the file was opened and closed but every print went to stdout,
# leaving an empty .txt file. Each line is now echoed in the notebook AND
# written to the file; the context manager closes it even if a print fails.
with open(file_generated+".txt", "w", encoding="UTF-8") as fout:
    for label, value in metric_report:
        print(label, value)
        print(label, value, file=fout)