<H1>Filter & Rerank: Ranking & Selecting Candidate Responses using Multi-Label Sequences</h1>
We start from the candidate responses generated in the previous step. For each test sample and each of its candidates, we use BERT Current to extract the sequence of labels expressed in the candidate response. We then compare this sequence to the "expected sequence of labels": the one observed in the dataset for CD1, or the one generated by BART NO-CD for CD2. The candidate whose label sequence is most similar to the expected one is selected as the final response.


<i>vers. 10/2023</i>

<h2> Imports</h2>

In [18]:
import pandas as pd
from tqdm import tqdm
import torch
import os
import ast

In [21]:
# IMPORT METRICS
import evaluate, sys
from statistics import mean
from sklearn.metrics.pairwise import cosine_similarity

# Text-generation metrics, loaded once by name through `evaluate.load` and
# reused for every model / mode configuration in the evaluation loop.
sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
chrf = evaluate.load("chrf")

#COSINE SIMILARITY
# Sentence-embedding model whose `encode` output is compared with
# `util.pytorch_cos_sim` between selected and actual responses.
#!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, util
cosine_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [19]:
#CUDA
# Expose only physical GPU "4" to this process.
# NOTE(review): this presumably has to run before CUDA is first initialised
# to take effect -- confirm the cell execution order.

os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [22]:
#MODEL

#BERT CURRENT
# Checkpoint of the multi-label classifier used to label each candidate response.
model_name_or_path = "/path/to/BERT/Current" #TODO
model_name = "bert-base"

# Checkpoint of the seq2seq model that generates the expected label sequence (CD2).
prediction_model_path = 'path/to/BART/NO-CD' #TODO
prediction_model_name = "facebook/bart-base"

# Output directory. The value has no trailing path separator, so file names
# should be attached with a path-joining function rather than with `+`.
output_path = '/Filter Rerank//results_filter_window3'

#RESPONSES CANDIDATES
# One CSV of candidate responses per generator.
references_path_bart = "Response Generation/responses/bart_generated_multiple_responses_daily_dialog_window3_N10.csv"
references_path_bart_large = "Response Generation/responses/bart_LARGE_generated_multiple_responses_daily_dialog_window3_N10.csv"

references_path_dialoGPT = "Response Generation/responses/dialogpt_generated_multiple_responses_daily_dialog_window3_N10.csv"
references_path_dialoGPT_large = "Response Generation/responses/dialogpt_LARGE_generated_multiple_responses_daily_dialog_window3_N10.csv"


# NOTE(review): the base file is prefixed "gpt_" while the large one is
# prefixed "gpt2_" -- confirm both names match the files on disk.
references_path_gpt2 = "Response Generation/responses/gpt_generated_multiple_responses_daily_dialog_window3_N10.csv"
references_path_gpt2_large = "Response Generation/responses/gpt2_LARGE_generated_multiple_responses_daily_dialog_window3_N10.csv"

references_path_beluga = "Response Generation/responses/beluga_generated_multiple_responses_daily_dialog_window3_N10.csv"

In [23]:
# CONFIGURATIONS TO LOOP OVER: ONE CANDIDATE FILE PER GENERATOR, AND THE TWO
# SOURCES OF EXPECTED LABELS (CD1 = DATASET LABELS, CD2 = BART NO-CD LABELS)

models_list = {
    'BART': references_path_bart,
    'BART_Large': references_path_bart_large,
    'dialoGPT': references_path_dialoGPT,
    'dialoGPT_Large': references_path_dialoGPT_large,
    'GPT2': references_path_gpt2,
    'GPT2_Large': references_path_gpt2_large,
    'Beluga': references_path_beluga,
}
modes = ['CD1', 'CD2']

In [None]:
# Preview the current candidates DataFrame.
# NOTE(review): no earlier cell assigns `df`; it is only bound inside the final
# evaluation loop, so this cell works only after that loop has run at least once.
df.head()

In [None]:
#GET LABELS FROM DATASET FOR CD1 

# `labels` first holds the whole CSV as a DataFrame, then is rebound to the
# plain list of its 'label' column; `text` is the matching 'text' column.
labels = pd.read_csv('Labels Prediction/data/daily_dialog_test_next_window3.csv', encoding = 'UTF-8')
text = labels['text'].tolist()
labels = labels['label'].tolist()
# Number of rows read.
print(len(text))

<h2> LOAD MODELS & GET REFERENCES FOR LABELS SEQUENCES</h2>

In [None]:
#LOAD THE MODEL BERT CURRENT
# NOTE(review): the two remarks below describe multi-label settings
# (`problem_type`, BCEWithLogitsLoss). Nothing in this cell sets them, so they
# presumably describe how the checkpoint was fine-tuned -- confirm.
#Setting `problem_type` to be "multi_label_classification" makes sure we use the appropriate loss function, BCEWithLogitsLoss
#The output layer has `len(labels)` output neurons, and we set the id2label and label2id mappings.

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
import numpy as np

# Label names stored in the checkpoint config, ordered by integer id, kept as
# a NumPy array so they can be indexed with an array of predicted ids.
config = AutoConfig.from_pretrained(model_name_or_path)
id2label = [config.id2label[key] for key in sorted(config.id2label.keys(), key=lambda t: int(t))]
id2label = np.asarray(id2label)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
# Device index 0; the second model loaded in this notebook is moved to the same `device`.
device = torch.device(0)
model = model.to(device)
# Switch to evaluation mode; the return value is bound to a throwaway name,
# presumably to stop the notebook from echoing the model representation.
t = model.eval()

In [None]:
#LOAD THE MODEL: BART NO-CD
# Seq2seq model that generates the expected label sequence for a dialogue context.

from transformers import AutoConfig, AutoModelForSeq2SeqLM
import numpy as np

# Tokenisation length limit applied to the generator's input.
encoder_max_length = 256  
# NOTE(review): `decoder_max_length` is defined here but not passed to
# `generate` anywhere in the visible code -- confirm whether it should be.
decoder_max_length = 64

prediction_config = AutoConfig.from_pretrained(prediction_model_path)
# `AutoTokenizer` comes from the import in the BERT Current loading cell.
prediction_tokenizer = AutoTokenizer.from_pretrained(prediction_model_path, use_fast=True)
prediction_model = AutoModelForSeq2SeqLM.from_pretrained(prediction_model_path)
# Same `device` object as the classifier.
prediction_model = prediction_model.to(device)
# Evaluation mode; return value bound to a throwaway name.
prediction_t = prediction_model.eval()

In [None]:
#FUNCTIONS TO PREDICT THE LABELS FOR CD2

import re

def find_true_pred(pred, labels):
    """Map each raw predicted label to the first known label it matches.

    A raw prediction matches a label when either string contains the other
    (equality is covered by containment). Matching is done against `labels`
    in order, so the first matching label wins.

    Args:
        pred: iterable of raw label strings produced by the generator.
        labels: known label names, in priority order.

    Returns:
        The list of matched labels, in the order of `pred`. Predictions that
        match nothing are skipped and reported by printing 'NO'.
    """
    match = []
    for raw in pred:
        found = None
        # A blank string is a substring of every label and would otherwise be
        # silently mapped to the first label, so it never counts as a match.
        if raw.strip():
            found = next(
                (label for label in labels if raw in label or label in raw),
                None,
            )

        if found is None:
            # Report every unmatched prediction, not only the last one.
            print('NO')
        else:
            match.append(found)

    return match


def get_labels(preds, labels):
    """Turn generated label strings into lists of known labels.

    Each generated string is split on '+' when it contains one, otherwise on
    ', ', and every piece is resolved with `find_true_pred`. The number of
    strings for which nothing could be resolved is printed.

    Args:
        preds: iterable of generated label strings.
        labels: known label names passed through to `find_true_pred`.

    Returns:
        One list of matched labels per generated string, in input order.
    """
    pieces_per_pred = []
    for raw in preds:
        separator = '+' if '+' in raw else ', '
        pieces_per_pred.append(raw.split(separator))

    clean_preds = [find_true_pred(pieces, labels) for pieces in pieces_per_pred]
    empty_count = sum(1 for matched in clean_preds if len(matched) < 1)

    print("NUMBR OF EMPTY PREDS : ", empty_count)

    return clean_preds


def generate_summary_k(test_samples):
    """Generate one label-sequence string per input text with BART NO-CD.

    Each sample is tokenised on its own (padded/truncated to
    `encoder_max_length`), passed to `prediction_model.generate`, decoded
    without special tokens, and any '+' separator is rewritten as ', '.

    Args:
        test_samples: sequence of input strings.

    Returns:
        List of decoded strings, one per sample, in input order.
    """
    # Use the generator's own device: the tensors are consumed by
    # `prediction_model`, not by the label classifier.
    target_device = prediction_model.device
    generated_output_str = []

    for sample in tqdm(test_samples):
        inputs = prediction_tokenizer(
            sample,
            padding="max_length",
            truncation=True,
            max_length=encoder_max_length,
            return_tensors="pt",
        )
        input_ids = inputs.input_ids.to(target_device)
        attention_mask = inputs.attention_mask.to(target_device)
        outputs = prediction_model.generate(input_ids, attention_mask=attention_mask)
        decoded = prediction_tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # A single sample was generated, so only the first decoded string exists.
        generated_output_str.append(decoded[0].replace('+', ', '))

    return generated_output_str

In [None]:
# GET OR MAKE THE REFERENCE LABELS FILE

# Set to False when the labels expected for CD2 have already been generated by
# BART NO-CD and saved at `labels_path`.
make_references = True

# Join the directory and file name explicitly: `output_path` has no trailing
# separator, so plain `+` would glue the file name onto the directory name.
# NOTE(review): a labels file written by an earlier run under the glued name
# must be moved to this location before setting `make_references = False`.
labels_path = os.path.join(output_path, 'reference_labels_filter_rerank_window3.csv')


# IF MAKE REFERENCES, BART NO-CD WILL GENERATE THE SEQUENCES OF LABELS FOR ALL THE TEST SAMPLES
if make_references:
    acts = ['inform', 'question', 'directive', 'commissive']
    emos = ['neutral', 'anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']
    labels_list = acts + emos

    # Dataset labels (ground truth) and the candidate file whose rows define
    # the test samples and their order.
    labels = pd.read_csv('Labels Prediction/data/daily_dialog_test.csv', encoding = 'UTF-8')
    ref = pd.read_csv(references_path_dialoGPT, encoding = 'UTF-8')

    text = labels['text'].tolist()
    labels = labels['label'].tolist()

    # Text -> label lookup. `setdefault` keeps the first occurrence of a
    # duplicated text, matching a first-match linear scan; non-string cells
    # (e.g. missing values) are left out so they can never match.
    label_by_text = {}
    for utterance, label in zip(text, labels):
        if isinstance(utterance, str):
            label_by_text.setdefault(utterance, label)

    # Expected labels for CD1: the dataset label of each actual response,
    # wrapped in a one-element list.
    df_labels_expected = []
    for response in tqdm(ref['actual responses'], total=len(ref)):
        if response in label_by_text:
            df_labels_expected.append([label_by_text[response]])
        else:
            # No dataset row has this text; the length check below will then fail.
            print('Interesting...')

    # Expected labels for CD2: generated from the last utterance of each
    # context. The contexts are read from `ref`, the frame loaded above, so
    # this cell does not depend on a `df` left over from another cell.
    contexts = [ast.literal_eval(c)[-1].strip() for c in ref['inputs']]

    df_labels_predicted = generate_summary_k(contexts)
    df_labels_predicted = get_labels(df_labels_predicted, labels_list)

    #SAVE LABELS
    # Only save when both label lists cover every test sample.
    if len(df_labels_expected) == len(ref) and len(df_labels_predicted) == len(ref):
        print('Saving Labels...')
        df_labels = pd.DataFrame({'inputs':ref['actual responses'], 'ground_truth': df_labels_expected, 'predicted': df_labels_predicted})
        df_labels.to_csv(labels_path, encoding='UTF-8', index=False)

    else:
        print('Error')

In [None]:
# OPEN THE LABELS FILE 

# Reads the CSV at `labels_path`; the 'ground_truth' and 'predicted' columns
# are parsed back into Python lists later with `ast.literal_eval`.
labels_ref = pd.read_csv(labels_path, encoding='UTF-8')
labels_ref.head()

<h2> FILTER & RERANK LOOP</h2>

In [27]:
# PREDICT LABELS OF CURRENT CANDIDATE USING BERT CURRENT

def predict_sentence(text, k=None, verbose=False):
    """Score every label for one text with the BERT Current classifier.

    The text is tokenised (with truncation), run through `model` without
    gradient tracking, and the logits are passed through a sigmoid. Labels
    and scores are returned sorted by descending score.

    Args:
        text: the sentence to classify.
        k: when not None, keep only the `k` best labels.
        verbose: when True, print the text and each (label, score) pair.

    Returns:
        A `(pred_labels, pred_scores)` pair of arrays in matching order.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    encoded = encoded.to(device)

    with torch.inference_mode():
        probabilities = model(**encoded)[0].sigmoid()

    probabilities = probabilities.detach().cpu().numpy()

    # Descending order: sort ascending along the last axis, then reverse it.
    ranked_ids = np.argsort(probabilities)[:, ::-1]
    ranked_scores = np.sort(probabilities)[:, ::-1]

    # Single input text, so only the first row is of interest.
    pred_labels = id2label[ranked_ids[0]]
    pred_scores = ranked_scores[0]

    if k is not None:
        pred_labels = pred_labels[:k]
        pred_scores = pred_scores[:k]

    if verbose:
        print(f'"{text}"')
        for score, label in zip(pred_scores, pred_labels):
            print(f"{label:30} : {score}")
        print()

    return pred_labels, pred_scores

In [28]:
#DEFINE NORMALISED LEVENSHTEIN AS THE SIMILARITY METRIC

from textdistance import levenshtein

def similarity(reference, results):
    """Return the normalised Levenshtein similarity of two label sequences.

    Args:
        reference: the expected sequence of labels.
        results: the sequence of labels found in a candidate.

    Returns:
        The value of `levenshtein.normalized_similarity(results, reference)`.
    """
    score = levenshtein.normalized_similarity(results, reference)
    return score


In [29]:
#FUNCTIONS


#GET first N candidates
def filter(candidates, k=10):
    """Return the first `k` candidates (all of them when there are fewer).

    Note: this name shadows the built-in `filter` within the notebook.
    """
    return candidates[:k]


#Select only the predictions that are above the fixed threshold
def select_pred(predictions, scores, threshold):
    """Keep the predictions whose score is at least `threshold`.

    Args:
        predictions: sequence of predicted labels.
        scores: sequence of scores, aligned with `predictions` by position.
        threshold: minimum score (inclusive) for a prediction to be kept.

    Returns:
        List of the kept predictions, in their original order.
    """
    return [
        prediction
        for position, prediction in enumerate(predictions)
        if scores[position] >= threshold
    ]


#Get similarity scores between the labels of the candidates and the labels expected
def get_scores(candidates, reference, threshold=0.7, fallback_threshold=0.5):
    """Score each candidate by how close its labels are to the expected ones.

    For every candidate, the classifier's labels scoring at least `threshold`
    are kept; when none qualifies, `fallback_threshold` is applied instead.
    The kept labels are then compared to `reference` with `similarity`.

    Args:
        candidates: iterable of candidate response strings.
        reference: the expected sequence of labels.
        threshold: minimum classifier score for a label to be kept.
        fallback_threshold: looser minimum used only when `threshold`
            keeps no label at all.

    Returns:
        List of similarity scores, one per candidate, in input order.
    """
    similarity_scores = []
    for candidate in candidates:
        pred_labels, pred_scores = predict_sentence(candidate)

        selected = select_pred(pred_labels, pred_scores, threshold)
        if len(selected) < 1:
            # Retry on the full set of predictions: filtering the already
            # empty selection could never yield any label.
            selected = select_pred(pred_labels, pred_scores, fallback_threshold)

        similarity_scores.append(similarity(reference, selected))

    return similarity_scores


#Rank the list of candidates according to their similarity scores
def rerank(candidates, scores):
    """Order candidates by descending similarity score.

    The ordering is computed on positions, so each candidate stays paired
    with its own score, duplicated scores do not duplicate a candidate, and
    the input lists are left untouched. Candidates with equal scores keep
    their original relative order.

    Args:
        candidates: sequence of candidate responses.
        scores: sequence of scores, aligned with `candidates` by position.

    Returns:
        A `(ranked_candidates, ranked_scores)` pair of new lists.
    """
    # `sorted` is stable even with reverse=True, so ties preserve input order.
    order = sorted(range(len(scores)), key=lambda position: scores[position], reverse=True)

    ranked_candidates = [candidates[position] for position in order]
    ranked_scores = [scores[position] for position in order]
    return ranked_candidates, ranked_scores


In [30]:
#From the reference file, gets the candidates response associated with each test sample
def get_candidates(df, columns):
    """Collect, for each row, the string cells found in the given columns.

    Args:
        df: table indexed as `df[column][row]`, with rows `0 .. len(df) - 1`.
        columns: column names to read, in order.

    Returns:
        One list per row holding that row's cells from `columns`; cells that
        are not `str` instances are dropped.
    """
    return [
        [cell for cell in (df[column][row] for column in columns) if isinstance(cell, str)]
        for row in range(len(df))
    ]

In [31]:
#DEFINE FILTER AND RERANK FUNCTION FOR ONE REFERENCE FILE

def filter_rerank(df):
    """Select the best candidate response for every row of one candidates file.

    For each row: take the first candidates (`filter`), score them against the
    row's expected labels (`get_scores`), order them (`rerank`) and keep the
    top one together with its similarity score.

    Args:
        df: DataFrame with 'actual responses', 'inputs' and 'labels' columns;
            every other column is read as a candidate response.

    Returns:
        Dict of equal-length lists under the keys 'input', 'reference',
        'prediction' and 'similarity'. Rows without any candidate get the
        prediction 'None' and the similarity 0.0.
    """
    references = df['actual responses']
    inputs = df['inputs']
    labels = df['labels']

    # Everything except the reference and the input. 'labels' stays in this
    # list; `get_candidates` keeps only string cells from these columns.
    columns = [name for name in df.columns if name not in ("actual responses", 'inputs')]
    print(columns)

    candidates = get_candidates(df, columns)

    results = {'input': [], 'reference': [], 'prediction': [], 'similarity': []}

    for i in tqdm(range(len(df))):
        shortlist = filter(candidates[i])
        shortlist_scores = get_scores(shortlist, labels[i])

        # Nothing to order when the row has no candidate.
        if len(shortlist_scores) > 0:
            ranked_candidates, ranked_scores = rerank(shortlist, shortlist_scores)
        else:
            ranked_candidates, ranked_scores = shortlist, shortlist_scores

        results['input'].append(inputs[i])
        results['reference'].append(references[i])

        has_winner = len(ranked_candidates) > 0
        results['prediction'].append(ranked_candidates[0] if has_winner else 'None')
        results['similarity'].append(ranked_scores[0] if has_winner else 0.0)

    return results

In [None]:
#LOOP THE PROCESS ON ALL CONFIGURATIONS OF MODEL / CD1/2
# For every candidate file and every mode: select the final response of each
# test sample, save the selections as CSV, then score them against the actual
# responses and write the metrics to a text file next to the CSV.

for model_name, reference_path in models_list.items():
    print(model_name, reference_path) # FOR EACH SET OF RESULT

    for mode in modes:
        print('CURRENT MODE: ', mode)
        df = pd.read_csv(reference_path, encoding = 'UTF-8')

        # CD1 uses the labels observed in the dataset, CD2 the generated ones.
        label_column = 'ground_truth' if mode == "CD1" else 'predicted'
        df['labels'] = [ast.literal_eval(x) for x in labels_ref[label_column]]

        print('Computing results...')

        #CALL FILTER AND RERANK FUNCTION
        results = filter_rerank(df)

        #SAVE FINAL CANDIDATES
        output_path = '/Filter Rerank/results_filter_window3/'
        # Make sure the target directory exists before writing into it.
        os.makedirs(output_path, exist_ok=True)
        file_generated = output_path + model_name + "_final_responses_window3_" + mode
        print(file_generated)

        print('Saving results...')
        df_new = pd.DataFrame(results)
        df_new.to_csv(file_generated +'.csv', index = False, encoding = 'UTF-8')

        # The selected candidates are the system output; the dataset responses
        # are the references. BLEU, chrF and the precision/recall figures are
        # not symmetric, so the two must not be exchanged.
        generated_responses = results['prediction']
        actual_responses = results['reference']

        #GET METRICS
        print('Computing scores...')
        bleu_score = sacrebleu.compute(predictions=generated_responses, references=actual_responses)

        rouge_score = rouge.compute(predictions=generated_responses, references=actual_responses)

        bert_score = bertscore.compute(predictions=generated_responses, references=actual_responses, lang='en')
        precision = bert_score['precision']
        recall = bert_score['recall']
        f1 = bert_score['f1']
        avg_precision_bert = sum(precision) / len(precision)
        avg_recall_bert = sum(recall) / len(recall)
        avg_f1_bert = sum(f1) / len(f1)

        chrf_score = chrf.compute(predictions=generated_responses, references=actual_responses)

        #COSINE SIMILARITY
        # Pairwise (row i against row i) cosine between sentence embeddings.
        embedding_1 = cosine_model.encode(generated_responses)
        embedding_2 = cosine_model.encode(actual_responses)

        cosine = [util.pytorch_cos_sim(embedding_1[i], embedding_2[i]) for i in range(len(generated_responses))]
        new_cosine = [pair_score[0].item() for pair_score in cosine]

        mean_cosine = mean(new_cosine)

        #WRITE IN TXT FILE
        print('Saving results...')
        # The context manager closes the report even if a write fails.
        with open(file_generated + ".txt", "w", encoding="UTF-8") as fout:
            fout.write('Bleu score: {} \n'.format(bleu_score)) #Range from 0 to 100
            fout.write('Rouge score: {} \n'.format(rouge_score))
            fout.write('Bert score:  {} \n'.format(bert_score))
            fout.write('Avg precision Bert score: {} \n'.format(avg_precision_bert))
            fout.write('Avg recall Bert score: {} \n'.format(avg_recall_bert))
            fout.write('Avg f1 Bert score: {} \n'.format(avg_f1_bert))
            fout.write('chrf score: {} \n'.format(chrf_score))
            fout.write('average similarity: {}\n'.format(mean(results['similarity'])))
            fout.write('average cosine similarity: {} \n'.format(mean_cosine))