In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import evaluate


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "madhavappaneni/t5-small-empathetic-dialogues"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)


In [12]:
def generate_response(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    outputs = model.generate(input_ids,
                             max_length=30,
                             do_sample=True,
                             num_beams=5,
                             no_repeat_ngram_size=2)
    generated_text = tokenizer.decode(
        outputs[0], skip_special_tokens=True)
    return generated_text


generate_response("My car crashed yesterday")


"Oh no! I'm sorry. What happened?"

In [13]:
import pandas as pd

test_df = pd.read_pickle('./datasets/test.pkl')

test_df.shape


(5242, 1)

In [14]:
reference_sentences = []
generated_sentences = []
start = 1
end = 5
for index, row in test_df[start:end].iterrows():
    input_text, label = row['conversation']['input_text'], row['conversation']['label']
    generated_sentences.append(generate_response(input_text))
    reference_sentences.append(label)

print(generated_sentences)
print(reference_sentences)


["I'm sorry to hear that.", "That's awesome! What kind of chocolate did you get?", "That's a good thing. I'm sure he'll do well.", "I'm sorry to hear that."]
["Why did you feel guilty? People really shouldn't drive drunk.", 'Wow that must have been a surprise for you', 'you got a great husband', "I know how you feel, it's terrible. I actually moved to a completely different state due to hi taxes."]


In [15]:
batch_size = 100
num_batches = int(len(test_df) / batch_size)

for batch in range(num_batches):
    start_index = batch * batch_size
    # start_index = batch * batch_size + 2600
    end_index = min(start_index + batch_size, len(test_df))
    batch_data = test_df[start_index:end_index]
    reference_sentences = []
    generated_sentences = []
    for index, row in batch_data.iterrows():
        input_text, label = row['conversation']['input_text'], row['conversation']['label']
        generated_sentences.append(generate_response(input_text))
        reference_sentences.append(label)
    df = pd.DataFrame({'reference_sentences': reference_sentences,
                       'generated_sentences': generated_sentences})
    df.to_pickle(f'./files/ref_gen_{start_index}.pkl')


Token indices sequence length is longer than the specified maximum sequence length for this model (1879 > 512). Running this sequence through the model will result in indexing errors


In [16]:
import os
import pandas as pd

# Create an empty list to store the DataFrames
df_list = []

directory = './files/'

# Loop through all the files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.pkl'):
        # Read the file as a DataFrame and append to the list
        filepath = os.path.join(directory, filename)
        df_list.append(pd.read_pickle(filepath))

# Concatenate all the DataFrames in the list into a single DataFrame
merged_df = pd.concat(df_list, ignore_index=True)
merged_df.shape
merged_df.to_pickle('./files/ref_gen_master.pkl')


In [17]:
merged_df = pd.read_pickle('./files/ref_gen_master.pkl')
merged_df.head()


Unnamed: 0,reference_sentences,generated_sentences
0,Well if you stay in touch then I'm sure she'll...,That's a good idea. I'm sorry to hear that.
1,What book do you enjoy reading?,That's great. What kind of wine do you have?
2,"No, actually I'm under a lot of stress right n...",I love historical novels and legal dramas. I'm...
3,Where is your home village?,That's so sad! I'm sorry to hear that. What di...
4,"That's a shame. I""m sorry to hear that.",That's a lot of fun. I'm so glad you're able t...


In [18]:
reference_sentences = list(merged_df['reference_sentences'])
generated_sentences = list(merged_df['generated_sentences'])
len(reference_sentences), len(generated_sentences)


(5200, 5200)

#BLEU


In [19]:
bleu = evaluate.load("bleu", module_type="metric")
results = bleu.compute(predictions=generated_sentences,
                       references=reference_sentences)

results['bleu']


0.017902194965308524

#BLEURT


In [20]:
# bleurt = evaluate.load("bleurt", module_type="metric")
# results = bleurt.compute(predictions=generated_sentences,
#                          references=reference_sentences)


#BERT


In [21]:
from bert_score import score

precision, recall, f1 = score(
    generated_sentences, reference_sentences, lang='en', verbose=False)

print(f"BERT score (F1): {f1.mean().item():.2f}")


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERT score (F1): 0.86


#ROUGE


In [22]:
import evaluate
rouge = evaluate.load('rouge')
rouge_score = rouge.compute(
    predictions=generated_sentences, references=reference_sentences)
rouge_score


{'rouge1': 0.1705165416871462,
 'rouge2': 0.03395053421928472,
 'rougeL': 0.15224178051421672,
 'rougeLsum': 0.15217369523103003}

#Perplexity


In [None]:
# perplexity = evaluate.load("perplexity", module_type="metric")
# perplexity_results = perplexity.compute(
#     predictions=generated_sentences, model_id='gpt2')
# print(f"Perplexity score: {perplexity_results['mean_perplexity']}")

# TODO:
# This code takes a lot of time to run as it compares with GPT2 response. Perplexity can also be calculated from loss. Check that possibility
