In [None]:
!pip3 install rouge_score nltk absl-py transformers evaluate sentencepiece
!pip3 install bleurt



In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import evaluate
import os
import pandas as pd


In [3]:
model_name = "madhavappaneni/t5-small-empathetic-dialogues"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)


In [4]:
def generate_response(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    outputs = model.generate(input_ids,
                             min_length=20,
                             max_length=30,
                             do_sample=True,
                             num_beams=3,
                             no_repeat_ngram_size=2)
    generated_text = tokenizer.decode(
        outputs[0], skip_special_tokens=True)
    return generated_text


generate_response("My car crashed yesterday")


"Oh no! That's terrible! Did you get a new one or did you miss it?"

In [13]:
import pandas as pd

test_df = pd.read_pickle('./datasets/test.pkl')

test_df.shape


(5242, 1)

In [14]:
reference_sentences = []
generated_sentences = []
start = 2200
end = 2215
input_texts = []
for index, row in test_df[start:end].iterrows():
    input_text, label = row['conversation']['input_text'], row['conversation']['label']
    generated_sentences.append(generate_response(input_text))
    reference_sentences.append(label)
    input_texts.append(row['conversation']['input_text'])

# print(generated_sentences)
# print(reference_sentences)

# for x, y, z in zip(input_texts, reference_sentences, generated_sentences):
#     print(x)
#     print(y)
#     print(z)
#     print()


In [15]:
batch_size = 100
num_batches = int(len(test_df) / batch_size)

for batch in range(num_batches):
    start_index = batch * batch_size
    # start_index = batch * batch_size + 2600
    end_index = min(start_index + batch_size, len(test_df))
    batch_data = test_df[start_index:end_index]
    reference_sentences = []
    generated_sentences = []
    for index, row in batch_data.iterrows():
        input_text, label = row['conversation']['input_text'], row['conversation']['label']
        generated_sentences.append(generate_response(input_text))
        reference_sentences.append(label)
    df = pd.DataFrame({'reference_sentences': reference_sentences,
                       'generated_sentences': generated_sentences})
    df.to_pickle(f'./files/ref_gen_{start_index}.pkl')


Token indices sequence length is longer than the specified maximum sequence length for this model (1879 > 512). Running this sequence through the model will result in indexing errors


In [16]:
df_list = []

directory = './files/'

for filename in os.listdir(directory):
    if filename.endswith('.pkl'):
        filepath = os.path.join(directory, filename)
        df_list.append(pd.read_pickle(filepath))

merged_df = pd.concat(df_list, ignore_index=True)
merged_df.shape
merged_df.to_pickle('./files/ref_gen_master.pkl')


In [7]:
merged_df = pd.read_pickle('./files/ref_gen_master.pkl')
merged_df.head()


Unnamed: 0,reference_sentences,generated_sentences
0,Well if you stay in touch then I'm sure she'll...,That's a good idea. I'm sorry to hear that.
1,What book do you enjoy reading?,That's great. What kind of wine do you have?
2,"No, actually I'm under a lot of stress right n...",I love historical novels and legal dramas. I'm...
3,Where is your home village?,That's so sad! I'm sorry to hear that. What di...
4,"That's a shame. I""m sorry to hear that.",That's a lot of fun. I'm so glad you're able t...


In [8]:
reference_sentences = list(merged_df['reference_sentences'])
generated_sentences = list(merged_df['generated_sentences'])
len(reference_sentences), len(generated_sentences)


(5200, 5200)

#BLEU


In [11]:
bleu = evaluate.load("bleu", module_type="metric")

bleu1 = bleu.compute(predictions=generated_sentences,
                     references=reference_sentences, max_order=1)
bleu2 = bleu.compute(predictions=generated_sentences,
                     references=reference_sentences, max_order=2)
bleu3 = bleu.compute(predictions=generated_sentences,
                     references=reference_sentences, max_order=3)
bleu4 = bleu.compute(predictions=generated_sentences,
                     references=reference_sentences, max_order=4)

print('BLEU 1', bleu1['bleu'])
print('BLEU 2', bleu2['bleu'])
print('BLEU 3', bleu3['bleu'])
print('BLEU 4', bleu4['bleu'])


print('Average BLEU:', (bleu1['bleu'] +
      bleu2['bleu'] + bleu3['bleu'] + bleu4['bleu'])/4)


BLEU 1 0.14640645018745724
BLEU 2 0.05559884389157614
BLEU 3 0.02949287368717347
BLEU 4 0.017902194965308524
Average BLEU: 0.06235009068287885


#BLEURT


In [11]:
from bleurt import score

scorer = score.BleurtScorer()
scores = scorer.score(references=reference_sentences, candidates=generated_sentences)
print(sum(scores)/len(scores))

INFO:tensorflow:No checkpoint specified, defaulting to BLEURT-tiny.
INFO:tensorflow:Reading checkpoint C:\Users\Debolina\AppData\Roaming\Python\Python311\site-packages\bleurt\test_checkpoint.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint dbleurt_tiny
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:dbleurt_tiny
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.
INFO:tensorflow:BLEURT initialized.
-0.5719969442758995


#BERT


In [21]:
from bert_score import score

precision, recall, f1 = score(
    generated_sentences, reference_sentences, lang='en', verbose=False)

print(f"BERT score (F1): {f1.mean().item():.2f}")


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERT score (F1): 0.89


#ROUGE


In [22]:
rouge = evaluate.load('rouge')
rouge_score = rouge.compute(
    predictions=generated_sentences, references=reference_sentences)
rouge_score


{'rouge1': 0.2745165416871462,
 'rouge2': 0.17395053421928472,
 'rougeL': 0.15224178051421672,
 'rougeLsum': 0.15217369523103003}