In [None]:
!pip3 install rouge_score nltk absl-py


In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import evaluate


In [350]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "madhavappaneni/t5-small-chit-chat"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)


In [351]:
def generate_response(input_text):
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    outputs = model.generate(input_ids, max_length=30,
                             do_sample=True, num_beams=5, no_repeat_ngram_size=2)
    generated_text = tokenizer.decode(
        outputs[0], skip_special_tokens=True)
    return generated_text


generate_response("How are you?")


'How are you?'

In [352]:
import pandas as pd

test_df = pd.read_pickle('./datasets/test.pkl')

test_df.shape


(6662, 1)

In [360]:
reference_sentences = []
generated_sentences = []
start = 1
end = 5
for index, row in test_df[start:end].iterrows():
    input_text, label = row['conversation']['input_text'], row['conversation']['label']
    generated_sentences.append(generate_response(input_text))
    reference_sentences.append(label)

print(generated_sentences)
print(reference_sentences)


["I'm not. I haven't heard of a lot of people.", "I don't know if I'm going to go to church in about 20mins!", "Oh my goodness! I'm up to you! What's your favorite thing about you?", 'Nice!']
['I am doing really well thanks for asking Are you just relaxing or doing anything fun today?', 'Oh that is exciting! Yeah real quick then BYU did this so they can get conversation data to help improve AI units to be more capable of conversation Rather than just one party talking the whole time', 'Yeah its pretty cool I am just spending the next few hours relaxing', 'Yeah It is always nice to take a break I dont really ever take enough']


In [361]:
for batch in range(67):
    start_index = batch * 100
    end_index = min(start_index + 100, 6662)
    batch_data = test_df[start_index:end_index]
    reference_sentences = []
    generated_sentences = []
    for index, row in batch_data.iterrows():
        input_text, label = row['conversation']['input_text'], row['conversation']['label']
        generated_sentences.append(generate_response(input_text))
        reference_sentences.append(label)
        df = pd.DataFrame({'reference_sentences': reference_sentences,
                           'generated_sentences': generated_sentences})
        df.to_pickle('./files/ref_gen_'+str(start_index)+'.pkl')


Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors


In [None]:
batch_size = 100
num_batches = int(len(test_df) / batch_size)

for batch in range(num_batches):
    start_index = batch * batch_size
    end_index = min(start_index + batch_size, len(test_df))
    batch_data = test_df[start_index:end_index]
    reference_sentences = []
    generated_sentences = []
    for index, row in batch_data.iterrows():
        input_text, label = row['conversation']['input_text'], row['conversation']['label']
        generated_sentences.append(generate_response(input_text))
        reference_sentences.append(label)
    df = pd.DataFrame({'reference_sentences': reference_sentences,
                       'generated_sentences': generated_sentences})
    df.to_pickle(f'./files/ref_gen_{start_index}.pkl')


In [362]:
import os
import pandas as pd

# Create an empty list to store the DataFrames
df_list = []

directory = './files/'

# Loop through all the files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.pkl'):
        # Read the file as a DataFrame and append to the list
        filepath = os.path.join(directory, filename)
        df_list.append(pd.read_pickle(filepath))

# Concatenate all the DataFrames in the list into a single DataFrame
merged_df = pd.concat(df_list, ignore_index=True)
merged_df.shape
merged_df.to_pickle('./files/ref_gen_master.pkl')


In [366]:
merged_df = pd.read_pickle('./files/ref_gen_master.pkl')
merged_df.head()


Unnamed: 0,reference_sentences,generated_sentences
0,Nice! So you are planning on going to the part...,I'm glad you have a good day!
1,"Haha, maybe you can find a girlfriend there. W...","Ya, it will be fun to meet up with all the peo..."
2,What if it's a dude. I think what we could hav...,Haha yeah I think that would be a lot of fun. ...
3,"try it. It would be very casual, and she would...",I dont know what happened on the test. I feel ...
4,"As for the sixty eight, welcome to all of my c...","I asked her if she wanted to, but she was a girl."


In [367]:
reference_sentences = list(merged_df['reference_sentences'])
generated_sentences = list(merged_df['generated_sentences'])
len(reference_sentences), len(generated_sentences)


(6742, 6742)

#BLEU


In [375]:
bleu = evaluate.load("bleu", module_type="metric")
results = bleu.compute(predictions=generated_sentences,
                       references=reference_sentences)

results['bleu']


0.0043790118008002815

#BLEURT


In [370]:
# bleurt = evaluate.load("bleurt", module_type="metric")
# results = bleurt.compute(predictions=generated_sentences,
#                          references=reference_sentences)


#BERT


In [371]:
from bert_score import score

precision, recall, f1 = score(
    generated_sentences, reference_sentences, lang='en', verbose=False)

print(f"BERT score (F1): {f1.mean().item():.2f}")


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERT score (F1): 0.84


#ROUGE


In [373]:
import evaluate
rouge = evaluate.load('rouge')
rouge_score = rouge.compute(
    predictions=generated_sentences, references=reference_sentences)
rouge_score


{'rouge1': 0.1079494561188625,
 'rouge2': 0.015083678226932799,
 'rougeL': 0.09158095584013573,
 'rougeLsum': 0.09160558517419273}

#Perplexity


In [20]:
# perplexity = evaluate.load("perplexity", module_type="metric")
# perplexity_results = perplexity.compute(
#     predictions=generated_sentences, model_id='gpt2')
# print(f"Perplexity score: {perplexity_results['mean_perplexity']}")

# TODO:
# This code takes a lot of time to run as it compares with GPT2 response. Perplexity can also be calculated from loss. Check that possibility


Using pad_token, but it is not set yet.


  0%|          | 0/7 [00:00<?, ?it/s]

Perplexity score: 47.30846137523651
