### Reference: 

https://huggingface.co/MaRiOrOsSi/t5-base-finetuned-question-answering

In [1]:
from  transformers  import  AutoTokenizer, AutoModelWithLMHead, pipeline
import pandas as pd

In [2]:
model_name = "MaRiOrOsSi/t5-base-finetuned-question-answering"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
model = AutoModelWithLMHead.from_pretrained(model_name)



In [5]:
question = "Who graduated from QUT?"
context = "The name of the scientist is Dr. Bashar. He graduated from QUT."
input = f"question: {question} context: {context}"
encoded_input = tokenizer([input],
                             return_tensors='pt',
                             max_length=512,
                             truncation=True)
output = model.generate(input_ids = encoded_input.input_ids,
                            attention_mask = encoded_input.attention_mask)
output = tokenizer.decode(output[0], skip_special_tokens=True)
print(output)

Dr. Bashar




In [6]:
from datasets import load_dataset
squad = load_dataset("squad")

Found cached dataset parquet (C:/Users/basharm/.cache/huggingface/datasets/parquet/plain_text-d0ce9b3222e19e32/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
squad['validation']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [8]:
def predict_answer(question, context):
    input = f"question: {question} context: {context}"
    encoded_input = tokenizer([input],
                             return_tensors='pt',
                             max_length=512,
                             truncation=True)
    output = model.generate(input_ids = encoded_input.input_ids,
                            attention_mask = encoded_input.attention_mask)
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    
    return output

count = 0
out_rows = []
for row in squad['validation']:
    question = row['question']
    #print(question)
    context = row['context']
    #print(context)
    ans = row['answers']['text'][0]
    #print(ans)
    pred_ans = predict_answer(question, context)
    #print(pred_ans)
    out_row = {'answer': ans, 'pred_ans': pred_ans}
    out_rows.append(out_row)
    count += 1
    if count%100==0:
        print('Completed Respon Count', count)
    
df_out_val = pd.DataFrame(out_rows)



Completed Respon Count 100
Completed Respon Count 200
Completed Respon Count 300
Completed Respon Count 400
Completed Respon Count 500
Completed Respon Count 600
Completed Respon Count 700
Completed Respon Count 800
Completed Respon Count 900
Completed Respon Count 1000
Completed Respon Count 1100
Completed Respon Count 1200
Completed Respon Count 1300
Completed Respon Count 1400
Completed Respon Count 1500
Completed Respon Count 1600
Completed Respon Count 1700
Completed Respon Count 1800
Completed Respon Count 1900
Completed Respon Count 2000
Completed Respon Count 2100
Completed Respon Count 2200
Completed Respon Count 2300
Completed Respon Count 2400
Completed Respon Count 2500
Completed Respon Count 2600
Completed Respon Count 2700
Completed Respon Count 2800
Completed Respon Count 2900
Completed Respon Count 3000
Completed Respon Count 3100
Completed Respon Count 3200
Completed Respon Count 3300
Completed Respon Count 3400
Completed Respon Count 3500
Completed Respon Count 3600
C

In [9]:
df_out_val

Unnamed: 0,answer,pred_ans
0,Denver Broncos,Denver Broncos
1,Carolina Panthers,Denver Broncos
2,"Santa Clara, California","Santa Clara, California"
3,Denver Broncos,Denver Broncos
4,gold,gold
...,...,...
10565,kilogram-force,Kilowatt-force
10566,kilopond,Kilopond
10567,slug,metric slug
10568,kip,Kilowatt-force


In [10]:
df_out_val.to_csv('T5_squad_valid.csv', index=None)

In [11]:
def exact_match(pred_tokens, true_tokens):
    '''
    A straightforward way to check the equality of the two lists in Python 
    is by using the equality == operator. 
    When the equality == is used on the list type in Python, 
    it returns True if the lists are equal and False if they are not.
    '''
    return int(pred_tokens==true_tokens)

def half_exact_match(pred_tokens, true_tokens):
    '''
    A straightforward way to check the equality of the two lists in Python 
    is by using the equality == operator. 
    When the equality == is used on the list type in Python, 
    it returns True if the lists are equal and False if they are not.
    '''
    if len(pred_tokens)<=1 or len(true_tokens)<=1:
        return int(pred_tokens==true_tokens) 
    else:
        return int(pred_tokens[0]==true_tokens[0] or pred_tokens[-1]==true_tokens[-1])
    
def any_token_match(pred_tokens, true_tokens):
    common_tokens = set(pred_tokens) & set(true_tokens)
    return int(len(common_tokens)>0)
    

def get_prec_rec_f1(pred_tokens, true_tokens):
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(true_tokens) == 0:
        prec = rec = f1 = 1
        return prec, rec, f1
    
    common_tokens = set(pred_tokens) & set(true_tokens)
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        prec = rec = f1 = 0
        return prec, rec, f1
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(true_tokens)
    f1 = 2 * (prec * rec) / (prec + rec)
    
    return prec, rec, f1

In [13]:
match = 0
half_match = 0
any_match = 0
prec = 0
rec = 0
f1 = 0
count = 0
for idx in df_out_val.index:

    pred_tokens = df_out_val.loc[idx, 'pred_ans'].split()
    true_tokens = df_out_val.loc[idx, 'answer'].split()
    
    match +=  exact_match(pred_tokens, true_tokens)
    half_match += half_exact_match(pred_tokens, true_tokens)
    any_match += any_token_match(pred_tokens, true_tokens)
    
    score =  get_prec_rec_f1(pred_tokens, true_tokens)
    prec += score[0]
    rec += score[1]
    f1 += score[2]
    count += 1

    
import datetime
now = datetime.datetime.now()
string = ''
string += '========={}========\n'.format(now)
string += '========={}========\n'.format(now)
string += 'exact_match: '+str(match/count)+'\n'
string += 'half_exact_match: '+str(half_match/count)+'\n'
string += 'any_match: '+str(any_match/count)+'\n'
string += 'recall: '+str(rec/count)+'\n'
string += 'precision: '+str(prec/count)+'\n'
string += 'f1: '+str(f1/count)+'\n\n'
print(string)

with open('Report_T5_Squad.txt', 'a+') as FO:
    FO.write(string)

exact_match: 0.38420056764427624
half_exact_match: 0.584484389782403
any_match: 0.7371807000946073
recall: 0.6144846712364499
precision: 0.6243597824517078
f1: 0.6001928249863442


