<a href="https://colab.research.google.com/github/omarkapur-mids/w266-project/blob/phillip/T5_DROP_F1_EM_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using T5 on DROP

#### Package installs

In [1]:
# !pip install --quiet transformers
# !pip install --quiet sentencepiece
# !pip install --quiet wget
# !pip install --quiet datasets
# #!pip install --quiet ipywidgets
# #!pip install --quiet tensorflow


#### check gpu

In [2]:
# !nvidia-smi

#### Download allennlp drop_eval module

https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py

In [3]:
!python -m wget https://raw.githubusercontent.com/allenai/allennlp-reading-comprehension/master/allennlp_rc/eval/drop_eval.py -o drop_eval.py


Saved under drop_eval (58).py


#### set directories

In [4]:
!mkdir data

data_dir = "./data"
log_dir = f"{data_dir}/experiments/t5/logs"
save_path = f"{data_dir}/experiments/t5/models"
cache_path_train = f"{data_dir}/cache/t5.train"
cache_path_test = f"{data_dir}/cache/t5.test"

mkdir: cannot create directory ‘data’: File exists


#### load packages

In [5]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
import tensorflow as tf
import tensorflow.keras as keras
import drop_eval
import pandas as pd
import numpy as np
import json
from datasets import Dataset, load_dataset
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import datetime


%load_ext tensorboard


run_toy = True

In [6]:
drop_eval.get_metrics(predicted=["1, 2 3 4?", "this"],gold=["1, 2 3 4?","0"])

(0.0, 0.5)

In [7]:
# output_1 = model.predict(tf_train_ds.take(1))

In [8]:
# for x in output_1:
#   print(x)

#### Define model class

In [9]:
class T5forDrop(TFT5ForConditionalGeneration):
    def __init__(self, *args, log_dir=None, cache_dir= None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_tracker= tf.keras.metrics.Mean(name='loss')
        self.F1_tracker= tf.keras.metrics.Mean(name='F1')
        self.EM_tracker= tf.keras.metrics.Mean(name='EM')        

    
    @tf.function
    def train_step(self, data):
        x = data
        y_true = x["labels"]
        with tf.GradientTape() as tape:
            outputs = self(x, training=True)
            logits = outputs['logits']
            y_pred = tf.math.argmax(tf.nn.softmax(logits,axis=2), axis = 2, output_type=tf.int32)
            loss = tf.reduce_mean(outputs['loss'])            
            grads = tape.gradient(loss, self.trainable_variables)

        # Calculate F1 and EM Metrics
        # Create a word mask to not count the padding/sentence tokens
        recall_word_mask = tf.math.logical_and(
                    tf.math.not_equal(y_true,0)
                    ,tf.math.not_equal(y_true,1))

        precision_word_mask = tf.math.logical_and(
                    tf.math.not_equal(y_pred,0)
                    ,tf.math.not_equal(y_pred,1))

        # match the tokens
        match_token = tf.math.equal(y_true,y_pred)
        recall_match = tf.math.logical_and(match_token,recall_word_mask) 
        precision_match = tf.math.logical_and(match_token,precision_word_mask) 

        # calculate score
        precision_array = tf.math.reduce_sum(tf.cast(precision_match, tf.int32) ,axis=1)/tf.math.reduce_sum(tf.cast(precision_word_mask, tf.int32) ,axis=1)
        recall_array = tf.math.reduce_sum(tf.cast(recall_match, tf.int32) ,axis=1)/tf.math.reduce_sum(tf.cast(recall_word_mask, tf.int32) ,axis=1)

        P = tf.math.reduce_mean(precision_array)
        R = tf.math.reduce_mean(recall_array)

        EM = tf.math.reduce_mean(
            tf.cast(tf.math.logical_and(
                    tf.math.equal(precision_array,1)
                    ,tf.math.equal(recall_array,1)), tf.int32))
        F1 = 2*(P*R)/(P+R)
        
        '''
        Note, since FP and FN are the same, 
        the F1 score is the same as Precision, Recall, 
        which is the average "match" score
        '''

        y_true = tf.reshape(y_true, [-1, 1])

        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        lr = self.optimizer._decayed_lr(tf.float32)

        self.loss_tracker.update_state(loss)
        self.F1_tracker.update_state(F1)   
        self.EM_tracker.update_state(EM)           
        self.compiled_metrics.update_state(y_true, logits)
        metrics = {m.name: m.result() for m in self.metrics}
        metrics.update({'lr': lr})
        
        return metrics

    def test_step(self, data):
        x = data
        y_true = x["labels"]
        outputs = self(x, training=True)
        logits = outputs['logits']
        y_pred = tf.math.argmax(tf.nn.softmax(logits,axis=2), axis = 2, output_type=tf.int32)
        loss = tf.reduce_mean(outputs['loss'])      
        
        # Calculate F1 and EM Metrics
        # Create a word mask to not count the padding/sentence tokens
        recall_word_mask = tf.math.logical_and(
                    tf.math.not_equal(y_true,0)
                    ,tf.math.not_equal(y_true,1))

        precision_word_mask = tf.math.logical_and(
                    tf.math.not_equal(y_pred,0)
                    ,tf.math.not_equal(y_pred,1))

        # match the tokens
        match_token = tf.math.equal(y_true,y_pred)
        recall_match = tf.math.logical_and(match_token,recall_word_mask) 
        precision_match = tf.math.logical_and(match_token,precision_word_mask) 

        # calculate score
        precision_array = tf.math.reduce_sum(tf.cast(precision_match, tf.int32) ,axis=1)/tf.math.reduce_sum(tf.cast(precision_word_mask, tf.int32) ,axis=1)
        recall_array = tf.math.reduce_sum(tf.cast(recall_match, tf.int32) ,axis=1)/tf.math.reduce_sum(tf.cast(recall_word_mask, tf.int32) ,axis=1)

        P = tf.math.reduce_mean(precision_array)
        R = tf.math.reduce_mean(recall_array)

        EM = tf.math.reduce_mean(
            tf.cast(tf.math.logical_and(
                    tf.math.equal(precision_array,1)
                    ,tf.math.equal(recall_array,1)), tf.int32))
        F1 = 2*(P*R)/(P+R)

        y_true = tf.reshape(y_true, [-1, 1])

        self.loss_tracker.update_state(loss)
        self.F1_tracker.update_state(F1)   
        self.EM_tracker.update_state(EM)           
        self.compiled_metrics.update_state(y_true, logits)
        metrics = {m.name: m.result() for m in self.metrics}        
        return metrics


        EM = tf.math.reduce_mean(
            tf.cast(tf.math.logical_and(
                    tf.math.equal(precision_array,1)
                    ,tf.math.equal(recall_array,1)), tf.int32))

#### Import model and tokenizer

In [10]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5forDrop.from_pretrained('t5-small')#,return_dict=True)

All model checkpoint layers were used when initializing T5forDrop.

All the layers of T5forDrop were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5forDrop for predictions without further training.


#### Import data

In [11]:
train_dataset_full = load_dataset('drop', split='train')
valid_dataset_full = load_dataset('drop', split='validation')

train_dataset_full.features

Using custom data configuration default
Reusing dataset drop (/root/.cache/huggingface/datasets/drop/default/0.1.0/393cc04823935c1302a6a7e380cdbe9f452d37858ea276409787c983748eae25)
Using custom data configuration default
Reusing dataset drop (/root/.cache/huggingface/datasets/drop/default/0.1.0/393cc04823935c1302a6a7e380cdbe9f452d37858ea276409787c983748eae25)


{'answers_spans': Sequence(feature={'spans': Value(dtype='string', id=None), 'types': Value(dtype='string', id=None)}, length=-1, id=None),
 'passage': Value(dtype='string', id=None),
 'query_id': Value(dtype='string', id=None),
 'question': Value(dtype='string', id=None),
 'section_id': Value(dtype='string', id=None)}

#### Reduce data to toy size

In [12]:

if run_toy:
    toy_train_df = train_dataset_full.to_pandas()
    toy_train_df = toy_train_df.head(4)

    toy_valid_df = valid_dataset_full.to_pandas()
    toy_valid_df = toy_valid_df.head(4)

    train_dataset = Dataset.from_pandas(toy_train_df)
    valid_dataset = Dataset.from_pandas(toy_valid_df)
else:
    train_dataset = train_dataset_full
    valid_dataset = valid_dataset_full

#### check out one record

In [13]:
data = next(iter(valid_dataset))
print("Example data from the dataset: \n", data)

Example data from the dataset: 
 {'section_id': 'nfl_1184', 'query_id': 'f37e81fa-ef7b-4583-b671-762fc433faa9', 'passage': " Hoping to rebound from their loss to the Patriots, the Raiders stayed at home for a Week 16 duel with the Houston Texans.  Oakland would get the early lead in the first quarter as quarterback JaMarcus Russell completed a 20-yard touchdown pass to rookie wide receiver Chaz Schilens.  The Texans would respond with fullback Vonta Leach getting a 1-yard touchdown run, yet the Raiders would answer with kicker Sebastian Janikowski getting a 33-yard and a 30-yard field goal.  Houston would tie the game in the second quarter with kicker Kris Brown getting a 53-yard and a 24-yard field goal. Oakland would take the lead in the third quarter with wide receiver Johnnie Lee Higgins catching a 29-yard touchdown pass from Russell, followed up by an 80-yard punt return for a touchdown.  The Texans tried to rally in the fourth quarter as Brown nailed a 40-yard field goal, yet the

#### set parameters

In [14]:
warmup_steps = 10 #1e4
batch_size = 4
encoder_max_len = 250
decoder_max_len = 54
buffer_size = 1000
ntrain = len(train_dataset)
nvalid = len(valid_dataset)
steps = int(np.ceil(ntrain/batch_size))
valid_steps = int(np.ceil(nvalid/batch_size))
print("Total Steps: ", steps)
print("Total Validation Steps: ", valid_steps)

Total Steps:  1
Total Validation Steps:  1


#### Preprocess data

In [15]:
def encode(example,
           encoder_max_len=encoder_max_len, decoder_max_len=decoder_max_len):
  
    context = example['passage']
    question = example['question']
    
    answer = example['answers_spans']['spans']
#     answer_type = example['answers_spans']['types']
    
    question_plus = f"answer_me: {str(question)}"
    question_plus += f" context: {str(context)}"
    
    answer_plus = ', '.join([i for i in list(answer)])
    answer_plus = f"{answer_plus}"
    
    encoder_inputs = tokenizer(question_plus, truncation=True, 
                               return_tensors='tf', max_length=encoder_max_len,
                              padding=True)
    
    decoder_inputs = tokenizer(answer_plus, truncation=True, 
                               return_tensors='tf', max_length=decoder_max_len,
                              padding=True)
    
    input_ids = encoder_inputs['input_ids'][0]
    input_attention = encoder_inputs['attention_mask'][0]
    target_ids = decoder_inputs['input_ids'][0]
    target_attention = decoder_inputs['attention_mask'][0]
    
    outputs = {'input_ids':input_ids, 'attention_mask': input_attention, 
               'labels':target_ids, 'decoder_attention_mask':target_attention}
    return outputs
    
    

In [16]:
train_ds = train_dataset.map(encode)
valid_ds = valid_dataset.map(encode)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [17]:
def to_tf_dataset(dataset):  
    columns = ['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask']
    dataset.set_format(type='tensorflow', columns=columns)
    return_types = {'input_ids':tf.int32, 'attention_mask':tf.int32, 
                'labels':tf.int32, 'decoder_attention_mask':tf.int32,  }
    return_shapes = {'input_ids': tf.TensorShape([None]), 'attention_mask': tf.TensorShape([None]), 
                  'labels': tf.TensorShape([None]), 'decoder_attention_mask':tf.TensorShape([None])}
    ds = tf.data.Dataset.from_generator(lambda : dataset, return_types, return_shapes)
    return ds

In [18]:
tf_train_ds = to_tf_dataset(train_ds)
tf_valid_ds = to_tf_dataset(valid_ds)

In [19]:
def create_dataset(dataset, cache_path=None, batch_size=4, 
                   buffer_size= 1000, shuffling=True):    
    if cache_path is not None:
        dataset = dataset.cache(cache_path)        
    if shuffling:
        dataset = dataset.shuffle(buffer_size)
    dataset = dataset.padded_batch(batch_size)
#     dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

In [20]:
tf_train_ds= create_dataset(tf_train_ds, batch_size=batch_size, 
                         shuffling=True, cache_path = None)
tf_valid_ds = create_dataset(tf_valid_ds, batch_size=batch_size, 
                         shuffling=False, cache_path = None)

#### Callbacks and checkpoints

In [21]:
start_profile_batch = steps+10
stop_profile_batch = start_profile_batch + 100
profile_range = f"{start_profile_batch},{stop_profile_batch}"

log_path = log_dir + "/" + datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=1,
                                                     update_freq=20,profile_batch=profile_range)

checkpoint_filepath = save_path + "/" + "T5-{epoch:04d}-{val_loss:.4f}.ckpt"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

callbacks = [tensorboard_callback, model_checkpoint_callback] 


#### Compile and run model

In [22]:
# learning_rate = CustomSchedule()
# learning_rate = 0.001  # Instead set a static learning rate
optimizer = tf.keras.optimizers.Adam()#learning_rate)

In [23]:
model.compile(optimizer=optimizer)
model.summary()

Model: "t5for_drop"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
shared (TFSharedEmbeddings)  multiple                  16449536  
_________________________________________________________________
encoder (TFT5MainLayer)      multiple                  18881280  
_________________________________________________________________
decoder (TFT5MainLayer)      multiple                  25175808  
Total params: 60,506,630
Trainable params: 60,506,624
Non-trainable params: 6
_________________________________________________________________


In [24]:
# %tensorboard --logdir ./data/experiments/t5/logs

In [25]:
tokenizer.decode(range(100))

'<pad> </s> <unk> X.,s thea: and to of fille int- is de for’i that youd I withn on\'o are iten be The as yourl ( or have at from an was thiser lamring can! will by? notre) wey und has all die but our their A more un dercuin so they one about myul whichà In/hef le out also des It up " timeăif'

In [26]:
model.fit(tf_train_ds, epochs=3, steps_per_epoch=steps, callbacks=callbacks, 
          validation_data=tf_valid_ds, validation_steps=valid_steps)

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.




INFO:tensorflow:Assets written to: ./data/experiments/t5/models/T5-0001-3.2598.ckpt/assets


INFO:tensorflow:Assets written to: ./data/experiments/t5/models/T5-0001-3.2598.ckpt/assets


Epoch 2/3






<tensorflow.python.keras.callbacks.History at 0x7fbf26dc6fd0>

In [27]:
model.save_pretrained(save_path)

In [28]:
model.load_weights('./data/experiments/t5/models/tf_model.h5')

In [29]:
def generate_answer(question,passage,model,tokenizer):

    input_text = f"question: {question} context: {passage}"

    input_ids = tokenizer.encode(input_text,return_tensors="tf")  
    outputs = model.generate(input_ids)
    tokenizer.decode(outputs[0])

    return tokenizer.decode(outputs[0])


def predict(df):
    df['pred_answer'] = df.apply(lambda row: generate_answer(row['question'],row['passage'],model,tokenizer),axis=1)
    df['pred_answer'] = df['pred_answer'].str.replace('<pad> ','')
    df['pred_answer'] = df['pred_answer'].str.replace('</s>','')
    return df


def evaluate(df):
    EM = []
    F1 = []
    for predicted,gold in zip(df['pred_answer'],df['answer']):

        metrics = drop_eval.get_metrics(predicted=predicted,gold=gold)

        EM.append(metrics[0])
        F1.append(metrics[1])

    df['EM'] = EM
    df['F1'] = F1
    
    print('Exact Match: {:0.4f}, F1: {:0.4f}'.format(df.EM.mean(),df.F1.mean()))
    return df

In [30]:
train_df = train_ds.to_pandas()
valid_df = valid_ds.to_pandas()
train_df.head()

Unnamed: 0,section_id,query_id,passage,question,answers_spans,input_ids,attention_mask,labels,decoder_attention_mask
0,nfl_2201,f16c0ee7-f131-4a8b-a6ac-4d275ea68066,"To start the season, the Lions traveled south ...",How many points did the buccaneers need to tie...,"{'spans': ['3'], 'types': ['number']}","[1525, 834, 526, 10, 571, 186, 979, 410, 8, 80...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[220, 1]","[1, 1]"
1,nfl_2201,c9582e03-b01b-42ed-83e0-b90a5334aefa,"To start the season, the Lions traveled south ...",How many field goals did the Lions score?,"{'spans': ['2'], 'types': ['number']}","[1525, 834, 526, 10, 571, 186, 1057, 1766, 410...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[204, 1]","[1, 1]"
2,nfl_2201,f703d43d-73fa-4fda-8913-d81bd5569700,"To start the season, the Lions traveled south ...",How long was the Lion's longest field goal?,"{'spans': ['28-yard'], 'types': ['span']}","[1525, 834, 526, 10, 571, 307, 47, 8, 10371, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[2059, 18, 6636, 1]","[1, 1, 1, 1]"
3,nfl_2201,2fd4f473-af2b-44ce-929a-20c82fa6be2c,"To start the season, the Lions traveled south ...",Who caught the touchdown for the fewest yard?,"{'spans': ['Mike Williams'], 'types': ['span']}","[1525, 834, 526, 10, 2645, 4682, 8, 19396, 21,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[4794, 6060, 1]","[1, 1, 1]"


#### Predict

In [31]:
train_df = predict(train_df)
valid_df = predict(valid_df)

#### Evaluate

In [32]:
train_df = evaluate(train_df)
valid_df = evaluate(valid_df)

KeyError: ignored