# Using T5 on DROP

In [1]:
!pip install --quiet transformers
!pip install --quiet sentencepiece
!pip install --quiet allennlp_models
!pip install --quiet allennlp



In [2]:
!nvidia-smi

Mon Jul  5 20:14:39 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 461.09       Driver Version: 461.09       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 1080   WDDM  | 00000000:01:00.0  On |                  N/A |
| 27%   35C    P8    12W / 180W |   1732MiB /  8192MiB |     12%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Import model and tokenizer

In [3]:
from transformers import T5TokenizerFast, T5ForConditionalGeneration

tokenizer = T5TokenizerFast.from_pretrained('t5-small')
model =T5ForConditionalGeneration.from_pretrained('t5-small',return_dict=True)

### function to generate answer

In [4]:
def generate_answer(question,passage,model,tokenizer):
    model.eval()
    input_text = f"question: {question} context: {passage} </s>"

    input_ids = tokenizer.encode(input_text,return_tensors="pt")  
    outputs = model.generate(input_ids)
    tokenizer.decode(outputs[0])

    return tokenizer.decode(outputs[0])

### test some questions

In [5]:
passage = "To start the season, the Lions traveled south to Tampa, Florida to take on the Tampa Bay Buccaneers. The Lions scored first in the first quarter with a 23-yard field goal by Jason Hanson. The Buccaneers tied it up with a 38-yard field goal by Connor Barth, then took the lead when Aqib Talib intercepted a pass from Matthew Stafford and ran it in 28 yards. The Lions responded with a 28-yard field goal. In the second quarter, Detroit took the lead with a 36-yard touchdown catch by Calvin Johnson, and later added more points when Tony Scheffler caught an 11-yard TD pass. Tampa Bay responded with a 31-yard field goal just before halftime. The second half was relatively quiet, with each team only scoring one touchdown. First, Detroit's Calvin Johnson caught a 1-yard pass in the third quarter. The game's final points came when Mike Williams of Tampa Bay caught a 5-yard pass.  The Lions won their regular season opener for the first time since 2007"
question = 'How many points did the buccaneers need to tie in the first?'
answer = '3'

input_text = f"question: {question} context: {passage} </s>"

input_ids = tokenizer.encode(input_text,return_tensors="pt")  
outputs = model.generate(input_ids)
tokenizer.decode(outputs[0])

'<pad> 38-yard field goal</s>'

In [6]:
question = 'How many field goals did the Lions score?'
answer = '2'

generate_answer(question,passage,model,tokenizer)


'<pad> 23</s>'

In [7]:
question = "How long was the Lion's longest field goal?"
answer= '28-yard'

generate_answer(question,passage,model,tokenizer)

'<pad> 28</s>'

# Download allennlp drop_eval module

https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py

In [8]:
!python -m wget https://raw.githubusercontent.com/allenai/allennlp-reading-comprehension/master/allennlp_rc/eval/drop_eval.py


Saved under drop_eval (1).py


In [9]:
import drop_eval
import pandas as pd
import json

# Download the dataset

In [10]:
!python -m wget https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip -o drop_dataset.zip
!unzip drop_dataset.zip



Saved under drop_dataset (1).zip


'unzip' is not recognized as an internal or external command,
operable program or batch file.


# Ingest the dataset

In [11]:
with open('drop_dataset/drop_dataset_train.json', 'rb') as f:
    drop_train_dict = json.load(f)
with open('drop_dataset/drop_dataset_dev.json', 'rb') as f:
    drop_dev_dict = json.load(f)

In [12]:
list(drop_train_dict.keys())[:5]

['nfl_2201', 'nfl_478', 'history_690', 'history_2184', 'history_1328']

In [13]:
drop_train_dict['history_269'].keys()

dict_keys(['passage', 'qa_pairs', 'wiki_url'])

In [14]:
drop_train_dict['history_269']['qa_pairs'][0]

{'question': 'How many years span these events?',
 'answer': {'number': '1',
  'date': {'day': '', 'month': '', 'year': ''},
  'spans': []},
 'query_id': '551aab6a-aee5-4df7-b270-99aa39632158'}

In [15]:
len(drop_train_dict['history_269']['qa_pairs'])

12

In [16]:
def import_data(data_dict):
    df = pd.DataFrame()

    for section in data_dict.keys():
        count = 1
        for qaPair in data_dict[section]['qa_pairs']:

            answerWithType = drop_eval.answer_json_to_strings(qaPair['answer'])
            answer = answerWithType[0]
            answer_type = answerWithType[1]

            query_id = qaPair['query_id']
            question = qaPair['question']

            query_name = section + '-Q'+ str(count)
            passage = data_dict[section]['passage']
            url = data_dict[section]['wiki_url']

            count +=1
            temp_df = pd.DataFrame([query_name,query_id,passage,question,answer,answer_type,url]).T.rename(columns={0:'query_name',1:'query_id',2:'passage',3:'question',4:'answer',5:'answer_type',6:'wiki_url'})
            df = pd.concat([df,temp_df])

    print('{:,} records'.format(len(df)))
    return df

In [None]:
print('importing training records...')
drop_train = import_data(drop_train_dict)
print('importing test records...')
drop_dev = import_data(drop_dev_dict)

importing training records...


### Save to pickle

In [None]:
drop_train.to_pickle('drop_train.pkl')
drop_dev.to_pickle('drop_dev.pkl')

#### Read from pickle

### Create a smaller dataframe to work with for now

In [None]:
sample_df = drop_train.head(100).copy()
sample_df = sample_df.reset_index(drop=True)
sample_df.head(2)

### Generate data for sample 

In [None]:
def predict(df):
    df['pred_answer'] = df.apply(lambda row: generate_answer(row['question'],row['passage'],model,tokenizer),axis=1)
    df['pred_answer'] = df['pred_answer'].str.replace('<pad> ','')
    df['pred_answer'] = df['pred_answer'].str.replace('</s>','')
    return df

In [None]:
sample_df = predict(sample_df)
sample_df.head()

In [None]:
drop_train_dict['history_269']['qa_pairs'][0]

In [None]:
def evaluate(df):
    EM = []
    F1 = []
    for predicted,gold in zip(df['pred_answer'],df['answer']):

        metrics = drop_eval.get_metrics(predicted=predicted,gold=gold)

        EM.append(metrics[0])
        F1.append(metrics[1])

    df['EM'] = EM
    df['F1'] = F1
    
    print('Exact Match: {:0.4f}, F1: {:0.4f}'.format(df.EM.mean(),df.F1.mean()))
    return df

In [None]:
sample_df = evaluate(sample_df)
sample_df.head()

# Predict on full training set

In [None]:
drop_train = predict(drop_train)
drop_train = evaluate(drop_train)
drop_train.head()

# Predict on full dev set

In [None]:
drop_dev = predict(drop_dev)
drop_dev = evaluate(drop_dev)
drop_dev.head()