In [1]:
from transformers import T5ForConditionalGeneration

In [13]:
from transformers import T5TokenizerFast

In [25]:
# Load the fast T5 tokenizer registered under the name 't5-small'.
# It is used further down to decode generated ids, target ids and input contexts back to text.
tokenizer = T5TokenizerFast.from_pretrained('t5-small')

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

In [2]:
# Location of the saved T5 checkpoint.
# The original cell hard-coded an absolute path inside one user's home directory,
# which only resolves on that machine. The path can now be overridden through the
# QG_MODEL_DIR environment variable; when the variable is unset the original
# location is used, so existing behaviour is unchanged.
import os

MODEL_DIR = os.environ.get('QG_MODEL_DIR', '/Users/micheleiannello/model/')
model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR)

In [4]:
from util.model import read_npz
# Read the 'val' split for the 'QG' task from ./data/tokenized-qg/ and unpack
# the four values the helper returns (ids, contexts, attention masks, questions).
(
    val_ids,
    val_contexts,
    val_attention_masks,
    val_questions,
) = read_npz(
    path='./data/tokenized-qg/',
    split='val',
    task='QG',
)

In [5]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import torch

In [6]:
class Dataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that pairs model inputs with target questions.

    The four constructor arguments are parallel, index-aligned sequences:
    position ``i`` in each of them describes the same sample.
    """

    def __init__(self, ids, contexts, attention_masks, questions):
        """Store references to the four parallel sequences (nothing is copied)."""
        self.ids, self.contexts = ids, contexts
        self.attention_masks, self.questions = attention_masks, questions

    def __len__(self):
        """Return the number of samples, measured as the number of contexts."""
        return len(self.contexts)

    def __getitem__(self, index):
        """Return one sample as ``((id, context, attention_mask), question)``.

        The id is passed through untouched; the context, attention mask and
        question are each converted to a ``torch.int32`` tensor.
        """
        def as_int32(values):
            # Every token sequence is materialised as an int32 tensor.
            return torch.tensor(values, dtype=torch.int32)

        model_input = (
            self.ids[index],
            as_int32(self.contexts[index]),
            as_int32(self.attention_masks[index]),
        )
        return model_input, as_int32(self.questions[index])

In [51]:
# Build a tiny dataset from samples 20..23 of every validation sequence.
# The same slice is applied to all four sequences so they stay index-aligned.
val_dataset = Dataset(
    *(
        sequence[20:24]
        for sequence in (val_ids, val_contexts, val_attention_masks, val_questions)
    )
)

In [52]:
# Keyword arguments forwarded unchanged to the DataLoader below.
val_params = dict(
    batch_size=4,     # number of samples per batch
    shuffle=False,    # keep the samples in dataset order
    num_workers=0,    # no worker subprocesses
    pin_memory=True,
)
val_loader = torch.utils.data.DataLoader(val_dataset, **val_params)

In [53]:
# Device that the batch tensors are moved to (via `.to(device, ...)`) in the generation loop.
device = 'cpu'

In [54]:
# Generate a question for every sample served by `val_loader` and decode both
# the generated ids and the reference question ids back to text.
#
# The result lists are created once, before the loop, and extended per batch.
# Previously `preds` and `target` were rebound inside the loop, so with more
# than one batch only the last batch's results survived.
preds, target = [], []
for X, Y in val_loader:
    # Each X is the (ID, context, attention_mask) triple produced by the
    # dataset; the IDs are not needed for generation.
    _, context, attention_mask = X

    # Move the batch to the target device and cast to torch.long.
    question = Y.to(device, dtype=torch.long)
    context = context.to(device, dtype=torch.long)
    attention_mask = attention_mask.to(device, dtype=torch.long)

    # Beam-search generation with the settings chosen for this experiment.
    generated_ids = model.generate(
        input_ids=context,
        attention_mask=attention_mask,
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )

    # Decode to plain text, dropping special tokens such as padding.
    preds.extend(
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for g in generated_ids
    )
    target.extend(
        tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for t in question
    )
            

In [55]:
# Display the decoded questions produced by the generation loop.
preds

['is a part of the First Year of Studies program?',
 'is a part of the First Year of Studies program?',
 'is a part of the First Year of Studies program?',
 'is a part of the First Year of Studies program?']

In [56]:
# Display the decoded reference questions for comparison with the predictions.
target

['What entity provides help with the management of time for new students at Notre Dame?',
 'How many colleges for undergraduates are at Notre Dame?',
 'What was created at Notre Dame in 1962 to assist first year students?',
 'Which organization declared the First Year of Studies program at Notre Dame "outstanding?"']

In [57]:
# Decode the input contexts; special tokens are kept here, so end-of-sequence and padding markers show up.
# NOTE: `context` is the variable left behind by the last iteration of the generation loop.
[tokenizer.decode(c) for c in context]

["All of Notre Dame's undergraduate students are a part of one of the five undergraduate colleges at the school or are in the First Year of Studies program. The First Year of Studies program was established in 1962 to guide incoming freshmen in their first year at the school before they have declared a major. Each student is given an academic advisor from the program who helps them to choose classes that give them exposure to any major in which they are interested. The program also includes a Learning Resource Center which provides time management, collaborative learning, and subject tutoring. This program has been recognized previously, by U.S. News & World Report, as outstanding.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [1]:
from util.preprocessing import read_from_json

In [78]:
# Load the dataset through the project helper; the cells below use the result as a pandas DataFrame.
df = read_from_json()

In [79]:
# Show every row whose question text equals the question stored at position 409 (positional lookup via .iloc).
df[df['question'] == df['question'].iloc[409]]

Unnamed: 0,id,title,context,question,answer_start,answer_end,answer_text
288,56d43ce42ccc5a1400d830b6,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyoncé release her first solo album?,526,530,2003
409,56d4b9702ccc5a1400d83173,Beyoncé,Beyoncé's first solo recording was a feature o...,When did Beyoncé release her first solo album?,229,242,"June 24, 2003"


In [23]:
import pandas as pd

In [80]:
# For every question text that occurs more than once, collect the frame of all
# rows sharing that text.
#
# `df.duplicated(['question'])` flags the 2nd, 3rd, ... occurrence of a question,
# so a question asked k times is flagged k-1 times. Iterating over the flagged
# rows directly therefore appended the same group k-1 times (and rescanned the
# whole frame each time). Reducing the flagged questions to their distinct
# values first yields exactly one group per repeated question, in the same
# order of first appearance among the flagged rows.
repeated_questions = df.loc[df.duplicated(['question']), 'question'].unique()
duplicated = [df[df['question'] == question] for question in repeated_questions]

In [81]:
# Display the collected groups (a list with one DataFrame per entry).
duplicated

[                           id    title  ... answer_end    answer_text
 288  56d43ce42ccc5a1400d830b6  Beyoncé  ...        530           2003
 409  56d4b9702ccc5a1400d83173  Beyoncé  ...        242  June 24, 2003
 
 [2 rows x 7 columns],
                            id    title  ... answer_end      answer_text
 525  56be9bb83aeaaa14008c915c  Beyoncé  ...         18  January 7, 2012
 529  56bfa087a10cfb14005511d9  Beyoncé  ...         18  January 7, 2012
 
 [2 rows x 7 columns],
                             id  ...                      answer_text
 1027  5733bd9bd058e614000b6199  ...  Spanish word montaña (mountain)
 1031  5733f0e34776f41900661573  ...    from the Spanish word montaña
 
 [2 rows x 7 columns],
                             id            title  ... answer_end answer_text
 1646  56cf66734df3c31400b0d714  Frédéric_Chopin  ...        333        1830
 1650  56d3123b59d6e41400146205  Frédéric_Chopin  ...        333        1830
 
 [2 rows x 7 columns],
                           

In [87]:
# Keep only the groups in which some context appears more than once, i.e. the
# group has more rows than distinct contexts.
dupl = [
    group
    for group in duplicated
    if len(group) > len(group['context'].unique())
]

In [118]:
# Display the groups that passed the repeated-context filter.
# NOTE(review): this cell's execution count (118) is far later than that of the cell building `dupl` (87),
# so the saved output may reflect later kernel state - confirm by re-running from a fresh kernel.
dupl

Unnamed: 0,id,title,context,question,answer_start,answer_end,answer_text
1791,56cfdb3e234ae51400d9bf7f,Frédéric_Chopin,Although the two displayed great respect and a...,What was the name of Liszt's mistress?,1168,1182,Marie d'Agoult
1794,56d3227259d6e41400146280,Frédéric_Chopin,Although the two displayed great respect and a...,What was the name of Liszt's mistress?,1168,1182,Marie d'Agoult


In [96]:
# Of the filtered groups, keep those containing more than two rows.
dd = [group for group in dupl if len(group) > 2]

In [103]:
# Collect every row whose answer text already occurred in an earlier row.
#
# The original loop re-scanned the whole frame for each flagged row to check
# that more than one row carried the same answer. For a flagged row with an
# actual value that check always passes (the row itself plus the earlier
# occurrence), and for a missing answer it never passes (an equality comparison
# against a missing value matches nothing). The check therefore reduces to
# "answer is not missing", which is expressed here as a vectorised mask instead
# of one full-frame comparison per row.
repeat_mask = df.duplicated(['answer_text']) & df['answer_text'].notna()
duplicated_rows = [row for _, row in df[repeat_mask].iterrows()]