# Tapas

In [1]:
import torch
import pandas as pd
import numpy as np
import pickle, logging, spacy, sys, os, json, requests
import matplotlib.pyplot as plt

from helpers.classes import Collection
from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
from helpers.cloze_generation import generate_clozes_from_point, named_entity_answer_generator as ne_answer_generator, noun_phrase_answer_generator as np_answer_generator

from helpers.table_processing import preprocess_table

from helpers.language_modelling import run_language_model, summarise_results

df = pd.read_pickle('pickles/dataset_20210625_184837.pkl')
clozes_df = pd.read_json('pickles/clozes_20210715_212425.json')

In [3]:
from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasForMaskedLM, TapasConfig
import pandas as pd

In [4]:
def replace_mask(
    sentence, 
    masks = ['IDENTITYMASK', 'NOUNPHRASEMASK', 'NUMERICMASK', 
        'PLACEMASK', 'TEMPORALMASK', 'THINGMASK']):

    # somewhat hacky
    # checks if sentence contains any of the masks
    # and replaces it with the appropriate tokenizer.mask_token
    x = [sentence.replace(x, tokenizer.mask_token) \
        for x in masks if x in sentence]
    if len(x):
        return x[0]


def find_nth_substring(sentence, substring, n):
    '''
    used internally in multitoken_prediction to find the n-th occurence of a specific substring in a sentence
    returns the starting index of substring
    '''
    start = sentence.find(substring)
    while start >= 0 and n > 1:
        start = sentence.find(substring, start+len(substring))
        n -= 1
    return start

In [5]:
# model_name = 'google/tapas-base-finetuned-wtq'
# tokenizer = TapasTokenizer.from_pretrained(model_name)
# model = TapasForMaskedLM.from_pretrained(model_name)

dir_name = '/Users/pafitis/dev/comp0087/thesis/models/tapas_wtq_wikisql_sqa_inter_masklm_large_reset_pt'
# dir_name = '/Users/pafitis/dev/comp0087/thesis/models/tapas_inter_masklm_base_reset_pt'

config = TapasConfig.from_pretrained(
    f'{dir_name}',from_pt=True)
model = TapasForMaskedLM.from_pretrained(
    f'{dir_name}', config=config)
tokenizer=TapasTokenizer.from_pretrained(
    f'{dir_name}', from_pt=True)
# model = TapasForMaskedLM.from_pretrained(
#     f'{dir_name}/pytorch_model.bin', config=config)
# tokenizer=TapasTokenizer.from_pretrained(
#     f'{dir_name}/tokenizer_config.json', from_pt=True)


# model = TapasForQuestionAnswering.from_pretrained(model_name)

In [6]:
model

TapasForMaskedLM(
  (tapas): TapasModel(
    (embeddings): TapasEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(1024, 1024)
      (token_type_embeddings_0): Embedding(3, 1024)
      (token_type_embeddings_1): Embedding(256, 1024)
      (token_type_embeddings_2): Embedding(256, 1024)
      (token_type_embeddings_3): Embedding(2, 1024)
      (token_type_embeddings_4): Embedding(256, 1024)
      (token_type_embeddings_5): Embedding(256, 1024)
      (token_type_embeddings_6): Embedding(10, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): TapasEncoder(
      (layer): ModuleList(
        (0): TapasLayer(
          (attention): TapasAttention(
            (self): TapasSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=Tr

In [7]:
row = df.iloc[0]
point = row.point
datasets = row.data

In [8]:
clozes = [c for c in generate_clozes_from_point(point, ne_answer_generator)]
# queries = [replace_mask(c.cloze_text) for c in clozes]

# queries = [
#     'What was the approximate gross value added of the UK non-financial business economy in 2019?',
#     'What was the approximate gross value added of the UK non-financial business economy in 2018?',
#     'Was the approximate gross value added of the UK non-financial business economy higher in 2019 than 2018?',
#     'Which year had the highest total turnover?',
#     'Which year had the lowest total turnover?']

queries = [
    f'The approximate gross value added of the UK non-financial business in 2019 was {tokenizer.mask_token}',
    f'The year with the highest total turnover was {tokenizer.mask_token}'
    ]

data = datasets[2].replace('/', '_')[1:]
excel_file = pd.ExcelFile('datasets/' + data + '.xls')

In [9]:
excel_file.sheet_names

['Contents',
 'Non-Financial Business Economy',
 'Section-Division by Region',
 'Region by Section-Division']

In [10]:
data = excel_file.parse('Non-Financial Business Economy')

In [11]:
table = preprocess_table(data).astype(str)
table = table[175:181].reset_index(drop = True)

In [12]:
table

Unnamed: 0,Standard Industrial Classification (Revised 2007) Section,-,Country and Region,Year,Total turnover,Approximate gross value added at basic prices (aGVA),"Total purchases of goods, materials and services",Total employment costs
0,A-S (Part) 2,UK non-financial business economy,United Kingdom,2014,3445024,1087175,2337331,564856
1,A-S (Part) 2,UK non-financial business economy,United Kingdom,2015,3365823,1142070,2200721,591906
2,A-S (Part) 2,UK non-financial business economy,United Kingdom,2016,3525775,1161990,2338100,616451
3,A-S (Part) 2,UK non-financial business economy,United Kingdom,2017,3823162,1218978,2582797,642618
4,A-S (Part) 2,UK non-financial business economy,United Kingdom,2018,4029100,1269185,2735518,661686
5,A-S (Part) 2,UK non-financial business economy,United Kingdom,2019,4099272,1311079,2761846,697774


In [16]:
inputs = tokenizer(
    table=table, queries=queries[0],
    padding='max_length', return_tensors='pt',
    truncation=True)

outputs = model(**inputs)

In [28]:
sentence = queries[0]
verbose = True
_iter = 0
sequence_confidence = 0
sequence_confidences = []

answer_given = []

top_k_vocab = 1
# find where masks are located
is_masked = torch.where(
    inputs.input_ids == tokenizer.mask_token_id, 1, 0
)
masked_idxs = torch.nonzero(is_masked)

# convert to probabilities
probabilities = torch.softmax(
    outputs.logits[0, masked_idxs[:, 1]], dim = 1)
logprobs = torch.log(probabilities)

# obtain k most confident token predictions, work on logprobs to avoid underflow
mask_confidence, token_ids = torch.topk(logprobs, top_k_vocab)
# mask_confidence, token_ids = torch.topk(logprobs, top_k_vocab)

# selects the mask index that correspond to the most confident prediction; I am slicing [0] because of top_k_vocab will return the k most confident possible tokens. ultimately top_k_vocab is not used, but I am keeping it here for future work

most_confident = mask_confidence.argmax(dim = 0)[0].item()
target_token_idx = token_ids[most_confident][0]
target_token = tokenizer.decode(target_token_idx)

# confidence as a proxy of probability
token_confidence = mask_confidence[most_confident][0].item()

# add logprobabilities to obtain sequence probability
sequence_confidence += token_confidence
sequence_confidences.append(token_confidence)

# find start and end index of <mask> to be removed
starting_pos = find_nth_substring(
    sentence, tokenizer.mask_token, most_confident)
ending_pos = starting_pos + len(tokenizer.mask_token)

# construct new version of sentence
# replace mask by predicted token
sentence = sentence[:starting_pos] + \
    target_token + sentence[ending_pos:]

# answer_given = [ (token, position), ... ]
answer_given.append((target_token, starting_pos))

if verbose:
    print(f'Iteration: {_iter}, Predicted Token: {target_token}, Iteration Confidence: {token_confidence}, Confidence (sequence): {sequence_confidence}')
    print(f'Sentence: {sentence}')

Iteration: 0, Predicted Token: [ C L S ], Iteration Confidence: -6.774085521697998, Confidence (sequence): -6.774085521697998
Sentence: The approximate gross value added of the UK non-financial business in 2019 was [ C L S ]


In [27]:
from transformers import TapasTokenizer, TapasForMaskedLM
import pandas as pd

# tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
# model = TapasForMaskedLM.from_pretrained('google/tapas-base')

dir_name = 'google/tapas-large'
# dir_name = '/Users/pafitis/dev/comp0087/thesis/models/tapas_wtq_wikisql_sqa_inter_masklm_large_reset_pt'
# dir_name = '/Users/pafitis/dev/comp0087/thesis/models/tapas_inter_masklm_base_reset_pt'

config = TapasConfig.from_pretrained(
    f'{dir_name}',from_pt=True)
model = TapasForMaskedLM.from_pretrained(
    f'{dir_name}', config=config)
tokenizer=TapasTokenizer.from_pretrained(
    f'{dir_name}', from_pt=True)

data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
        'Age': ["56", "45", "59"],
        'Number of movies': ["87", "53", "69"]
}
table = pd.DataFrame.from_dict(data)

inputs = tokenizer(table=table, 
        queries="How many [MASK] has George Clooney played in?", return_tensors="pt")
labels = tokenizer(table=table, 
        queries="How many movies has George Clooney played in?", return_tensors="pt")["input_ids"]

outputs = model(**inputs, labels=labels)
last_hidden_states = outputs.last_hidden_state

Downloading: 100%|██████████| 1.43k/1.43k [00:00<00:00, 437kB/s]
Downloading: 100%|██████████| 1.35G/1.35G [07:29<00:00, 3.00MB/s]
Some weights of the model checkpoint at google/tapas-large were not used when initializing TapasForMaskedLM: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing TapasForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TapasForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TapasForMaskedLM were not initialized from the model checkpoint at google/tapas-large and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for 

AttributeError: 'MaskedLMOutput' object has no attribute 'last_hidden_state'

In [30]:
from transformers import AutoTokenizer, AutoModel
  
tokenizer = AutoTokenizer.from_pretrained("google/tapas-large")
model = AutoModel.from_pretrained("google/tapas-large")

data = {
    'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
    'Age': ["56", "45", "59"],
    'Number of movies': ["87", "53", "69"]
    }

queries = [
    "George Clooney played in [MASK] movies?", 
    "Brad Pitt is [MASK] old?"
    ]