# Tapas

In [1]:
import torch
import pandas as pd
import numpy as np
import pickle, logging, spacy, sys, os, json, requests
import matplotlib.pyplot as plt

from helpers.classes import Collection
from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
from helpers.cloze_generation import generate_clozes_from_point, named_entity_answer_generator as ne_answer_generator, noun_phrase_answer_generator as np_answer_generator

from helpers.table_processing import preprocess_table

from helpers.language_modelling import run_language_model, summarise_results

df = pd.read_pickle('pickles/dataset_20210625_184837.pkl')
clozes_df = pd.read_json('pickles/clozes_20210715_212425.json')

In [3]:
from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasForMaskedLM
import pandas as pd

In [4]:
def replace_mask(
    sentence, 
    masks = ['IDENTITYMASK', 'NOUNPHRASEMASK', 'NUMERICMASK', 
        'PLACEMASK', 'TEMPORALMASK', 'THINGMASK']):

    # somewhat hacky
    # checks if sentence contains any of the masks
    # and replaces it with the appropriate tokenizer.mask_token
    x = [sentence.replace(x, tokenizer.mask_token) \
        for x in masks if x in sentence]
    if len(x):
        return x[0]

In [5]:
model_name = 'google/tapas-base-finetuned-wtq'
# model = TapasForQuestionAnswering.from_pretrained(model_name)
tokenizer = TapasTokenizer.from_pretrained(model_name)
model = TapasForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at google/tapas-base-finetuned-wtq were not used when initializing TapasForMaskedLM: ['output_weights', 'column_output_weights', 'output_bias', 'column_output_bias', 'aggregation_classifier.weight', 'aggregation_classifier.bias', 'tapas.pooler.dense.weight', 'tapas.pooler.dense.bias']
- This IS expected if you are initializing TapasForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TapasForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TapasForMaskedLM were not initialized from the model checkpoint at google/tapas-base-finetuned-wtq and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this mo

In [6]:
row = df.iloc[0]
point = row.point
datasets = row.data

In [7]:
clozes = [c for c in generate_clozes_from_point(point, ne_answer_generator)]
# queries = [replace_mask(c.cloze_text) for c in clozes]

# queries = [
#     'What was the approximate gross value added of the UK non-financial business economy in 2019?',
#     'What was the approximate gross value added of the UK non-financial business economy in 2018?',
#     'Was the approximate gross value added of the UK non-financial business economy higher in 2019 than 2018?',
#     'Which year had the highest total turnover?',
#     'Which year had the lowest total turnover?']

queries = [
    f'The approximate gross value added of the UK non-financial business in 2019 was {tokenizer.mask_token}',
    f'The year with the highest total turnover was {tokenizer.mask_token}'
    ]

data = datasets[2].replace('/', '_')[1:]
excel_file = pd.ExcelFile('datasets/' + data + '.xls')

In [8]:
excel_file.sheet_names

['Contents',
 'Non-Financial Business Economy',
 'Section-Division by Region',
 'Region by Section-Division']

In [9]:
data = excel_file.parse('Non-Financial Business Economy')

In [10]:
table = preprocess_table(data).astype(str)
table = table[175:181].reset_index(drop = True)

In [11]:
table

Unnamed: 0,Standard Industrial Classification (Revised 2007) Section,-,Country and Region,Year,Total turnover,Approximate gross value added at basic prices (aGVA),"Total purchases of goods, materials and services",Total employment costs
0,A-S (Part) 2,UK non-financial business economy,United Kingdom,2014,3445024,1087175,2337331,564856
1,A-S (Part) 2,UK non-financial business economy,United Kingdom,2015,3365823,1142070,2200721,591906
2,A-S (Part) 2,UK non-financial business economy,United Kingdom,2016,3525775,1161990,2338100,616451
3,A-S (Part) 2,UK non-financial business economy,United Kingdom,2017,3823162,1218978,2582797,642618
4,A-S (Part) 2,UK non-financial business economy,United Kingdom,2018,4029100,1269185,2735518,661686
5,A-S (Part) 2,UK non-financial business economy,United Kingdom,2019,4099272,1311079,2761846,697774


In [12]:
inputs = tokenizer(
    table=table, queries=queries[0], padding='max_length', return_tensors='pt',
    truncation=True)
outputs = model(**inputs)

# predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
#     inputs,
#     outputs.logits.detach(),
#     outputs.logits_aggregation.detach())


# predicted_answer_coordinates = tokenizer.convert_logits_to_predictions(
#     inputs, outputs.logits.detach())

In [14]:
sentence = queries[0]
verbose = True
_iter = 0
sequence_confidence = 0
sequence_confidences = []

answer_given = []

top_k_vocab = 1
# find where masks are located
is_masked = torch.where(
    inputs.input_ids == tokenizer.mask_token_id, 1, 0
)
masked_idxs = torch.nonzero(is_masked)

# convert to probabilities
probabilities = torch.softmax(
    outputs.logits[0, masked_idxs[:, 1]], dim = 1)
logprobs = torch.log(probabilities)

# obtain k most confident token predictions, work on logprobs to avoid underflow
mask_confidence, token_ids = torch.topk(logprobs, top_k_vocab)

# selects the mask index that correspond to the most confident prediction; I am slicing [0] because of top_k_vocab will return the k most confident possible tokens. ultimately top_k_vocab is not used, but I am keeping it here for future work
most_confident = mask_confidence.argmax(dim = 0)[0].item()
target_token_idx = token_ids[most_confident][0]
target_token = tokenizer.decode(target_token_idx)

# confidence as a proxy of probability
token_confidence = mask_confidence[most_confident][0].item()

# add logprobabilities to obtain sequence probability
sequence_confidence += token_confidence
sequence_confidences.append(token_confidence)

# find start and end index of <mask> to be removed
starting_pos = find_nth_substring(
    sentence, tokenizer.mask_token, most_confident)
ending_pos = starting_pos + len(tokenizer.mask_token)

# construct new version of sentence
# replace mask by predicted token
sentence = sentence[:starting_pos] + \
    target_token + sentence[ending_pos:]

# answer_given = [ (token, position), ... ]
answer_given.append((target_token, starting_pos))

if verbose:
    print(f'Iteration: {_iter}, Predicted Token: {target_token}, Iteration Confidence: {token_confidence}, Confidence (sequence): {sequence_confidence}')
    print(f'Sentence: {sentence}')

NameError: name 'torch' is not defined

In [15]:
from transformers import TapasTokenizer, TapasForMaskedLM
import pandas as pd
import torch

tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
model = TapasForMaskedLM.from_pretrained('google/tapas-base')

data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
        'Age': ["56", "45", "59"],
        'Number of movies': ["87", "53", "69"]
}
table = pd.DataFrame.from_dict(data)

inputs = tokenizer(table=table, queries="How many [MASK] has George [MASK] played in?", return_tensors="pt")
labels = tokenizer(table=table, queries="How many movies has George Clooney played in?", return_tensors="pt")["input_ids"]

outputs = model(**inputs, labels=labels)
last_hidden_states = outputs.last_hidden_state

Some weights of the model checkpoint at google/tapas-base were not used when initializing TapasForMaskedLM: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing TapasForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TapasForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TapasForMaskedLM were not initialized from the model checkpoint at google/tapas-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: Expected input batch_size (32) to match target batch_size (34).

In [16]:
outputs = model(**inputs, labels = labels[0][1:-1])
last_hidden_states = outputs.last_hidden_state

AttributeError: 'MaskedLMOutput' object has no attribute 'last_hidden_state'

In [17]:
labels

tensor([[  101,  2129,  2116,  5691,  2038,  2577, 18856,  7828,  3240,  2209,
          1999,  1029,   102,  5889,  2287,  2193,  1997,  5691,  8226, 15091,
          5179,  6584, 14720,  4487,  6178,  9488,  3429,  5187,  2577, 18856,
          7828,  3240,  5354,  6353]])

In [18]:
from transformers import TapasConfig,TapasTokenizer,TapasForMaskedLM
import torch
config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq',from_pt=True)
model = TapasForMaskedLM.from_pretrained('google/tapas-base-finetuned-wtq', config=config)
tokenizer=TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq",from_pt=True)
import sys

# outdir = "tmp"

# model.save_pretrained(outdir)
# tokenizer.save_pretrained(outdir)
# config.save_pretrained(outdir)

from transformers import pipeline

nlp = pipeline(task="fill-mask",framework="pt",model=model, tokenizer=tokenizer)
#nlp = pipeline(task="table-question-answering")

import pandas as pd

data= {    "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
    "age": ["56", "45", "59"],
    "number of movies": ["87", "53", "69"],
    "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"]
}

Some weights of the model checkpoint at google/tapas-base-finetuned-wtq were not used when initializing TapasForMaskedLM: ['output_weights', 'column_output_weights', 'output_bias', 'column_output_bias', 'aggregation_classifier.weight', 'aggregation_classifier.bias', 'tapas.pooler.dense.weight', 'tapas.pooler.dense.bias']
- This IS expected if you are initializing TapasForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TapasForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TapasForMaskedLM were not initialized from the model checkpoint at google/tapas-base-finetuned-wtq and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this mo

In [19]:
import numpy as np
table = pd.DataFrame.from_dict(data)


(3, 4)


In [20]:
queries=[
        f"The number of movies Brad Pitt acted in is {tokenizer.mask_token}",
        f"Leonardo di caprio's age is {tokenizer.mask_token}"]