# Tapas

In [1]:
import pandas as pd
import numpy as np
import pickle, logging, spacy, sys, os, json, requests
import matplotlib.pyplot as plt

from helpers.classes import Collection
from tqdm import tqdm
from bs4 import BeautifulSoup
from datetime import datetime

In [2]:
from helpers.cloze_generation import generate_clozes_from_point, named_entity_answer_generator as ne_answer_generator, noun_phrase_answer_generator as np_answer_generator

from helpers.table_processing import preprocess_table

from helpers.language_modelling import run_language_model, summarise_results

df = pd.read_pickle('pickles/dataset_20210625_184837.pkl')
clozes_df = pd.read_json('pickles/clozes_20210715_212425.json')

In [3]:
from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasForMaskedLM
import pandas as pd

In [4]:
def replace_mask(
    sentence, 
    masks = ['IDENTITYMASK', 'NOUNPHRASEMASK', 'NUMERICMASK', 
        'PLACEMASK', 'TEMPORALMASK', 'THINGMASK']):

    # somewhat hacky
    # checks if sentence contains any of the masks
    # and replaces it with the appropriate tokenizer.mask_token
    x = [sentence.replace(x, tokenizer.mask_token) \
        for x in masks if x in sentence]
    if len(x):
        return x[0]

In [5]:
model_name = 'google/tapas-base-finetuned-wtq'
# model = TapasForQuestionAnswering.from_pretrained(model_name)
tokenizer = TapasTokenizer.from_pretrained(model_name)
model = TapasForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at google/tapas-base-finetuned-wtq were not used when initializing TapasForMaskedLM: ['output_weights', 'column_output_weights', 'output_bias', 'column_output_bias', 'aggregation_classifier.weight', 'aggregation_classifier.bias', 'tapas.pooler.dense.weight', 'tapas.pooler.dense.bias']
- This IS expected if you are initializing TapasForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TapasForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TapasForMaskedLM were not initialized from the model checkpoint at google/tapas-base-finetuned-wtq and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this mo

In [6]:
row = df.iloc[0]
point = row.point
datasets = row.data

In [7]:
clozes = [c for c in generate_clozes_from_point(point, ne_answer_generator)]
queries = [replace_mask(c.cloze_text) for c in clozes]
data = datasets[2].replace('/', '_')[1:]
excel_file = pd.ExcelFile('datasets/' + data + '.xls')

In [8]:
excel_file.sheet_names

['Contents',
 'Non-Financial Business Economy',
 'Section-Division by Region',
 'Region by Section-Division']

In [9]:
data = excel_file.parse('Non-Financial Business Economy')

In [10]:
table = preprocess_table(data).astype(str)

In [16]:
inputs = tokenizer(
    table=table, queries=queries, padding='max_length', return_tensors='pt',
    truncation=True)
outputs = model(**inputs)

In [17]:
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs,
    outputs.logits.detach(),
    outputs.logits_aggregation.detach())

AttributeError: 'MaskedLMOutput' object has no attribute 'logits_aggregation'

In [18]:
outputs.logits

tensor([[[ 0.6350,  1.5058,  1.0321,  ...,  0.6752,  0.7180,  0.2368],
         [ 0.6961,  1.5810,  0.8884,  ...,  0.9269,  0.7713,  0.2906],
         [ 0.6243,  1.5662,  0.7142,  ...,  0.7459,  0.6767,  0.0153],
         ...,
         [ 0.4787,  1.3786,  0.7702,  ...,  0.3810,  0.2486, -0.3435],
         [ 0.4895,  1.2159,  0.4182,  ...,  0.6613,  0.3502, -0.1579],
         [ 0.4811,  1.3290,  0.5368,  ...,  0.3885,  0.2259, -0.3842]],

        [[ 0.3978,  1.0193,  0.2453,  ...,  0.0290,  0.2707, -0.7794],
         [ 0.7116,  1.4247,  0.5465,  ...,  0.4968,  0.3723, -0.3773],
         [ 0.5248,  0.9254,  0.3521,  ..., -0.2599, -0.0825, -1.1861],
         ...,
         [ 0.5370,  1.3793,  0.8873,  ...,  0.0156,  0.2127, -0.4465],
         [ 0.4518,  1.0105,  0.2540,  ...,  0.1124,  0.1644, -0.5547],
         [ 0.6188,  1.3363,  0.6226,  ...,  0.0611,  0.1498, -0.3889]],

        [[ 0.6078,  1.3986,  0.9694,  ...,  0.6749,  0.6821,  0.0290],
         [ 0.6308,  1.4682,  0.8810,  ...,  0