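# TAPAS table question answering

Runs the `google/tapas-base-finetuned-wtq` question-answering checkpoint over a six-row slice of the "Non-Financial Business Economy" worksheet and prints the predicted answer cell(s) for each query.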

In [1]:
import pickle, logging, spacy, sys, os, json, requests
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from bs4 import BeautifulSoup
from tqdm import tqdm

from helpers.classes import Collection

In [2]:
from helpers.cloze_generation import generate_clozes_from_point, named_entity_answer_generator as ne_answer_generator, noun_phrase_answer_generator as np_answer_generator

from helpers.table_processing import preprocess_table

from helpers.language_modelling import run_language_model, summarise_results

# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle('pickles/dataset_20210625_184837.pkl')
# Load the saved clozes JSON file into a DataFrame.
clozes_df = pd.read_json('pickles/clozes_20210715_212425.json')

In [3]:
from transformers import TapasTokenizer, TapasForQuestionAnswering, TapasForMaskedLM
import pandas as pd

In [4]:
def replace_mask(
    sentence,
    masks = ('IDENTITYMASK', 'NOUNPHRASEMASK', 'NUMERICMASK',
        'PLACEMASK', 'TEMPORALMASK', 'THINGMASK'),
    mask_token = None):
    """Replace a cloze placeholder in ``sentence`` with the model's mask token.

    The placeholders in ``masks`` are tried in order. For the first one that
    occurs in ``sentence``, every occurrence of that placeholder is replaced
    and the result is returned; any other placeholder types are left as-is.

    Args:
        sentence: Text that may contain one of the placeholder strings.
        masks: Iterable of placeholder strings to look for, in priority order.
        mask_token: Replacement text. When ``None`` (the default), the
            notebook-level ``tokenizer.mask_token`` is used, so ``tokenizer``
            must be defined before a sentence containing a placeholder is
            processed.

    Returns:
        The sentence with the placeholder replaced, or ``None`` when the
        sentence contains none of the placeholders.
    """
    for placeholder in masks:
        if placeholder in sentence:
            if mask_token is None:
                # Resolved lazily so the function can be defined before the
                # tokenizer is loaded, and is never touched when nothing matches.
                mask_token = tokenizer.mask_token
            return sentence.replace(placeholder, mask_token)
    # No placeholder present: signal it explicitly so callers can filter.
    return None

In [5]:
# Checkpoint identifier shared by the tokenizer and the question-answering model.
model_name = 'google/tapas-base-finetuned-wtq'

# Both objects are loaded from the same checkpoint so their vocabularies match.
tokenizer = TapasTokenizer.from_pretrained(model_name)
model = TapasForQuestionAnswering.from_pretrained(model_name)
# Alternative head, currently disabled:
# model = TapasForMaskedLM.from_pretrained(model_name)

In [6]:
# Use the first record of the dataset: its `point` and its list of dataset references.
row = df.iloc[0]
point, datasets = row.point, row.data

In [7]:
# Materialise the clozes generated for this point with the named-entity answer generator.
clozes = list(generate_clozes_from_point(point, ne_answer_generator))
# Cloze-derived queries are disabled; the hand-written questions below are used instead.
# queries = [replace_mask(c.cloze_text) for c in clozes]

queries = [
    'What was the approximate gross value added of the UK non-financial business economy in 2019?',
    'What was the approximate gross value added of the UK non-financial business economy in 2018?',
    'Was the approximate gross value added of the UK non-financial business economy higher in 2019 than 2018?',
    'Which year had the highest total turnover?',
    'Which year had the lowest total turnover?']

# Turn the third dataset reference into a file name: slashes become underscores
# and the first character is dropped (presumably a leading slash — TODO confirm).
# A dedicated name is used so `data` only ever refers to the parsed worksheet.
dataset_name = datasets[2].replace('/', '_')[1:]
excel_file = pd.ExcelFile(f'datasets/{dataset_name}.xls')

In [8]:
# List the worksheets in the workbook so the one to parse can be picked by name.
excel_file.sheet_names

['Contents',
 'Non-Financial Business Economy',
 'Section-Division by Region',
 'Region by Section-Division']

In [9]:
# Parse the 'Non-Financial Business Economy' worksheet into a DataFrame.
data = excel_file.parse('Non-Financial Business Economy')

In [10]:
# Clean the raw sheet, cast every cell to text (the answer-printing cell joins
# cell values as strings), keep rows 175-180 and renumber them from 0.
table = (
    preprocess_table(data)
    .astype(str)[175:181]
    .reset_index(drop=True)
)

In [11]:
# Show the six-row slice that is passed to the tokenizer.
table

Unnamed: 0,Standard Industrial Classification (Revised 2007) Section,-,Country and Region,Year,Total turnover,Approximate gross value added at basic prices (aGVA),"Total purchases of goods, materials and services",Total employment costs
0,A-S (Part) 2,UK non-financial business economy,United Kingdom,2014,3445024,1087175,2337331,564856
1,A-S (Part) 2,UK non-financial business economy,United Kingdom,2015,3365823,1142070,2200721,591906
2,A-S (Part) 2,UK non-financial business economy,United Kingdom,2016,3525775,1161990,2338100,616451
3,A-S (Part) 2,UK non-financial business economy,United Kingdom,2017,3823162,1218978,2582797,642618
4,A-S (Part) 2,UK non-financial business economy,United Kingdom,2018,4029100,1269185,2735518,661686
5,A-S (Part) 2,UK non-financial business economy,United Kingdom,2019,4099272,1311079,2761846,697774


In [12]:
# Encode the table once per query; each sequence is padded to the maximum
# length and truncated if the table does not fit.
inputs = tokenizer(
    table=table, queries=queries, padding='max_length', return_tensors='pt',
    truncation=True)

# Inference only: with autograd disabled no computation graph is built, which
# saves memory and makes a later `.detach()` on the logits unnecessary.
with torch.no_grad():
    outputs = model(**inputs)

# Convert the cell-selection and aggregation logits into, per query, a list of
# answer cell coordinates and an aggregation operator index.
predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs,
    outputs.logits,
    outputs.logits_aggregation)

In [13]:
# Aggregation operator index predicted for each query, in query order.
predicted_aggregation_indices

[2, 2, 2, 0, 0]

In [14]:
# Predicted answer cells for each query, as (row, column) positions into `table`.
predicted_answer_coordinates

[[(5, 5)], [(4, 5)], [(5, 5)], [(5, 3)], [(1, 3)]]

In [15]:
# Echo the queries so they can be read alongside the predictions above.
queries

['What was the approximate gross value added of the UK non-financial business economy in 2019?',
 'What was the approximate gross value added of the UK non-financial business economy in 2018?',
 'Was the approximate gross value added of the UK non-financial business economy higher in 2019 than 2018?',
 'Which year had the highest total turnover?',
 'Which year had the lowest total turnover?']

In [16]:
# let's print out the results:
id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}
aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

answers = []
for coordinates in predicted_answer_coordinates:
  if len(coordinates) == 1:
    # only a single cell:
    answers.append(table.iat[coordinates[0]])
  else:
    # multiple cells
    cell_values = []
    for coordinate in coordinates:
       cell_values.append(table.iat[coordinate])
    answers.append(", ".join(cell_values))

display(table)
print("")
for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
  print(query)
  if predicted_agg == "NONE":
    print("Predicted answer: " + answer)
  else:
    print("Predicted answer: " + predicted_agg + " > " + answer)

Unnamed: 0,Standard Industrial Classification (Revised 2007) Section,-,Country and Region,Year,Total turnover,Approximate gross value added at basic prices (aGVA),"Total purchases of goods, materials and services",Total employment costs
0,A-S (Part) 2,UK non-financial business economy,United Kingdom,2014,3445024,1087175,2337331,564856
1,A-S (Part) 2,UK non-financial business economy,United Kingdom,2015,3365823,1142070,2200721,591906
2,A-S (Part) 2,UK non-financial business economy,United Kingdom,2016,3525775,1161990,2338100,616451
3,A-S (Part) 2,UK non-financial business economy,United Kingdom,2017,3823162,1218978,2582797,642618
4,A-S (Part) 2,UK non-financial business economy,United Kingdom,2018,4029100,1269185,2735518,661686
5,A-S (Part) 2,UK non-financial business economy,United Kingdom,2019,4099272,1311079,2761846,697774



What was the approximate gross value added of the UK non-financial business economy in 2019?
Predicted answer: AVERAGE > 1311079
What was the approximate gross value added of the UK non-financial business economy in 2018?
Predicted answer: AVERAGE > 1269185
Was the approximate gross value added of the UK non-financial business economy higher in 2019 than 2018?
Predicted answer: AVERAGE > 1311079
Which year had the highest total turnover?
Predicted answer: 2019
Which year had the lowest total turnover?
Predicted answer: 2015
