In [2]:
import pandas as pd
import torch
torch.__version__

'1.11.0+cu113'

In [4]:
# pip install torch-scatter -f 'https://data.pyg.org/whl/torch-1.10.2+cu102.html'

In [None]:
train_data = pd.read_csv('data/train.tsv', sep='\t')
train_data.head()

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text
0,nt-639,0,0,where are the players from?,table_csv/203_149.csv,"['(0, 4)', '(1, 4)', '(2, 4)', '(3, 4)', '(4, ...","['Louisiana State University', 'Valley HS (Las..."
1,nt-639,0,1,which player went to louisiana state university?,table_csv/203_149.csv,"['(0, 1)']",['Ben McDonald']
2,nt-639,1,0,who are the players?,table_csv/203_149.csv,"['(0, 1)', '(1, 1)', '(2, 1)', '(3, 1)', '(4, ...","['Ben McDonald', 'Tyler Houston', 'Roger Salke..."
3,nt-639,1,1,which ones are in the top 26 picks?,table_csv/203_149.csv,"['(0, 1)', '(1, 1)', '(2, 1)', '(3, 1)', '(4, ...","['Ben McDonald', 'Tyler Houston', 'Roger Salke..."
4,nt-639,1,2,"and of those, who is from louisiana state univ...",table_csv/203_149.csv,"['(0, 1)']",['Ben McDonald']


As you can see, each row corresponds to a question related to a table.

The position column identifies whether the question is the first, second, ... in a sequence of questions related to a table.

The table_file column identifies the name of the table file, which refers to a CSV file in the table_csv directory.

The answer_coordinates and answer_text columns indicate the answer to the question. The answer_coordinates is a list of tuples, each tuple being a (row_index, column_index) pair. The answer_text column is a list of strings, indicating the cell values.

However, the answer_coordinates and answer_text columns are currently not recognized as real Python lists of Python tuples and strings respectively. Let us convert it to real python list of tuples

In [None]:
import ast

def _parse_answer_coordinates(answer_coordinate_str):
  """
  Parses the answer_coordinates of a question.
  Args:
    answer_coordinate_str: A string representation of a Python list of tuple
      strings.
      For example: "['(1, 4)','(1, 3)', ...]"
  """

  try:
    answer_coordinates = []
    # make a list of strings
    coords = ast.literal_eval(answer_coordinate_str)
    # parse each string as a tuple
    for row_index, column_index in sorted(
        ast.literal_eval(coord) for coord in coords):
      answer_coordinates.append((row_index, column_index))
  except SyntaxError:
    raise ValueError('Unable to evaluate %s' % answer_coordinate_str)
  
  return answer_coordinates


def _parse_answer_text(answer_text):
  """
  Populates the answer_texts field of `answer` by parsing `answer_text`.
  Args:
    answer_text: A string representation of a Python list of strings.
      For example: "[u'test', u'hello', ...]"
    answer: an Answer object.
  """
  try:
    answer = []
    for value in ast.literal_eval(answer_text):
      answer.append(value)
  except SyntaxError:
    raise ValueError('Unable to evaluate %s' % answer_text)

  return answer

train_data['answer_coordinates'] = train_data['answer_coordinates'].apply(lambda coords_str: _parse_answer_coordinates(coords_str))
train_data['answer_text'] = train_data['answer_text'].apply(lambda txt: _parse_answer_text(txt))

train_data.head(10)

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text
0,nt-639,0,0,where are the players from?,table_csv/203_149.csv,"[(0, 4), (1, 4), (2, 4), (3, 4), (4, 4), (5, 4...","[Louisiana State University, Valley HS (Las Ve..."
1,nt-639,0,1,which player went to louisiana state university?,table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald]
2,nt-639,1,0,who are the players?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J..."
3,nt-639,1,1,which ones are in the top 26 picks?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J..."
4,nt-639,1,2,"and of those, who is from louisiana state univ...",table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald]
5,nt-639,2,0,who are the players in the top 26?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J..."
6,nt-639,2,1,"of those, which one was from louisiana state u...",table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald]
7,nt-11649,0,0,what are all the names of the teams?,table_csv/204_135.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Cordoba CF, CD Malaga, Granada CF, UD Las Pal..."
8,nt-11649,0,1,"of these, which teams had any losses?",table_csv/204_135.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Cordoba CF, CD Malaga, Granada CF, UD Las Pal..."
9,nt-11649,0,2,"of these teams, which had more than 21 losses?",table_csv/204_135.csv,"[(15, 1)]",[CD Villarrobledo]


Let's create a new dataframe that groups questions which are asked in a sequence related to the table. We can do this by adding a sequence_id column, which is a combination of the id and annotator columns:

In [None]:
def get_sequence_id(example_id, annotator):
  if "-" in str(annotator):
    raise ValueError('"-" not allowed in annotator.')
  return f"{example_id}-{annotator}"

train_data['sequence_id'] = train_data.apply(lambda x: get_sequence_id(x.id, x.annotator), axis=1)
train_data.head()

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text,sequence_id
0,nt-639,0,0,where are the players from?,table_csv/203_149.csv,"[(0, 4), (1, 4), (2, 4), (3, 4), (4, 4), (5, 4...","[Louisiana State University, Valley HS (Las Ve...",nt-639-0
1,nt-639,0,1,which player went to louisiana state university?,table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald],nt-639-0
2,nt-639,1,0,who are the players?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J...",nt-639-1
3,nt-639,1,1,which ones are in the top 26 picks?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J...",nt-639-1
4,nt-639,1,2,"and of those, who is from louisiana state univ...",table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald],nt-639-1


In [None]:
# let's group table-question pairs by sequence id, and remove some columns we don't need 
grouped = train_data.groupby(by='sequence_id').agg(lambda x: x.tolist())
grouped = grouped.drop(columns=['id', 'annotator', 'position'])
grouped['table_file'] = grouped['table_file'].apply(lambda x: x[0])
grouped.head(10)

Unnamed: 0_level_0,question,table_file,answer_coordinates,answer_text
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ns-1292-0,"[who are all the athletes?, where are they fro...",table_csv/204_521.csv,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[Tommy Green, Janis Dalins, Ugo Frigerio, Kar..."
ns-1292-1,[who competed in the men's 50 kilometer walk a...,table_csv/204_521.csv,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[Tommy Green, Janis Dalins, Ugo Frigerio, Kar..."
ns-1292-2,"[what competitors were from germany?, who was ...",table_csv/204_521.csv,"[[(3, 1), (5, 1)], [(3, 1)]]","[[Karl Hahnel, Paul Sievert], [Karl Hahnel]]"
ns-3397-0,[what are the number of bronze medals received...,table_csv/204_34.csv,"[[(0, 4), (1, 4), (2, 4), (3, 4), (4, 4), (5, ...","[[0, 5, 3, 5, 5, 1, 0, 0, 0, 0, 3, 2, 1, 1, 1,..."
ns-3397-1,[what are the countries of the 2004 african ju...,table_csv/204_34.csv,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[Algeria, Tunisia, Egypt, Cameroon, Morocco, ..."
ns-3397-2,"[what are all the nations listed?, how many br...",table_csv/204_34.csv,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[Algeria, Tunisia, Egypt, Cameroon, Morocco, ..."
ns-3441-0,"[where did calvin murray go to school?, where ...",table_csv/203_149.csv,"[[(10, 4)], [(19, 4)], [(24, 4)], [(9, 4)], [(...","[[W.T. White High School (Dallas, TX)], [Unive..."
ns-3441-1,"[what team was the last pick in the top 10, wh...",table_csv/203_149.csv,"[[(9, 2)], [(9, 1)]]","[[Montreal Expos], [Charles Johnson]]"
ns-3441-2,"[who are all of the players?, where did they a...",table_csv/203_149.csv,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[Ben McDonald, Tyler Houston, Roger Salkeld, ..."
nt-10730-0,[what was the production numbers of each revol...,table_csv/203_253.csv,"[[(0, 4), (1, 4), (2, 4), (3, 4), (4, 4), (5, ...","[[1,900 (estimated), 14,500 (estimated), 6,000..."


Each row in the dataframe above now consists of a table and one or more questions which are asked in a sequence. Let's visualize the first row, i.e. a table, together with its queries:

In [None]:
from IPython.display import display

# path to the directory containing all csv files
table_csv_path = "SQA Release 1.0/table_csv"

item = grouped.iloc[1]
table = pd.read_csv(table_csv_path + item.table_file[9:]).astype(str) 

display(table)
print("")
print(item.question)

Unnamed: 0,Rank,Name,Nationality,Time (hand),Notes
0,,Tommy Green,Great Britain,4:50:10,OR
1,,Janis Dalins,Latvia,4:57:20,
2,,Ugo Frigerio,Italy,4:59:06,
3,4.0,Karl Hahnel,Germany,5:06:06,
4,5.0,Ettore Rivolta,Italy,5:07:39,
5,6.0,Paul Sievert,Germany,5:16:41,
6,7.0,Henri Quintric,France,5:27:25,
7,8.0,Ernie Crosbie,United States,5:28:02,
8,9.0,Bill Chisholm,United States,5:51:00,
9,10.0,Alfred Maasik,Estonia,6:19:00,



["who competed in the men's 50 kilometer walk at the 1932 summer olympics?", "of those competitors in the 1932 summer olympics men's 50 kilometer walk, which ones were from germany?", 'of those competitors from germany, which was not paul sievert?']


We can see that there are 3 sequential questions asked related to the contents of the table.

We can now use TapasTokenizer to batch encode this, as follows:

In [None]:
import torch
from transformers import TapasTokenizer

# initialize the tokenizer
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")

In [None]:
encoding = tokenizer(table=table, queries=item.question, answer_coordinates=item.answer_coordinates, answer_text=item.answer_text,
                     truncation=True, padding="max_length", return_tensors="pt")
encoding.keys()

dict_keys(['input_ids', 'labels', 'numeric_values', 'numeric_values_scale', 'token_type_ids', 'attention_mask'])

TAPAS basically flattens every table-question pair before feeding it into a BERT like model:

In [None]:
tokenizer.decode(encoding["input_ids"][0])

"[CLS] who competed in the men's 50 kilometer walk at the 1932 summer olympics? [SEP] rank name nationality time ( hand ) notes [EMPTY] tommy green great britain 4 : 50 : 10 or [EMPTY] janis dalins latvia 4 : 57 : 20 [EMPTY] [EMPTY] ugo frigerio italy 4 : 59 : 06 [EMPTY] 4. 0 karl hahnel germany 5 : 06 : 06 [EMPTY] 5. 0 ettore rivolta italy 5 : 07 : 39 [EMPTY] 6. 0 paul sievert germany 5 : 16 : 41 [EMPTY] 7. 0 henri quintric france 5 : 27 : 25 [EMPTY] 8. 0 ernie crosbie united states 5 : 28 : 02 [EMPTY] 9. 0 bill chisholm united states 5 : 51 : 00 [EMPTY] 10. 0 alfred maasik estonia 6 : 19 : 00 [EMPTY] [EMPTY] henry cieman canada [EMPTY] dnf [EMPTY] john moralis greece [EMPTY] dnf [EMPTY] francesco pretti italy [EMPTY] dnf [EMPTY] arthur tell schwab switzerland [EMPTY] dnf [EMPTY] harry hinkel united states [EMPTY] dnf [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

Let's create a PyTorch dataset and corresponding dataloader. Note the getitem method here: in order to properly set the prev_labels token types, we must check whether a table-question pair is the first in a sequence or not. In case it is, we can just encode it. In case it isn't, we need to encode it together with the previous table-question pair.

In [None]:
class TableDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        item = self.df.iloc[idx]
        table = pd.read_csv(table_csv_path + item.table_file[9:]).astype(str) # TapasTokenizer expects the table data to be text only
        if item.position != 0:
          # use the previous table-question pair to correctly set the prev_labels token type ids
          previous_item = self.df.iloc[idx-1]
          encoding = self.tokenizer(table=table, 
                                    queries=[previous_item.question, item.question], 
                                    answer_coordinates=[previous_item.answer_coordinates, item.answer_coordinates], 
                                    answer_text=[previous_item.answer_text, item.answer_text],
                                    padding="max_length",
                                    truncation=True,
                                    return_tensors="pt"
          )
          # use encodings of second table-question pair in the batch
          encoding = {key: val[-1] for key, val in encoding.items()}
        else:
          # this means it's the first table-question pair in a sequence
          encoding = self.tokenizer(table=table, 
                                    queries=item.question, 
                                    answer_coordinates=item.answer_coordinates, 
                                    answer_text=item.answer_text,
                                    padding="max_length",
                                    truncation=True,
                                    return_tensors="pt"
          )
          # remove the batch dimension which the tokenizer adds 
          encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        return encoding

    def __len__(self):
        return len(self.df)

train_dataset = TableDataset(df=train_data, tokenizer=tokenizer)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2)

### Define the Model

In [None]:
from transformers import TapasConfig, TapasForQuestionAnswering

# the base  model with default SQA configuration
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

Some weights of TapasForQuestionAnswering were not initialized from the model checkpoint at google/tapas-base and are newly initialized: ['output_weights', 'column_output_bias', 'column_output_weights', 'output_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TapasForQuestionAnswering(
  (tapas): TapasModel(
    (embeddings): TapasEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings_0): Embedding(3, 768)
      (token_type_embeddings_1): Embedding(256, 768)
      (token_type_embeddings_2): Embedding(256, 768)
      (token_type_embeddings_3): Embedding(2, 768)
      (token_type_embeddings_4): Embedding(256, 768)
      (token_type_embeddings_5): Embedding(256, 768)
      (token_type_embeddings_6): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.07, inplace=False)
    )
    (encoder): TapasEncoder(
      (layer): ModuleList(
        (0): TapasLayer(
          (attention): TapasAttention(
            (self): TapasSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)


### Train the Model

In [None]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(10):  # loop over the dataset multiple times
   print("Epoch:", epoch)
   for idx, batch in enumerate(train_dataloader):
        # get the inputs;
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch["labels"].to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                       labels=labels)
        loss = outputs.loss
        print("Loss:", loss.item())
        loss.backward()
        optimizer.step()

Epoch: 0
Loss: 2.4576611518859863
Loss: 2.208237648010254
Loss: 1.532903790473938
Loss: 1.6185946464538574
Loss: 1.4657636880874634
Loss: 2.6599626541137695
Loss: 2.347184419631958
Loss: 2.606064796447754
Loss: 2.5185210704803467
Loss: 3.0615274906158447
Loss: 2.393543243408203
Loss: 2.137967586517334
Loss: 2.0683250427246094
Loss: 1.0513044595718384
Loss: 4.0133466720581055
Loss: 0.9346457719802856
Loss: 1.0688081979751587
Loss: 1.3251779079437256
Loss: 2.0984935760498047
Loss: 0.9930031299591064
Loss: 0.8833472728729248
Loss: 0.9931430816650391
Loss: 0.9654159545898438
Loss: 0.9942173957824707
Loss: 4.04036808013916
Loss: 0.7319822311401367
Loss: 0.8009426593780518
Loss: 2.230072259902954
Loss: 0.6047083139419556
Loss: 2.072066307067871
Loss: 2.064149856567383
Loss: 0.8191809058189392
Loss: 0.8792343139648438
Loss: 1.3855931758880615
Loss: 1.428628921508789
Loss: 1.130543828010559
Loss: 1.8510346412658691
Loss: 2.871675968170166
Loss: 0.6975874900817871
Loss: 0.5966156721115112
Loss:

### Pushing to HuggingFace Hub

In [1]:
!pip install huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[?25l[K     |████▏                           | 10 kB 16.5 MB/s eta 0:00:01[K     |████████▍                       | 20 kB 18.2 MB/s eta 0:00:01[K     |████████████▋                   | 30 kB 11.2 MB/s eta 0:00:01[K     |████████████████▉               | 40 kB 3.9 MB/s eta 0:00:01[K     |█████████████████████           | 51 kB 3.9 MB/s eta 0:00:01[K     |█████████████████████████▎      | 61 kB 4.6 MB/s eta 0:00:01[K     |█████████████████████████████▌  | 71 kB 4.8 MB/s eta 0:00:01[K     |████████████████████████████████| 77 kB 2.8 MB/s 
Installing collected packages: huggingface-hub
Successfully installed huggingface-hub-0.5.1


In [None]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
model.push_to_hub('Meena/table-question-answering-tapas')

Cloning https://huggingface.co/Meena/table-question-answering-tapas into local empty directory.


Upload file pytorch_model.bin:   0%|          | 3.34k/422M [00:00<?, ?B/s]

To https://huggingface.co/Meena/table-question-answering-tapas
   c32d802..0f828a7  main -> main



'https://huggingface.co/Meena/table-question-answering-tapas/commit/0f828a70eb3dd6daca8851fa171ad5c6b9a52d84'

In [None]:
tokenizer.push_to_hub('Meena/table-question-answering-tapas')

To https://huggingface.co/Meena/table-question-answering-tapas
   0f828a7..11151eb  main -> main



'https://huggingface.co/Meena/table-question-answering-tapas/commit/11151eb21004a9d969d27381aea7f953a9c4bfd1'