In [2]:
import torch
from bertviz import model_view
from transformers import AutoTokenizer, AutoModelForSequenceClassification, utils
# Suppress the library's standard warnings before anything is loaded.
utils.logging.set_verbosity_error()

# Local checkpoint directory; the tokenizer and the classifier are both loaded from it.
model_name = "../models/bart-infotab"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# output_attentions=True configures the model to return attention values.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    output_attentions=True,
)

In [13]:
import jsonlines
import json
import pandas as pd

# Read only the first dev example. The context manager closes the file handle
# as soon as the record has been read.
with jsonlines.open("../data/infotab/processed/dev.jsonl") as reader:
    item = reader.read()  # raises EOFError if the file contains no records

# item["table"] is a JSON string. Each cell is joined with " , " below, so cells
# are presumably lists of strings -- TODO confirm against the preprocessing step.
table = pd.DataFrame(json.loads(item["table"]))
for column in table.columns:
    # Rewrite every cell of this column as "<column> : <value> , <value> ...".
    table[column] = table[column].apply(
        lambda values, name=column: f"{name} : {' , '.join(values)}"
    )

# One string per row (cells joined by " ; "), rows joined by the tokenizer's
# separator token.
row_texts = table.apply(lambda row: " ; ".join(row), axis=1).values.tolist()
linearized_table = tokenizer.sep_token.join(row_texts)
sentence = item["sentence"]

# Encode (linearized table, sentence) as a text pair, returned as PyTorch tensors.
inputs = tokenizer(linearized_table, sentence, truncation=True, padding=True, return_tensors="pt")

In [16]:
# Bare expression: the notebook renders the tokenizer output built for the
# (linearized table, sentence) pair, i.e. its input_ids and attention_mask tensors.
inputs

{'input_ids': tensor([[    0,  1270,  4832, 14590,  2060,   677,   260, 30268, 25606,  8912,
          4832,    36,   504,  4429,    12,   698,    12,  4124,  4839,   231,
           779,   504,  4429,  1437, 27090,  2089,  1899,     6, 26211,    36,
         13040,   233,     9,  1083, 11492,    43, 25606, 38339,  4832,  1132,
           830, 23137,    36, 42442,    12,  3669,    12,  2890,    43,  1437,
            36,  4628,  4034,    43,  1437,  6130,    12,   448,  8616,    12,
         10067,    12,   597,  5434,   293,     6,  1470, 25606, 24869,  1258,
          4832, 43027,  2156,  1437, 31569,  1437,  2156,  1437,  8980,  8083,
             2,     2, 14590,  2060,   677,   260, 30268,   962,   137,   623,
          1771,    38,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [20]:
# Forward pass for inspection only: with autograd disabled no graph is built,
# so the returned attention tensors do not hold on to gradient state.
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)  # Run model

In [None]:
attention = outputs.decoder_attentions  # Retrieve decoder self-attention from model outputs

# No decoder_input_ids were passed to the model, and BART then feeds the decoder
# the input ids shifted one position to the right (decoder start token first,
# last token dropped). The attention maps must be labelled with that shifted
# sequence, otherwise every label is off by one.
# NOTE(review): assumes a BART-style checkpoint -- confirm for other architectures.
encoder_ids = inputs["input_ids"][0].tolist()
decoder_ids = [model.config.decoder_start_token_id] + encoder_ids[:-1]
tokens = tokenizer.convert_ids_to_tokens(decoder_ids)  # Convert decoder ids to token strings

model_view(attention, tokens)  # Display model view