In [None]:
import re
import itertools
import numpy as np
import pandas as pd
import torch
from transformers import (
    AutoModel, 
    AutoTokenizer, 
#    BertTokenizer,
)

# Turn autograd off for the whole session: the cells visible here only run
# forward passes through the model, so no gradient bookkeeping is needed.
torch.set_grad_enabled(False)

In [None]:
# Checkpoint to inspect (Colab-style form field).
# Alternative checkpoint that can be swapped in here: "gpt2".
MODEL_NAME = "bert-base-uncased" #@param

# Fast tokenizer is requested explicitly; the sentence loop further down calls
# `encoded.word_ids()` to map tokens back to words.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Ask the model to return every layer's hidden states and attention maps
# alongside its regular outputs.
model_options = dict(output_hidden_states=True, output_attentions=True)
model = AutoModel.from_pretrained(MODEL_NAME, **model_options)

In [None]:
# Example inputs: plain words, hyphenation, contractions, quotes and an em dash,
# to see how the tokenizer splits them compared to a naive regex word splitter.
sentences = [
    'Do not compare apples and oranges, but apples and apples.',
    'This is a follow-up sentence with less common punctuation!',
    'Does any of this really work?',
    "Do not and don't are subtly different.",
    '"Go away!" — she said.'
]

# For each sentence: tokenize, run the model, group token positions by the word
# they belong to, and show the groups side by side with a regex-based word split.
for sent in sentences:
    print("SENTENCE", sent)
    encoded = tokenizer(sent, return_tensors="pt")
    inputs = encoded.input_ids
    print("INPUTS", inputs.shape, inputs)
    decoded = tokenizer.convert_ids_to_tokens(inputs[0], skip_special_tokens=False)
    print("DECODE", decoded)
    attention_mask = encoded['attention_mask']
    print("MASK", attention_mask.shape, attention_mask)
    output = model(input_ids=inputs, attention_mask=attention_mask)
    token_len = attention_mask.sum().item()
    print("# OF TOKENS", token_len)

    # `word_ids()` yields None for tokens that do not belong to any word
    # (special tokens added by the tokenizer). Dropping exactly those positions
    # works whether or not the tokenizer wraps the sentence in special tokens,
    # so no per-model list of checkpoint names is needed.
    raw_word_ids = encoded.word_ids()
    token_positions = np.array(
        [pos for pos, wid in enumerate(raw_word_ids) if wid is not None], dtype=int
    )
    word_indices = np.array(
        [wid for wid in raw_word_ids if wid is not None], dtype=int
    )
    print("WORD INDICES", word_indices)

    # First occurrence of each word id marks where a new word starts. Splitting
    # at those offsets yields one array of token positions per word; the leading
    # chunk (everything before the first start, i.e. nothing) is discarded.
    word_starts = np.unique(word_indices, return_index=True)[1]
    word_groups = np.split(token_positions, word_starts)[1:]
    print("GROUPS", len(word_groups), word_groups)

    decoded_arr = np.array(decoded)
    ids_arr = inputs[0].numpy()

    rows = []
    # zip_longest pads the shorter side with None, so a mismatch between the
    # tokenizer's word count and the regex's word count stays visible in the table.
    for group, regex_word in itertools.zip_longest(word_groups, re.findall(r"\w+|[^\s\w]", sent)):
        if group is None:
            rows.append((None, None, None, regex_word, None))
            continue
        pieces = decoded_arr[group]
        # Glue sub-word pieces back together by removing the "##" continuation
        # prefix. NOTE(review): only "##" is handled; tokenizers that mark pieces
        # differently will show their raw markers in 'combined' — confirm if needed.
        combined = "".join(piece[2:] if piece[:2] == "##" else piece for piece in pieces)
        rows.append((group, pieces, ids_arr[group].tolist(), regex_word, combined))

    # Column labels follow the tuple order above: token strings first, then ids.
    display(pd.DataFrame(rows, columns=['group', 'decoded', 'token_ids', 'regex', 'combined']))
    print("=" * 40)
