In [1]:
import sys
sys.path.append("..")
from pathlib import Path
import torch
import numpy as np

from utils_glue import *
from pytorch_transformers import *

In [2]:
# ---- Experiment settings -------------------------------------------------
# Plain constants first, so everything a reader might tune sits together.
task = "sst-2"
output_mode = "classification"
model_type = "bert"
model_name = "bert-base-uncased"
max_seq_length = 128

# ---- Objects derived from the settings -----------------------------------
# `processors` comes from the star-import of utils_glue; it maps a task name
# to a processor class, which is instantiated here.
processor = processors[task]()

# Training examples and the label set both come from the task processor.
train_examples = processor.get_train_examples("../glue_data/SST-2/")
label_list = processor.get_labels()

# Tokenizer is loaded from the pretrained checkpoint named above.
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [3]:
# Turn the raw training examples into model-ready features. The
# model-dependent switches below are all keyed off `model_type`.
features = convert_examples_to_features(
    train_examples,
    label_list,
    max_seq_length,
    tokenizer,
    output_mode,
    # xlnet places its cls token at the end of the sequence
    cls_token_at_end=(model_type == 'xlnet'),
    cls_token=tokenizer.cls_token,
    cls_token_segment_id=(2 if model_type == 'xlnet' else 0),
    sep_token=tokenizer.sep_token,
    # roberta uses an extra separator between pairs of sentences,
    # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
    sep_token_extra=(model_type == 'roberta'),
    # xlnet is padded on the left
    pad_on_left=(model_type == 'xlnet'),
    # look the pad id up from the tokenizer's own pad token
    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
    pad_token_segment_id=(4 if model_type == 'xlnet' else 0),
)

In [4]:
# Show every attribute of the first converted example as a dict.
features[0].__dict__

{'input_ids': [101,
  5342,
  2047,
  3595,
  8496,
  2013,
  1996,
  18643,
  3197,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'input_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0

In [6]:
# Sanity check: which vocabulary token does id 0 map to?
tokenizer.convert_ids_to_tokens([0] * 2)

['[PAD]', '[PAD]']

In [7]:
from collections import Counter

# Count how often each token occurs across the training features.
#
# Real tokens are selected with `input_mask` rather than by stopping at the
# first id equal to 0. The old check assumed a pad id of 0 and right padding,
# while the featurization cell looks the pad id up from the tokenizer and can
# pad on the left; with left padding it would stop at once and count nothing.
# In the feature printed above, the mask is 1 on the leading real tokens and
# 0 on the trailing padding.
# NOTE(review): this relies on the default "mask padding with zero" behaviour
# of convert_examples_to_features — confirm if that option is ever changed.
freqs = Counter()
for feat in features:
    real_ids = [
        token_id
        for token_id, is_real in zip(feat.input_ids, feat.input_mask)
        if is_real
    ]
    # Only the unpadded ids are converted, so padding is never turned into
    # tokens just to be thrown away.
    freqs.update(tokenizer.convert_ids_to_tokens(real_ids))

In [9]:
import json

# Persist the token counts as a JSON object mapping token -> count.
with open("../info/train_freqs.json", mode="wt") as freqs_file:
    json.dump(freqs, fp=freqs_file)