In [1]:
import nlp
import tokenizers
import transformers

### Part 1: Train base BERT tokenizer

In [2]:
train, test = nlp.load_dataset("emo", split = ["train", "test"])

In [3]:
train_text = " ".join([i["text"] for i in train])
test_text = " ".join([i["text"] for i in test])

In [4]:
with open('../data/train.txt', 'w') as f:
    f.write(train_text)
with open('../data/test.txt', 'w') as f:
    f.write(test_text)

In [5]:
tokenizer = tokenizers.BertWordPieceTokenizer()

In [6]:
vocab_size = 5000

tokenizer.train(files = ['../data/train.txt', '../data/test.txt'], \
                vocab_size = vocab_size, min_frequency = 50)

In [7]:
tokenizer.save_model('../tokenizers/emo-mobilebert/')

['../tokenizers/emo-mobilebert/vocab.txt']

In [8]:
tokenizer = transformers.MobileBertTokenizerFast.from_pretrained('../tokenizers/emo-mobilebert/')

In [24]:
config = transformers.MobileBertConfig(vocab_size = len(tokenizer.get_vocab()))

In [26]:
config

MobileBertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_activation": true,
  "embedding_size": 128,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "intra_bottleneck_size": 128,
  "key_query_shared_bottleneck": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "mobilebert",
  "normalization_type": "no_norm",
  "num_attention_heads": 4,
  "num_feedforward_networks": 4,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "trigram_input": true,
  "true_hidden_size": 128,
  "type_vocab_size": 2,
  "use_bottleneck": true,
  "use_bottleneck_attention": false,
  "vocab_size": 2016
}

In [27]:
model = transformers.MobileBertForSequenceClassification(config)

In [28]:
model.num_labels = 4

In [30]:
id2label = {}
for i in range(model.num_labels):
    id2label[i] = train.features["label"].int2str(i)
id2label

{0: 'others', 1: 'happy', 2: 'sad', 3: 'angry'}

In [31]:
model.config.id2label = id2label

In [32]:
model.config

MobileBertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_activation": true,
  "embedding_size": 128,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 512,
  "id2label": {
    "0": "others",
    "1": "happy",
    "2": "sad",
    "3": "angry"
  },
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "intra_bottleneck_size": 128,
  "key_query_shared_bottleneck": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "mobilebert",
  "normalization_type": "no_norm",
  "num_attention_heads": 4,
  "num_feedforward_networks": 4,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "trigram_input": true,
  "true_hidden_size": 128,
  "type_vocab_size": 2,
  "use_bottleneck": true,
  "use_bottleneck_attention": false,
  "vocab_size": 2016
}