In [None]:
import os
import warnings

import nlp
import tokenizers
import transformers
from sklearn.metrics import accuracy_score

In [None]:
# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

### Part 1: Train base BERT tokenizer

In [None]:
# Load the "emo" dataset; passing a list of split names returns one dataset per
# name, in the same order.
train, test = nlp.load_dataset(
    "emo",
    split=["train", "test"],
)

In [None]:
# Flatten every utterance of a split into one space-separated string; these
# strings are the raw corpus the WordPiece tokenizer is trained on.
train_text = " ".join(example["text"] for example in train)
test_text = " ".join(example["text"] for example in test)

In [None]:
# Dump both corpora to disk because the tokenizer trainer reads from file paths.
# The output directory is created first so the cell works on a fresh checkout.
os.makedirs('../data', exist_ok = True)
# Write explicitly as UTF-8: the default encoding of open() depends on the
# platform locale and can fail on non-ASCII characters in the text.
with open('../data/train.txt', 'w', encoding = 'utf-8') as f:
    f.write(train_text)
with open('../data/test.txt', 'w', encoding = 'utf-8') as f:
    f.write(test_text)

In [None]:
# Create a BERT WordPiece tokenizer with default settings; its vocabulary is
# learned by the train() call in the next cell.
# NOTE: the name `tokenizer` is rebound further down to the transformers
# tokenizer loaded from the saved vocabulary.
tokenizer = tokenizers.BertWordPieceTokenizer()

In [None]:
# Upper bound on the learned vocabulary size.
vocab_size = 5000

# Learn the WordPiece vocabulary from the dumped corpus files.
# NOTE(review): the test split is part of the tokenizer training corpus —
# confirm this is intended.
tokenizer.train(
    files=['../data/train.txt', '../data/test.txt'],
    vocab_size=vocab_size,
    min_frequency=50,
)

In [None]:
# Make sure the target directory exists before saving; on a fresh checkout it is
# not there yet and the save would fail. Creating it is a no-op when it exists.
os.makedirs('../tokenizers/emo-mobilebert/', exist_ok = True)
# Writes the learned vocabulary (vocab.txt) and returns the list of files written.
tokenizer.save_model('../tokenizers/emo-mobilebert/')

['../tokenizers/emo-mobilebert/vocab.txt']

In [None]:
# Reload the freshly trained vocabulary as a transformers fast tokenizer.
# From here on `tokenizer` refers to this object, not the `tokenizers` trainer above.
tokenizer = transformers.MobileBertTokenizerFast.from_pretrained('../tokenizers/emo-mobilebert/')

### Part 2: Instantiate MobileBERT Model and reset params

In [None]:
# Default MobileBERT configuration, with the embedding table sized to the
# vocabulary of the tokenizer trained above.
config = transformers.MobileBertConfig(
    vocab_size=len(tokenizer.get_vocab()),
)

In [None]:
# Number of target classes; the id2label mapping built below is derived from this value.
config.num_labels = 4

In [None]:
# NOTE(review): in transformers, `config.max_length` is documented as a default
# for text generation; it does not truncate the classifier's inputs, and
# tokenize() below applies no length limit — confirm this setting is intended.
config.max_length = 128

In [None]:
# Map each class index to the label name stored in the dataset's `label` feature.
id2label = {
    class_id: train.features["label"].int2str(class_id)
    for class_id in range(config.num_labels)
}
id2label

{0: 'others', 1: 'happy', 2: 'sad', 3: 'angry'}

In [None]:
# Attach the index -> label-name mapping to the model configuration.
config.id2label = id2label

In [None]:
# Inverse mapping (label name -> index), kept consistent with id2label.
config.label2id = {
    label_name: class_id
    for class_id, label_name in id2label.items()
}

In [None]:
# Seed the random number generators before the weights are drawn, so a fresh-kernel
# re-run starts from the same initialisation.
transformers.set_seed(42)
# Built from `config` alone (no from_pretrained call), so the weights are
# freshly initialised rather than loaded from a checkpoint.
model = transformers.MobileBertForSequenceClassification(config)

In [None]:
# Display the configuration the model was built with, as a sanity check.
model.config

MobileBertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_activation": true,
  "embedding_size": 128,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 512,
  "id2label": {
    "0": "others",
    "1": "happy",
    "2": "sad",
    "3": "angry"
  },
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "intra_bottleneck_size": 128,
  "key_query_shared_bottleneck": true,
  "label2id": {
    "angry": 3,
    "happy": 1,
    "others": 0,
    "sad": 2
  },
  "layer_norm_eps": 1e-12,
  "max_length": 128,
  "max_position_embeddings": 512,
  "model_type": "mobilebert",
  "normalization_type": "no_norm",
  "num_attention_heads": 4,
  "num_feedforward_networks": 4,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "trigram_input": true,
  "true_hidden_size": 128,
  "type_vocab_size": 2,
  "use_bottleneck": true,
  "use_bottleneck_attention": false,
  "vocab_size": 2016
}

### Part 3: Training

In [None]:
## No maximum length is set because the sentences in this dataset are short
def tokenize(batch):
    """Tokenize the ``text`` field of a batch with the notebook-level ``tokenizer``.

    Args:
        batch: Mapping with a ``'text'`` entry holding the raw sentences.

    Returns:
        Whatever ``tokenizer`` returns for those sentences when called with
        ``padding=True`` (no truncation is requested).
    """
    sentences = batch['text']
    return tokenizer(sentences, padding=True)

In [None]:
# Tokenize each split as ONE batch (batch_size = size of that split) so that
# padding = True pads every example of the split to the same length.
# The test split uses its own length: reusing len(train) only worked as long as
# the test split was not larger than the training split.
train_dataset = train.map(tokenize, batched = True, batch_size = len(train))
test_dataset = test.map(tokenize, batched = True, batch_size = len(test))
# Expose only the columns the model consumes, as torch tensors.
train_dataset.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])

In [None]:
def compute_metrics(pred):
    """Compute the accuracy of one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax over the last axis) and ``label_ids`` (the
            reference class ids).

    Returns:
        dict: ``{'accuracy': <score from accuracy_score>}``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
# Training hyper-parameters.
# NOTE(review): `evaluate_during_training` is the spelling used by older
# transformers releases; newer ones appear to use `evaluation_strategy` —
# keep this in sync with the installed version.
training_args = transformers.TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
)

# The test split doubles as the evaluation set; accuracy is reported through
# compute_metrics.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the training loop configured above (10 epochs).
trainer.train() 

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1885.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1885.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1885.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1885.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1885.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1885.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1885.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1885.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1885.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1885.0, style=ProgressStyle(description_w…

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…






TrainOutput(global_step=18850, training_loss=0.5102641563745962)

In [None]:
# Evaluate the trained model on the trainer's eval_dataset (the test split);
# the result includes the accuracy produced by compute_metrics.
trainer.evaluate()

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=173.0, style=ProgressStyle(description_w…




{'eval_loss': 0.42279379155939023,
 'eval_accuracy': 0.8489744055182429,
 'epoch': 10.0}

In [None]:
# Persist the trained model; Part 4 reloads it from this directory.
trainer.save_model("../models/emo-mobilebert/")

In [None]:
# Save the transformers tokenizer files into the same directory that already
# holds the trained vocab.txt; returns the paths that were written.
tokenizer.save_pretrained("../tokenizers/emo-mobilebert/")

('../tokenizers/emo-mobilebert/vocab.txt',
 '../tokenizers/emo-mobilebert/special_tokens_map.json',
 '../tokenizers/emo-mobilebert/added_tokens.json')

### Part 4: Pipeline

In [None]:
# Reload the classifier from the directory written by trainer.save_model above.
transformer = transformers.AutoModelForSequenceClassification.from_pretrained("../models/emo-mobilebert/")

In [None]:
# Reload the tokenizer from its saved directory; this rebinds `tokenizer` once more.
tokenizer = transformers.AutoTokenizer.from_pretrained("../tokenizers/emo-mobilebert/")

In [None]:
# Wrap the reloaded model and tokenizer in a classification pipeline and try it
# on a single sentence.
nlp_sentence_classif = transformers.pipeline(
    'sentiment-analysis',
    model=transformer,
    tokenizer=tokenizer,
)
nlp_sentence_classif("I've never had such a bad day in my life")

[{'label': 'sad', 'score': 0.93153977394104}]