# Pretrained BERT model

Use a pretrained BertModel from HuggingFace and fit only the classifier layers.

https://github.com/huggingface/transformers/blob/master/notebooks/02-transformers.ipynb

Download distilbert model:
* https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-tf_model.h5
* https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-cased-config.json

In [1]:
import logging
import re

import numpy as np
import pandas as pd
from transformers import BertTokenizer

# Configure the root logger so that only WARNING and more severe records are shown.
logging.basicConfig(level=logging.WARNING)

Creating the embeddings for 2000 records takes about 3 minutes. Assuming linear scaling, converting all 50,000 reviews would take about 75 minutes. Unfortunately, doing it in one go leads to a dead kernel in the tokenize step, so we need to process the data in batches to run this on a local machine.

In [10]:
# Upper bound on how many reviews (and labels) are kept for the experiments below.
SAMPLE_SIZE = 50_000

# Raw IMDB reviews; the cells below read the 'review' and 'sentiment' columns.
df = pd.read_csv('../data/IMDB Dataset.csv')

def preprocess_imdb_raw_data(x):
    """Return the review text with each HTML line-break tag replaced by one space.

    Matches ``<br>``, ``<br/>`` and ``<br />`` (any whitespace before the
    optional slash). The match is case-sensitive; all other text is untouched.
    """
    line_break_tag = re.compile("<br\\s*/?>")
    return line_break_tag.sub(" ", x)

# Clean every review text, then keep the first SAMPLE_SIZE of them.
X = list(map(preprocess_imdb_raw_data, df['review'].values))[:SAMPLE_SIZE]

# Binary target: 1 where the sentiment is 'positive', 0 otherwise.
y = df['sentiment'].map(lambda label: int(label == 'positive')).values[:SAMPLE_SIZE]

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Using a transformers pipeline
Without any additional training

In [39]:
from transformers import pipeline

# No model is named here, so the pipeline uses the library's default checkpoint
# for the 'sentiment-analysis' task (the cell output shows it being downloaded).
nlp_sentence_classif = pipeline('sentiment-analysis')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




In [41]:
from sklearn.model_selection import train_test_split

# Use the same test set as before: hold out 20% of the reviews, with a fixed
# random_state so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [42]:
# Run the pipeline one review at a time and keep the lower-cased label of the
# first result returned for each review.
predicted_sentiment = list(
    map(lambda review: nlp_sentence_classif(review)[0]['label'].lower(), X_test)
)

In [43]:
from sklearn.metrics import classification_report

# Booleans: True where the pipeline label is 'positive'; scored against the 0/1 labels.
y_pred = [label == 'positive' for label in predicted_sentiment]

print(f"Test: {classification_report(y_true=y_test, y_pred=y_pred)}")

Test:               precision    recall  f1-score   support

           0       0.87      0.92      0.89      5044
           1       0.91      0.86      0.88      4956

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



# Pre-trained BertModel

In [46]:
import torch
from transformers import AutoTokenizer, BertTokenizer
from transformers import TFBertModel, BertModel

# Switch off gradient tracking globally: the PyTorch BERT model in this notebook
# is only run forward to produce embeddings.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x142171080>

Q: Can you use the tokenizer from a different model?

Q: Distilbert also takes around 3 minutes to create the embeddings. What efficiency gain could we have expected?

In [48]:
# Checkpoint identifier shared by the tokenizer and the model below.
MODEL_NAME = "bert-base-cased" 

# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# Disabled alternative: point MODEL_NAME at a local DistilBERT directory
# (see the download links at the top of the notebook).
# MODEL_NAME = "../models/distilbert-base-cased"

# PyTorch BERT encoder used to produce the embeddings.
model_pt = BertModel.from_pretrained(MODEL_NAME)

# Put the model in evaluation mode; as the cell's last expression this also
# displays the module tree.
model_pt.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
# Maximum number of tokens requested from the tokenizer for each text.
MAX_SEQ_LENGTH = 200

def get_sentence_embedding(text):
    """Embed one text as the mean of BERT's last-layer token vectors.

    Relies on the notebook-level ``tokenizer``, ``model_pt`` and
    ``MAX_SEQ_LENGTH``.

    Parameters
    ----------
    text : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        1-D array: the last hidden state averaged over the token dimension.
    """
    tokenized_text = tokenizer.encode(text,
                                      return_tensors='pt',
                                      max_length=MAX_SEQ_LENGTH)

    with torch.no_grad():
        # Take the first output by index instead of tuple-unpacking: newer
        # transformers releases return a dict-like ModelOutput, and unpacking
        # that yields its keys rather than the tensors. Indexing works for
        # both the plain tuple and the ModelOutput.
        last_hidden_states = model_pt(tokenized_text)[0]

    # Use the mean of the sentence embeddings (average over the token axis),
    # flattened to a 1-D vector.
    return torch.mean(last_hidden_states, dim=1).numpy().reshape(-1)

# Stack the per-review vectors row-wise into an (n_reviews, embedding_size) matrix.
# np.concatenate would join the 1-D vectors end to end into a single flat array.
embeddings = np.vstack([
  get_sentence_embedding(text)
  for text in X
])

## Generate sentence embeddings per batch
**FIXME:** this section contains an error and does not yet produce proper results.

In [31]:
def generate_embeddings(X, batch_size):
    """Yield BERT pooled-output embeddings for ``X`` one batch at a time.

    Relies on the notebook-level ``tokenizer``, ``MAX_SEQ_LENGTH`` and
    ``model_tf``.
    NOTE(review): ``model_tf`` is not defined in any visible cell; presumably
    it is a ``TFBertModel`` loaded from the same checkpoint as the tokenizer.
    Confirm before running.

    Parameters
    ----------
    X : sequence of str
        Texts to embed; sliced into consecutive chunks of ``batch_size``.
    batch_size : int
        Number of texts tokenized and passed to the model per step.

    Yields
    ------
    tuple of (int, numpy.ndarray)
        The start offset of the batch within ``X`` and the model's second
        output (named ``pooled`` here) for that batch as a NumPy array.
    """
    for i in range(0, len(X), batch_size):

        tokens = tokenizer.batch_encode_plus(X[i:i+batch_size],
                                             max_length=MAX_SEQ_LENGTH,
                                             pad_to_max_length=True,
                                             return_tensors='tf')
        # Select the second output by index instead of tuple-unpacking: newer
        # transformers releases return a dict-like ModelOutput, and unpacking
        # that yields its keys rather than the tensors.
        pooled = model_tf(tokens)[1]

        yield i, np.array(pooled)

In [None]:
# Write each 5000-text batch to its own .npy file, named after the batch's start offset.
for i, embedding in generate_embeddings(X, batch_size=5000):
    np.save(file=f'../models/bert-model/bert_pooled_layer_{i}.npy', arr=embedding)

### Load embeddings

In [44]:
import os
from sklearn.preprocessing import Normalizer

# Read the per-batch files back in offset order (step 5000) and join them
# along the first axis.
embeddings = np.concatenate(
    [np.load(f'../models/bert-model/bert_pooled_layer_{i}.npy')
     for i in range(0, SAMPLE_SIZE, 5000)],
    axis=0,
)

In [45]:
# Rescale every embedding row to unit L2 norm.
normalized_embeddings = Normalizer(norm='l2').fit_transform(embeddings)

In [46]:
# Same split settings as for the raw reviews (test_size=0.2, random_state=1), now applied
# to the normalized embeddings. This rebinds X_train, X_test, y_train and y_test.
X_train, X_test, y_train, y_test = train_test_split(normalized_embeddings, y, test_size=0.2, random_state=1)

## Model for last clf layer

In [47]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras import losses
from tensorflow.keras.optimizers import Adam

def make_simple_model(embedding_size=768):
    """Build and compile a small binary classifier for fixed-size embeddings.

    Architecture: input of ``embedding_size`` features -> Dense(128, relu)
    -> Dense(1, sigmoid), compiled with Adam (learning rate 5e-3),
    binary cross-entropy loss and an accuracy metric.

    Parameters
    ----------
    embedding_size : int, default 768
        Length of each input embedding vector.

    Returns
    -------
    tensorflow.keras.models.Model
        The compiled model. Its summary is printed as a side effect.
    """
    inp = Input(shape=(embedding_size,))

    x = Dense(128, activation="relu")(inp)

    out = Dense(1, activation="sigmoid")(x)

    model = Model(inp, out)

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()

    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(Adam(learning_rate=5e-3), loss=losses.binary_crossentropy, metrics=['accuracy'])

    return model

model_clf = make_simple_model()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 768)]             0         
_________________________________________________________________
dense_13 (Dense)             (None, 128)               98432     
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 129       
Total params: 98,561
Trainable params: 98,561
Non-trainable params: 0
_________________________________________________________________
None


In [48]:
# Train the classifier on the training split of the embeddings for 150 epochs.
model_clf.fit(x=X_train, y=y_train, epochs=150)

Train on 40000 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150


Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 79/150
Epoch 80/150
Epoch 81/150
Epoch 82/150
Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Ep

Epoch 150/150


<tensorflow.python.keras.callbacks.History at 0x1a5252a58>

## Validation

In [49]:
from sklearn.metrics import classification_report

# Model outputs for the held-out embeddings.
y_test_probs = model_clf.predict(X_test)
# Predict class 1 when the output reaches the 0.5 threshold, class 0 otherwise.
y_test_pred = (y_test_probs >= 0.5).astype(int)

print(f"Test: {classification_report(y_true=y_test, y_pred=y_test_pred)}")

Test:               precision    recall  f1-score   support

           0       0.79      0.53      0.63      5044
           1       0.64      0.86      0.73      4956

    accuracy                           0.69     10000
   macro avg       0.72      0.69      0.68     10000
weighted avg       0.72      0.69      0.68     10000



In [50]:
from sklearn.metrics import classification_report

# Model outputs for the embeddings the classifier was trained on.
y_train_probs = model_clf.predict(X_train)
# Predict class 1 when the output reaches the 0.5 threshold, class 0 otherwise.
y_train_pred = (y_train_probs >= 0.5).astype(int)

print(f"Train: {classification_report(y_true=y_train, y_pred=y_train_pred)}")

Train:               precision    recall  f1-score   support

           0       0.80      0.55      0.65     19956
           1       0.66      0.87      0.75     20044

    accuracy                           0.71     40000
   macro avg       0.73      0.71      0.70     40000
weighted avg       0.73      0.71      0.70     40000



# Train with own weights
https://github.com/huggingface/transformers/issues/400#issuecomment-571354368

~~~py
for w in model.bert.weights():
        w._trainable = False
~~~

Train on a GPU, use pytorch https://mccormickml.com/2019/07/22/BERT-fine-tuning/#11-using-colab-gpu-for-training