# Pretrained BERT model

Use a pretrained BERT model from HuggingFace as a fixed feature extractor and fit only the classifier layer on top of it.

https://github.com/huggingface/transformers/blob/master/notebooks/02-transformers.ipynb

In [95]:
import pandas as pd
from transformers import BertTokenizer

import re

import logging

# Configure the root logger so only WARNING-and-above messages are emitted.
logging.basicConfig(level=logging.WARNING)

In [127]:
# Load the IMDB reviews CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/IMDB Dataset.csv')

# Number of leading rows of the dataset kept when building X and y.
SAMPLE_SIZE = 2000

def preprocess_imdb_raw_data(x):
    """Replace each HTML line-break tag in a raw review with a single space.

    Matches lowercase ``<br>``, ``<br/>`` and ``<br />`` (any amount of
    whitespace before the optional slash). All other text is left untouched.

    Args:
        x: Raw review text.

    Returns:
        The review text with every matched tag replaced by one space.
    """
    line_break_tag = "<br\\s*/?>"
    return re.sub(line_break_tag, " ", x)

# Slice to the first SAMPLE_SIZE reviews *before* cleaning, so the regex is not
# run over the whole dataset only to throw most of the results away.
X = [preprocess_imdb_raw_data(x) for x in df['review'].values[:SAMPLE_SIZE]]

# Binary labels for the same rows: 1 where sentiment is 'positive', else 0.
y = df['sentiment'].apply(lambda x: int(x == 'positive')).values[:SAMPLE_SIZE]

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Using a transformers pipeline
Without any additional training

In [97]:
from transformers import pipeline

# Build a ready-made sentiment-analysis pipeline. No model name is given, so the
# library picks the model for this task (presumably its default — verify which
# checkpoint the installed transformers version downloads).
nlp_sentence_classif = pipeline('sentiment-analysis')

In [98]:
# Classify each review individually and keep the lower-cased label of the
# first result the pipeline returns for it.
predicted_sentiment = []
for review in X:
    first_result = nlp_sentence_classif(review)[0]
    predicted_sentiment.append(first_result['label'].lower())

In [99]:
from sklearn.metrics import classification_report

# True where the pipeline labelled the review 'positive', False otherwise.
y_pred = [label == 'positive' for label in predicted_sentiment]

# Compare the pipeline's labels against the ground truth for the sampled reviews.
report = classification_report(y, y_pred)
print(f"Test: {report}")

Test:               precision    recall  f1-score   support

           0       0.87      0.95      0.90       115
           1       0.92      0.80      0.86        85

    accuracy                           0.89       200
   macro avg       0.89      0.87      0.88       200
weighted avg       0.89      0.89      0.88       200



## Using last pooled layer

In [128]:
import torch
from transformers import AutoTokenizer, BertTokenizer
from transformers import TFBertModel

# Globally turn off PyTorch gradient tracking.
# NOTE(review): this only affects PyTorch autograd; the model imported in this
# cell is TFBertModel (TensorFlow), so this call may be unnecessary — confirm.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x17c76a8d0>

In [129]:
# Name of the pretrained checkpoint to load
MODEL_NAME = "bert-base-cased"

# Load the tokenizer and the TensorFlow BERT model from that same checkpoint name
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model_tf = TFBertModel.from_pretrained(MODEL_NAME)

In [130]:
# Maximum sequence length handed to the tokenizer.
MAX_SEQ_LENGTH = 100

# Encode all sampled reviews in one call and request TensorFlow tensors.
# NOTE(review): no padding/truncation option is passed explicitly. Whether
# reviews are padded/truncated to a common length (required to build a single
# tensor) depends on the installed transformers version — confirm.
tokens = tokenizer.batch_encode_plus(X, 
                                     max_length=MAX_SEQ_LENGTH, 
                                     return_tensors='tf')

In [None]:
# Run every encoded review through BERT in a single call.
# NOTE(review): the two-value unpacking assumes the model returns a
# (sequence output, pooled output) pair — confirm for the installed version.
outputs, pooled = model_tf(tokens)
pooled.shape

In [None]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras import losses

def make_simple_model(embedding_size=768):
    """Build and compile a single-layer binary classifier over fixed-size vectors.

    The network is one Dense unit with a sigmoid activation applied directly to
    the input vector. It is compiled with the "adam" optimizer, binary
    cross-entropy loss and an accuracy metric, and its summary is printed.

    Args:
        embedding_size: Length of each input feature vector (default 768).

    Returns:
        The compiled Keras ``Model``.
    """
    inp = Input(shape=[embedding_size])

    out = Dense(1, activation="sigmoid")(inp)

    model = Model(inp, out)
    # summary() prints the layer table itself and returns None, so wrapping it
    # in print() would emit an extra "None" line.
    model.summary()

    model.compile("adam", loss=losses.binary_crossentropy, metrics=['accuracy'])

    return model

model_clf = make_simple_model()

In [None]:
# Fit the classifier on the precomputed `pooled` features. BERT is not part of
# `model_clf` (it contains only the Dense layer), so BERT's weights are not
# updated here.
model_clf.fit(pooled, y, epochs=5)

In [None]:
# Predict with the fitted classifier `model_clf`. `model` is only a local name
# inside make_simple_model and is not defined at notebook level, so calling
# `model.predict` here raises NameError on a fresh kernel.
y_train_probs = model_clf.predict(x=pooled)
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions.
y_train_pred = (y_train_probs >= 0.5).astype(int)

print(f"Train: {classification_report(y, y_train_pred)}")