# Travel agency's reviews - classification with BERT

Implement and evaluate a classifier of user reviews with BERT.

In [None]:
import os
os.environ['TF_USE_LEGACY_KERAS'] = '1'
!pip install transformers

In [None]:
import pandas as pd

# Tab-separated file without a header row; the two columns are named
# 'rating' and 'text' on load.
reviews_url = 'https://raw.githubusercontent.com/mlcollege/natural-language-processing/master/data/en_reviews.csv'
reviews = pd.read_csv(
    reviews_url,
    sep='\t',
    header=None,
    names=['rating', 'text'],
)

# Show rows 35-44 as a sample of the loaded frame.
reviews.iloc[35:45]

## Preparation of train and test data sets
Separate the target values (ratings) from the review texts.

In [None]:
# Labels come from the 'rating' column, model inputs from the 'text' column.
target, data = reviews['rating'], reviews['text']

# Quick look at the first five entries of each series.
print(data.head(5))
print(target.head(5))

Import the BERT model and tokenizer

In [4]:
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
# Tokenizer and classifier are both loaded from the same pretrained
# checkpoint name.
bert_tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
)

# num_labels=5 sizes the classification head for five output classes.
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=5,
)

Split the data into train and test parts.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the reviews for testing. The fixed random_state makes the
# split, and therefore every metric computed from it, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.1,
    random_state=42,
)
print('Train size: {}'.format(len(X_train)))
print('Test size: {}'.format(len(X_test)))

Tokenize the documents and create attention masks.

In [None]:
import numpy as np


def encode_documents(docs, tokenizer, max_length=64):
    """Convert raw documents into fixed-length BERT inputs.

    Args:
        docs: Iterable of strings to tokenize.
        tokenizer: Hugging Face tokenizer; it is called once on the whole
            list of documents.
        max_length: Number of tokens every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of integer arrays, one row per
        document and ``max_length`` columns each.
    """
    encoded = tokenizer(
        list(docs),
        add_special_tokens=True,
        # Replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        # Explicit truncation guarantees that no row exceeds max_length, so
        # the rows always stack into a rectangular array.
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
    )
    return np.asarray(encoded['input_ids']), np.asarray(encoded['attention_mask'])


# Same encoding settings for both splits.
train_ids, train_masks = encode_documents(X_train, bert_tokenizer)
test_ids, test_masks = encode_documents(X_test, bert_tokenizer)

print (train_ids.shape)
print (test_ids.shape)

One-hot encode the target values.

In [8]:
# Public Keras API; the previous import reached into the private
# `tensorflow.python.keras` namespace, which is not a supported entry point.
from tensorflow.keras.utils import to_categorical

n_classes = 5

# Subtracting 1 turns the ratings into zero-based class indices
# (assumes ratings run from 1 to n_classes -- TODO confirm against the data).
# NOTE: this overwrites y_train / y_test; re-run the train/test split cell
# before re-running this one, otherwise already-encoded targets get encoded again.
y_train = to_categorical(y_train - 1, n_classes)
y_test = to_categorical(y_test - 1, n_classes)

Compile the model.

In [9]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# Targets are one-hot vectors, hence categorical cross-entropy;
# from_logits=True tells the loss that it receives unnormalised scores.
loss = tf.keras.losses.CategoricalCrossentropy(
    from_logits=True,
)

# Small learning rate for fine-tuning the pretrained weights.
optimizer = Adam(
    learning_rate=2e-5,
    epsilon=1e-08,
)

bert_model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [None]:
# Fine-tune for three epochs. The held-out test split is passed as the
# validation set, so the per-epoch validation metrics are test-set metrics.
bert_model.fit(
    [train_ids, train_masks],
    y_train,
    batch_size=32,
    epochs=3,
    validation_data=([test_ids, test_masks], y_test),
)

## Evaluate the model

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Run the fine-tuned model on the encoded test split.
y_pred = bert_model.predict([test_ids, test_masks])

# Collapse one-hot targets and per-class scores to class indices.
# y_pred[0] is presumably the logits array of the model output -- TODO confirm.
y_test_class = np.argmax(y_test, axis=1)
y_pred_class = np.argmax(y_pred[0], axis=1)

test_accuracy = accuracy_score(y_test_class, y_pred_class)
print("Test accuracy: {:.4f}".format(test_accuracy))
print()

# Per-class precision / recall / F1 on the test split.
report = metrics.classification_report(y_test_class, y_pred_class, digits=4)
print(report)