# Text Classification with Deep Learning

## History

### 2020/11/3

- Explain how to fine-tune BERT for text classification with ktrain
- Change `weights` to `embeddings_initializer`
- Call `clear_session()` when creating models in a loop to avoid OOM

**References**
- [Using pre-trained word embeddings | version: Last modified: 2020/05/05](https://keras.io/examples/nlp/pretrained_word_embeddings/)
- [tf.keras.backend.clear_session](https://www.tensorflow.org/api_docs/python/tf/keras/backend/clear_session)

## Setup

In [1]:
%tensorflow_version 2.x

In [2]:
!pip install janome beautifulsoup4

Collecting janome
  Downloading Janome-0.4.1-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 53.5 MB/s 
Installing collected packages: janome
Successfully installed janome-0.4.1


In [3]:
# -p keeps the cell re-runnable when the directories already exist.
!mkdir -p data
!mkdir -p models
# -nc skips the 1.2 GB download when the archive is already present in data/.
!wget -nc https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz -P data/

--2021-08-09 06:15:48--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ja.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1279641604 (1.2G) [binary/octet-stream]
Saving to: ‘data/cc.ja.300.vec.gz’


2021-08-09 06:16:26 (32.2 MB/s) - ‘data/cc.ja.300.vec.gz’ saved [1279641604/1279641604]



### Hyper-parameters

In [4]:
# Every id sequence is padded/truncated to this many tokens.
maxlen = 300
# Vocabulary cap given to the Keras tokenizer; also the row count of the
# filtered embedding matrix.
num_words = 40000
# Number of output classes (negative / positive review).
num_label = 2

### Imports

In [6]:
import string

import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup
from janome.tokenizer import Tokenizer
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, Embedding, SimpleRNN, LSTM, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.preprocessing.sequence import pad_sequences

## The dataset

### Load the Amazon Customer Reviews Datasets

In [7]:
def filter_by_ascii_rate(text, threshold=0.9):
    """Tell whether ``text`` is mostly non-ASCII (i.e. likely Japanese).

    Args:
        text (str): the text to inspect.
        threshold (float): largest share of printable-ASCII characters that
            is still accepted.

    Returns:
        bool: True when the share of printable-ASCII characters is at most
        ``threshold``. False otherwise, and also for empty or non-string
        input (e.g. the NaN pandas uses for missing values).
    """
    # Empty or missing values contain no Japanese text; rejecting them here
    # also avoids the ZeroDivisionError / TypeError of the bare ratio below.
    if not isinstance(text, str) or not text:
        return False
    ascii_letters = set(string.printable)
    rate = sum(c in ascii_letters for c in text) / len(text)
    return rate <= threshold


def load_dataset(filename, n=5000):
    """Load the review TSV and return a binary-sentiment sample of it.

    Args:
        filename (str): path or URL of a tab-separated file that has
            ``star_rating`` and ``review_body`` columns.
        n (int): maximum number of reviews kept per sentiment class.

    Returns:
        tuple: ``(texts, labels)`` as numpy arrays. Labels are 0 for 1-2
        stars and 1 for 4-5 stars; 3-star reviews are dropped.
    """
    df = pd.read_csv(filename, sep='\t')

    # Converts multi-class to binary-class.
    mapping = {1: 0, 2: 0, 4: 1, 5: 1}
    # .copy() detaches the filtered frame, so the column assignment below
    # writes to a real frame instead of a slice (no SettingWithCopyWarning).
    df = df[df.star_rating != 3].copy()
    df['star_rating'] = df['star_rating'].map(mapping)

    # extracts Japanese texts.
    is_jp = df.review_body.apply(filter_by_ascii_rate)
    df = df[is_jp]

    # sampling: shuffle deterministically, then keep the first n rows of
    # each class.
    df = df.sample(frac=1, random_state=7)
    df = df.groupby('star_rating').head(n=n)
    return df.review_body.values, df.star_rating.values


# Gzipped TSV of the Japanese Amazon customer reviews, read straight from the URL.
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_JP_v1_00.tsv.gz'
# x: review texts, y: binary sentiment labels (default cap of 5000 reviews per class).
x, y = load_dataset(url)

### Load the word embeddings

In [8]:
def load_fasttext(filepath, binary=False):
    """Read fastText word vectors stored in the word2vec format.

    Args:
        filepath (str): location of the vector file.
        binary (bool): forwarded unchanged to gensim's
            ``load_word2vec_format``.

    Return:
        The ``KeyedVectors`` object produced by gensim.
    """
    keyed_vectors = gensim.models.KeyedVectors
    return keyed_vectors.load_word2vec_format(filepath, binary=binary)


# Load the Japanese fastText vectors downloaded in the setup cell (text format, binary=False).
wv = load_fasttext('data/cc.ja.300.vec.gz')

### Preprocess the dataset

In [9]:
# Shared Janome tokenizer used by tokenize(); created once in wakati
# (word-segmentation) mode.
t = Tokenizer(wakati=True)


def build_vocabulary(texts, num_words=None):
    """Fit a Keras text tokenizer on ``texts`` and return it.

    Args:
        texts: iterable of whitespace-separated token strings.
        num_words: passed to the Keras tokenizer as its ``num_words`` cap.

    Returns:
        The fitted ``tf.keras.preprocessing.text.Tokenizer``; unknown words
        are represented by the ``<UNK>`` token.
    """
    keras_tokenizer = tf.keras.preprocessing.text.Tokenizer
    vocab = keras_tokenizer(oov_token='<UNK>', num_words=num_words)
    vocab.fit_on_texts(texts)
    return vocab


def clean_html(html, strip=False):
    """Return the text content of ``html`` with the markup removed.

    Args:
        html (str): markup to parse with the ``html.parser`` backend.
        strip (bool): forwarded to BeautifulSoup's ``get_text``.
    """
    return BeautifulSoup(html, 'html.parser').get_text(strip=strip)


def tokenize(text):
    """Split ``text`` into tokens with the module-level tokenizer ``t``."""
    tokens = t.tokenize(text)
    return tokens


def preprocess_dataset(texts):
    """Strip HTML from each text and re-join its tokens with single spaces.

    Args:
        texts: iterable of raw review strings.

    Returns:
        list of str: one space-separated token string per input text.
    """
    cleaned = map(clean_html, texts)
    return [' '.join(tokenize(plain)) for plain in cleaned]


def filter_embeddings(embeddings, vocab, num_words, dim=300):
    """Build an embedding matrix restricted to the vocabulary.

    Args:
        embeddings: a dictionary like object mapping word -> vector.
        vocab: word-index lookup table.
        num_words: number of rows of the result; words whose index is not
            below this value are skipped.
        dim: vector dimension.

    Returns:
        numpy array: ``(num_words, dim)`` matrix in which row ``i`` holds the
        vector of the word with index ``i``; rows without a known word stay
        zero.
    """
    matrix = np.zeros((num_words, dim))
    for token, index in vocab.items():
        if index < num_words and token in embeddings:
            matrix[index] = embeddings[token]
    return matrix

In [10]:
# Clean and tokenize every review; each text becomes a space-separated token string.
x = preprocess_dataset(x)
# Hold out 20% of the data for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# The vocabulary is fitted on the training split only.
vocab = build_vocabulary(x_train, num_words)
x_train = vocab.texts_to_sequences(x_train)
x_test = vocab.texts_to_sequences(x_test)
# Pad or cut at the end so that every sequence has exactly `maxlen` ids.
x_train = pad_sequences(x_train, maxlen=maxlen, truncating='post', padding='post')
x_test = pad_sequences(x_test, maxlen=maxlen, truncating='post', padding='post')

# NOTE: `wv` is rebound from the loaded word vectors to the filtered matrix,
# so reload the vectors before running this cell a second time.
wv = filter_embeddings(wv, vocab.word_index, num_words)

## The models

### Build the models

In [11]:
class RNNModel:
    """Builder for a SimpleRNN text classifier.

    The layers are created in ``__init__`` and wired together by ``build``.
    """

    def __init__(self, input_dim, output_dim,
                 emb_dim=300, hid_dim=100,
                 embeddings=None, trainable=True):
        """Create the layers.

        Args:
            input_dim: vocabulary size; used only when ``embeddings`` is None.
            output_dim: number of units of the softmax output layer.
            emb_dim: embedding width; used only when ``embeddings`` is None.
            hid_dim: number of SimpleRNN units.
            embeddings: optional array of pre-trained vectors; when given,
                its shape defines the embedding layer and its values
                initialise the weights.
            trainable: whether the embedding weights are trainable.
        """
        self.input = Input(shape=(None,), name='input')

        embedding_kwargs = {
            'mask_zero': True,
            'trainable': trainable,
            'name': 'embedding',
        }
        if embeddings is not None:
            # A constant initializer is used here rather than
            # ``weights=[embeddings]``.
            embedding_kwargs.update(
                input_dim=embeddings.shape[0],
                output_dim=embeddings.shape[1],
                embeddings_initializer=tf.keras.initializers.Constant(embeddings),
            )
        else:
            embedding_kwargs.update(input_dim=input_dim, output_dim=emb_dim)
        self.embedding = Embedding(**embedding_kwargs)

        self.rnn = SimpleRNN(hid_dim, name='rnn')
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        """Connect input -> embedding -> SimpleRNN -> softmax and return the Model."""
        inputs = self.input
        embedded = self.embedding(inputs)
        hidden = self.rnn(embedded)
        scores = self.fc(hidden)
        return Model(inputs=inputs, outputs=scores)


class LSTMModel:
    """Builder for an LSTM text classifier.

    The layers are created in ``__init__`` and wired together by ``build``.
    """

    def __init__(self, input_dim, output_dim,
                 emb_dim=300, hid_dim=100,
                 embeddings=None, trainable=True):
        """Create the layers.

        Args:
            input_dim: vocabulary size; used only when ``embeddings`` is None.
            output_dim: number of units of the softmax output layer.
            emb_dim: embedding width; used only when ``embeddings`` is None.
            hid_dim: number of LSTM units.
            embeddings: optional array of pre-trained vectors; when given,
                its shape defines the embedding layer and its values
                initialise the weights.
            trainable: whether the embedding weights are trainable.
        """
        self.input = Input(shape=(None,), name='input')

        embedding_kwargs = {
            'mask_zero': True,
            'trainable': trainable,
            'name': 'embedding',
        }
        if embeddings is not None:
            # A constant initializer is used here rather than
            # ``weights=[embeddings]``.
            embedding_kwargs.update(
                input_dim=embeddings.shape[0],
                output_dim=embeddings.shape[1],
                embeddings_initializer=tf.keras.initializers.Constant(embeddings),
            )
        else:
            embedding_kwargs.update(input_dim=input_dim, output_dim=emb_dim)
        self.embedding = Embedding(**embedding_kwargs)

        self.lstm = LSTM(hid_dim, name='lstm')
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        """Connect input -> embedding -> LSTM -> softmax and return the Model."""
        inputs = self.input
        embedded = self.embedding(inputs)
        hidden = self.lstm(embedded)
        scores = self.fc(hidden)
        return Model(inputs=inputs, outputs=scores)


class CNNModel:
    """Builder for a 1-D convolutional text classifier.

    The layers are created in ``__init__`` and wired together by ``build``.
    """

    def __init__(self, input_dim, output_dim,
                 filters=250, kernel_size=3,
                 emb_dim=300, embeddings=None, trainable=True):
        """Create the layers.

        Args:
            input_dim: vocabulary size; used only when ``embeddings`` is None.
            output_dim: number of units of the softmax output layer.
            filters: number of convolution filters.
            kernel_size: width of the convolution window.
            emb_dim: embedding width; used only when ``embeddings`` is None.
            embeddings: optional array of pre-trained vectors; when given,
                its shape defines the embedding layer and its values
                initialise the weights.
            trainable: whether the embedding weights are trainable.
        """
        self.input = Input(shape=(None,), name='input')

        # Unlike the recurrent models, no ``mask_zero`` is set on this
        # embedding layer.
        embedding_kwargs = {'trainable': trainable, 'name': 'embedding'}
        if embeddings is not None:
            # A constant initializer is used here rather than
            # ``weights=[embeddings]``.
            embedding_kwargs.update(
                input_dim=embeddings.shape[0],
                output_dim=embeddings.shape[1],
                embeddings_initializer=tf.keras.initializers.Constant(embeddings),
            )
        else:
            embedding_kwargs.update(input_dim=input_dim, output_dim=emb_dim)
        self.embedding = Embedding(**embedding_kwargs)

        self.conv = Conv1D(filters, kernel_size,
                           padding='valid', activation='relu', strides=1)
        self.pool = GlobalMaxPooling1D()
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        """Connect input -> embedding -> conv -> global max pool -> softmax."""
        inputs = self.input
        embedded = self.embedding(inputs)
        features = self.conv(embedded)
        pooled = self.pool(features)
        scores = self.fc(pooled)
        return Model(inputs=inputs, outputs=scores)

In [12]:
# Model builders to train, in order. The training cell pairs them
# positionally with its `embeddings` list, which is why CNNModel appears
# twice (once without and once with pre-trained vectors).
models = [
    RNNModel,
    LSTMModel,
    CNNModel,
    CNNModel
]

### Train the models

In [13]:
# Checkpoint path template; `{}` is filled with the position of the model
# in `models`.
model_path = 'models/model_{}'
# One entry per model: None trains the embedding from scratch, `wv`
# initialises it with the filtered fastText vectors.
embeddings = [None, None, None, wv]
batch_size = 128
epochs = 100
# enumerate() supplies the checkpoint index instead of a hand-maintained
# counter, and `model_cls` keeps the builder class distinct from the built model.
for i, (model_cls, embedding) in enumerate(zip(models, embeddings)):
    # Release the previous model's graph/state to avoid running out of memory.
    tf.keras.backend.clear_session()
    model = model_cls(num_words, num_label, embeddings=embedding).build()
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['acc']
    )

    callbacks = [
        # Stop after 3 epochs without improvement; only the best model is
        # written to disk, and that checkpoint is what evaluation loads.
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path.format(i), save_best_only=True)
    ]

    model.fit(
        x=x_train, y=y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.2,
        callbacks=callbacks,
        shuffle=True
    )

Epoch 1/100
INFO:tensorflow:Assets written to: models/model_0/assets
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 1/100




INFO:tensorflow:Assets written to: models/model_1/assets


INFO:tensorflow:Assets written to: models/model_1/assets


Epoch 2/100




INFO:tensorflow:Assets written to: models/model_1/assets


INFO:tensorflow:Assets written to: models/model_1/assets


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 1/100
INFO:tensorflow:Assets written to: models/model_2/assets


INFO:tensorflow:Assets written to: models/model_2/assets


Epoch 2/100
INFO:tensorflow:Assets written to: models/model_2/assets


INFO:tensorflow:Assets written to: models/model_2/assets


Epoch 3/100
INFO:tensorflow:Assets written to: models/model_2/assets


INFO:tensorflow:Assets written to: models/model_2/assets


Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 1/100
INFO:tensorflow:Assets written to: models/model_3/assets


INFO:tensorflow:Assets written to: models/model_3/assets


Epoch 2/100
INFO:tensorflow:Assets written to: models/model_3/assets


INFO:tensorflow:Assets written to: models/model_3/assets


Epoch 3/100
INFO:tensorflow:Assets written to: models/model_3/assets


INFO:tensorflow:Assets written to: models/model_3/assets


Epoch 4/100
Epoch 5/100
Epoch 6/100


### Evaluate the models

In [14]:
class InferenceAPI:
    """Predict class ids from raw texts or from id sequences.

    Attributes:
        model: trained model whose ``predict`` returns per-class scores.
        vocab: fitted tokenizer providing ``texts_to_sequences``.
        preprocess: callable turning raw texts into tokenizer-ready texts.
        maxlen: optional fixed length used when padding/truncating; ``None``
            pads to the longest sequence of the batch.
    """

    def __init__(self, model, vocab, preprocess, maxlen=None):
        self.model = model
        self.vocab = vocab
        self.preprocess = preprocess
        self.maxlen = maxlen

    def predict_from_texts(self, texts):
        """Preprocess ``texts``, convert them to id sequences and predict.

        Args:
            texts: iterable of raw text strings.

        Returns:
            numpy array of predicted class ids, one per text.
        """
        x = self.preprocess(texts)
        x = self.vocab.texts_to_sequences(x)
        return self.predict_from_sequences(x)

    def predict_from_sequences(self, sequences):
        """Pad ``sequences`` and return the arg-max class id of each one.

        Args:
            sequences: list of id sequences, or an already padded 2-D array.

        Returns:
            numpy array of predicted class ids.
        """
        # Pad and truncate at the end, the same way the training data was
        # prepared; the Keras default would pad at the front instead.
        sequences = pad_sequences(sequences, maxlen=self.maxlen,
                                  padding='post', truncating='post')
        y = self.model.predict(sequences)
        return np.argmax(y, -1)

In [15]:
# Display names, in the same order as the checkpoints written during training.
model_names = ['RNN', 'LSTM', 'CNN', 'CNN(wv)']
for i, model_name in enumerate(model_names):
    # Free the previously loaded model before loading the next checkpoint.
    tf.keras.backend.clear_session()
    model = load_model(model_path.format(i))
    api = InferenceAPI(model, vocab, preprocess_dataset)
    # x_test already holds padded id sequences, so the text pipeline is skipped.
    y_pred = api.predict_from_sequences(x_test)
    print(model_name)
    print('precision\t: {:.4f}'.format(precision_score(y_test, y_pred, average='binary')))
    print('recall\t: {:.4f}'.format(recall_score(y_test, y_pred, average='binary')))
    print('f1\t: {:.4f}'.format(f1_score(y_test, y_pred, average='binary')))
    print()

RNN
precision	: 0.8055
recall	: 0.6513
f1	: 0.7202

LSTM
precision	: 0.8037
recall	: 0.8287
f1	: 0.8160

CNN
precision	: 0.7989
recall	: 0.8958
f1	: 0.8446

CNN(wv)
precision	: 0.8587
recall	: 0.8096
f1	: 0.8334



# Fine-tune BERT for Text Classification with ktrain

文量の都合上、書籍には含まれていませんが、東北大が公開している日本語のBERTをFine-tuneして文章分類をしてみましょう。現在では、BERTを使う場合、Huggingfaceの[Transformers](https://github.com/huggingface/transformers)というパッケージが使われることが多いですが、ここではよりシンプルに書ける[ktrain](https://github.com/amaiya/ktrain)を使ってみます。細かい設定はともかく、サクッと学習させたいときに便利です。

## Setup

MeCabとktrainをインストールします。MeCabはBERTのTokenizerの中で使われています。

In [16]:
!apt install aptitude
!aptitude install mecab libmecab-dev -y
!pip install mecab-python3 fugashi ipadic
!pip install ktrain

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  aptitude-common libcgi-fast-perl libcgi-pm-perl libclass-accessor-perl
  libcwidget3v5 libencode-locale-perl libfcgi-perl libhtml-parser-perl
  libhtml-tagset-perl libhttp-date-perl libhttp-message-perl libio-html-perl
  libio-string-perl liblwp-mediatypes-perl libparse-debianchangelog-perl
  libsigc++-2.0-0v5 libsub-name-perl libtimedate-perl liburi-perl libxapian30
Suggested packages:
  aptitude-doc-en | aptitude-doc apt-xapian-index debtags tasksel
  libcwidget-dev libdata-dump-perl libhtml-template-perl libxml-simple-perl
  libwww-perl xapian-tools
The following NEW packages will be installed:
  aptitude aptitude-common libcgi-fast-perl libcgi-pm-perl
  libclass-accessor-perl libcwidget3v5 libencode-locale-perl libfcgi-perl
  libhtml-parser-perl libhtml-tagset-perl libhttp-date-perl
  libhttp-message-perl libio-html-perl libio-string

### Hyper-parameters

In [17]:
# NOTE: `maxlen` and `epochs` rebind the globals of the same name used by
# the earlier section of this notebook.
maxlen = 300
# Learning rate handed to learner.fit_onecycle.
lr = 2e-5
epochs = 2
# Model identifier passed to ktrain's text.Transformer.
MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'

### Imports

In [22]:
! pip install ktrain



In [24]:
# The top-level package must be imported as well: later cells call
# ktrain.get_learner, ktrain.get_predictor and ktrain.load_predictor.
import ktrain
from ktrain import text
from sklearn.metrics import classification_report

ImportError: ignored

## The dataset

### Preprocess the dataset

前処理として、HTMLタグを除去し、テキストをBERTに入力できる形式に変換します。`text.Transformer`を使うことで、HuggingfaceのTransformersをラップして利用することができます。

In [None]:
# NOTE(review): if the earlier cell `x = preprocess_dataset(x)` was run in
# this session, `x` already holds space-joined tokens rather than raw
# reviews - confirm that `x` is reloaded before this section.
x = [clean_html(text) for text in x]
# Resulting split: 60% train / 20% validation / 20% test.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2

In [None]:
# Sorted so the class order is deterministic and position i of `classes`
# lines up with integer label i (a bare set has no guaranteed order).
classes = sorted(set(y_train))
# NOTE: this rebinds `t`, which the earlier section used for the Janome
# tokenizer behind tokenize().
t = text.Transformer(MODEL_NAME, maxlen=maxlen, class_names=classes)
trn = t.preprocess_train(x_train, y_train)
# Validation data goes through preprocess_test: preprocess_train is meant
# for the training split only.
val = t.preprocess_test(x_val, y_val)

## The models

### Build the model

In [None]:
# Ask the ktrain Transformer wrapper for a classifier built on MODEL_NAME.
model = t.get_classifier()

### Train the model

In [None]:
# Wrap the model and the preprocessed train/validation data in a ktrain learner.
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=8)
# Train for `epochs` epochs with `lr` as the learning rate given to fit_onecycle.
learner.fit_onecycle(lr, epochs)

### Evaluate the model

In [None]:
# Bundle the trained model with the preprocessor `t` so raw texts can be passed to predict().
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [None]:
# Predict on the held-out test texts.
y_pred = predictor.predict(x_test)

In [None]:
# Per-class precision / recall / F1 on the test split, printed with 4 decimals.
print(classification_report(y_test, y_pred, digits=4))

### Save and Load the model

In [None]:
# Persist the predictor under /tmp/model so it can be reloaded later.
predictor.save('/tmp/model')

In [None]:
# Restore the predictor from the directory written by predictor.save().
reloaded_predictor = ktrain.load_predictor('/tmp/model')

In [None]:
# Display one test review as a sanity-check input.
x_test[1]

In [None]:
# Predict the class of that single review with the reloaded predictor.
reloaded_predictor.predict(x_test[1])

# Text Classification with Universal Sentence Encoder on TensorFlow Hub

この例では、[TensorFlow Hub](https://tfhub.dev/)を使って文書分類をしてみましょう。TensorFlow Hubとは何かというと、機械学習モデルのリポジトリです。より噛み砕いて言うと、画像分類やテキスト分類、音声認識といったタスクに使うことのできる学習済みのモデルが置いてある場所です。TensorFlow Hubから学習済みのモデルをダウンロードすることで、そのモデルを基に自分のモデルを構築することができます。

今回は、文書分類のために、Multilingual Universal Sentence Encoder(m-USE)と呼ばれるモデルを利用することにしましょう。m-USEを使うことで、多言語の文表現を得られます。モデルの詳細は以下から確認することができます。

- [Multilingual Universal Sentence Encoder | TensorFlow Hub](https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3)

## Setup

m-USEは、入力の処理を[TensorFlow Text](https://github.com/tensorflow/text)に依存しています。そのため、事前にTensorFlow Textをインストールしておく必要があります。

In [None]:
!pip install tensorflow-text

### Imports

In [None]:
import string

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
from bs4 import BeautifulSoup
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import load_model, Model

### Resource

TensorFlow Hubを使って、m-USEをKerasのレイヤーとしてロードします。`trainable`パラメータによって、学習する際に重みを更新するか否かを決定します。今回はm-USEの重みも更新することにします。

In [None]:
# TensorFlow Hub handle of the multilingual Universal Sentence Encoder (large, v3).
model_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'
# trainable=True: the encoder weights are updated during training as well.
use_layer = hub.KerasLayer(model_url, trainable=True)

## The dataset

### Preprocess the dataset

前処理としては以下の2つを行います。
- HTMLの除去
- テキストの切り詰め

本来はトークン数でテキストを切り詰めるべきだと思いますが、今回は文字数で切り詰めてしまいます。m-USEのインターフェースの都合上、トークン数で切り詰めるのが容易ではなさそうなためです。

In [None]:
# Strip HTML markup from every review.
# NOTE(review): relies on `x` and clean_html() from the earlier sections; if
# `x` was already tokenized/cleaned there, this runs on that processed text -
# confirm `x` holds the raw reviews.
x = [clean_html(text) for text in x]

In [None]:
# Hold out 20% of the data for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
def create_input(input_strings, max_seq_length):
    """Cut every string down to its first ``max_seq_length`` characters.

    Args:
        input_strings: iterable of strings.
        max_seq_length (int): number of leading characters to keep.

    Returns:
        list of str: the truncated strings, in input order.
    """
    head = slice(None, max_seq_length)
    return [raw[head] for raw in input_strings]

In [None]:
# Character (not token) budget per review.
max_seq_length = 300
train_inputs = create_input(x_train, max_seq_length=max_seq_length)
# Built from the held-out test split; used for the final evaluation.
validation_inputs = create_input(x_test, max_seq_length=max_seq_length)

## The model

### Build the model

In [None]:
def get_model(use_layer, num_labels, rate=0.1):
    """Stack dropout and a softmax classifier on top of ``use_layer``.

    Args:
        use_layer: layer applied to a batch of scalar strings.
        num_labels (int): number of units of the softmax output layer.
        rate (float): dropout rate applied to the output of ``use_layer``.

    Returns:
        tf.keras.Model: model taking a batch of strings and returning
        per-class scores.
    """
    layers = tf.keras.layers
    text_input = layers.Input(shape=[], dtype=tf.string)
    encoded = use_layer(text_input)
    regularised = layers.Dropout(rate=rate)(encoded)
    scores = layers.Dense(units=num_labels, activation='softmax')(regularised)
    return tf.keras.Model(inputs=[text_input], outputs=scores)


# Binary sentiment: two output classes.
num_labels = 2
model = get_model(
    use_layer,
    num_labels=num_labels
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

### Train the model

In [None]:
epochs = 100
batch_size = 16
# NOTE: same directory as the one the ktrain predictor is saved to earlier in
# this notebook; the checkpoint written here overwrites it.
save_path = '/tmp/model'

model.compile(
    optimizer='sgd',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(
    x=np.array(train_inputs), y=y_train,
    # Pass the batch size defined above; without it fit() silently falls
    # back to the Keras default instead of the intended 16.
    batch_size=batch_size,
    validation_split=0.2,
    epochs=epochs,
    callbacks=[
        # Stop after 3 epochs without val_loss improvement and keep only the
        # best checkpoint on disk.
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3),
        tf.keras.callbacks.ModelCheckpoint(
            filepath=save_path,
            monitor='val_loss',
            save_best_only=True,
            mode='min'
        )
    ],
    shuffle=True
)

### Load the trained model

In [None]:
# Reload the best checkpoint written by ModelCheckpoint during training.
model = load_model(save_path)

### Evaluate the model

In [None]:
# Wrap the list of strings in an array, exactly as the training inputs are
# fed to fit(), so it is treated as one batch of texts rather than as a
# Python list of separate inputs.
y_pred = model.predict(np.array(validation_inputs), batch_size=batch_size)

In [None]:
# Convert the per-class scores to class ids and print precision / recall / F1.
print(classification_report(y_test, np.argmax(y_pred, axis=-1), digits=4))