<a href="https://colab.research.google.com/github/perlatomdpi/NLP/blob/main/BERT_Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **BERT Tokenizer**
Example of tokenization using BERT



# **Import dependencies**


In [1]:
import numpy as np
import math
import re # advanced text pre-processing
import pandas as pd
from bs4 import BeautifulSoup # encode text
import random

from google.colab import drive # get data from drive

In [2]:
# packages related to BERT
!pip install bert-for-tf2 
!pip install sentencepiece # requirement for tf2

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/18/d3/820ccaf55f1e24b5dd43583ac0da6d86c2d27bbdfffadbba69bafe73ca93/bert-for-tf2-0.14.7.tar.gz (41kB)
[K     |████████                        | 10kB 22.9MB/s eta 0:00:01[K     |████████████████                | 20kB 19.5MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 11.3MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 9.5MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.7MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [

In [7]:
# Select TensorFlow 2.x before TensorFlow itself is imported.
# `%tensorflow_version` is a line magic supplied by the Colab runtime; the
# try/except lets the cell continue if invoking it raises (presumably on
# runtimes where the magic is unavailable — TODO confirm).
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub # used below to load the pre-trained BERT model (and its weights)
from tensorflow.keras import layers # Keras layers, intended for building CNN layers
import bert # used below for its tokenizer class (bert.bert_tokenization.FullTokenizer)

# **Data pre-processing**


In [8]:
# Attach Google Drive to the runtime's filesystem so the dataset stored there
# can be read like a local file.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Column names for the raw CSV; the file is read with header=None, so pandas
# takes the names from this list.
# "date" is listed because the following cell drops it. If the file has more
# columns than names, pandas turns the extra leading columns into the index and
# every name lands on the wrong data.
# NOTE(review): assumes the CSV has exactly these six columns in this order
# (the Sentiment140 layout) — confirm against the file.
cols = ["sentiment", "id", "date", "query", "user", "text"]

# The path is absolute (leading "/") because Drive is mounted at /content/drive.
# The "..." segment looks like a placeholder for the folders inside Drive;
# replace it with the real location.
data = pd.read_csv(
    "/content/drive/My Drive/.../BERT/data/train.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)

In [None]:
# Remove the metadata columns; the cells below only read `sentiment` and `text`.
# inplace=True mutates `data` directly, so no reassignment is needed and the
# slimmer frame is what every later cell sees.
data.drop(columns=["id", "date", "query", "user"], inplace=True)

# **Cleaning**

In [None]:
def clean_tweet(tweet):
    """Turn a raw tweet into plain text that can be split into words.

    Steps, in order:
      1. Parse the tweet as HTML and keep only its text content.
      2. Remove @mentions (letters, digits and underscores after the "@").
      3. Remove http/https URLs.
      4. Replace every character that is not an ASCII letter or one of
         ``. ! ?`` with a space.
      5. Collapse runs of spaces into a single space and trim both ends.

    Removed pieces are replaced by a space rather than by an empty string, so
    neighbouring words never get glued together; step 5 then cleans up the
    extra spaces.

    Args:
        tweet: The raw tweet text (str).

    Returns:
        The cleaned tweet as a str; may be empty.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # "_" is included so a handle such as @bob_smith is removed as a whole.
    tweet = re.sub(r"@[A-Za-z0-9_]+", " ", tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", " ", tweet)
    # Digits, other punctuation, tabs and newlines all become spaces here.
    tweet = re.sub(r"[^A-Za-z.!?]", " ", tweet)
    # Keep one space between words; the tokenizer relies on word boundaries.
    tweet = re.sub(r" +", " ", tweet)
    return tweet.strip()

In [None]:
# Run every entry of the "text" column through clean_tweet, keeping the order.
data_clean = list(map(clean_tweet, data["text"]))

In [None]:
# Take an independent NumPy copy of the sentiment column. `.values` can hand
# back a view of the DataFrame's own buffer, in which case the in-place remap
# below would also rewrite `data` (or fail on a read-only array when pandas
# Copy-on-Write is active).
data_labels = data.sentiment.to_numpy(copy=True)
# In the data, 4 marks a positive tweet; remap it to 1.
data_labels[data_labels == 4] = 1

# **Tokenization**
We have to create a BERT layer to get access to the metadata the tokenizer needs, i.e. the vocabulary (and hence the vocab size).

In [None]:
# Tokenizer class provided by the bert module.
FullTokenizer = bert.bert_tokenization.FullTokenizer

# Load the pre-trained BERT model from the hub as a Keras layer.
#   - L-12 in the handle: 12 encoder layers.
#   - trainable=False: BERT's weights are not fine-tuned here.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False,
)

# The loaded model carries the settings the tokenizer has to be built with.
resolved_model = bert_layer.resolved_object
vocab_file = resolved_model.vocab_file.asset_path.numpy()  # vocabulary file path, as a numpy value
do_lower_case = resolved_model.do_lower_case.numpy()       # lower-casing flag, as a numpy value

# Build the tokenizer from the model's own vocabulary and casing setting.
tokenizer = FullTokenizer(vocab_file, do_lower_case)