In [27]:
import os
import sys
import tensorflow as tf
from transformers import (
    BertTokenizer,
    TFBertForSequenceClassification, 
    BertForSequenceClassification, 
    glue_convert_examples_to_features
)
import tensorflow_datasets as tfds

import numpy as np
np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(linewidth=1000) 

# Using BERT

Google [BERT](https://github.com/google-research/bert) Github is obsolete (only works with TF 1.0).

Use [Hugging Face](https://huggingface.co/transformers/installation.html).

## Installation

* (https://huggingface.co/transformers/installation.html#with-conda)

> Since Transformers version v4.0.0, we now have a conda channel: huggingface.

```
conda install -c huggingface transformers
```

* [conda-forge / packages / transformers](https://anaconda.org/conda-forge/transformers)

> State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch

## Pretrained models

* [Pretrained models](https://huggingface.co/transformers/pretrained_models.html)

In [5]:
!conda install -c conda-forge transformers -y

Collecting package metadata (current_repodata.json): done
Solving environment: \ 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/noarch::tensorboard==2.4.0=pyhc547734_0
  - defaults/linux-64::tensorflow-base==2.4.1=mkl_py38h43e0292_0
  - defaults/noarch::tensorflow-estimator==2.4.1=pyheb71bc4_0
  - defaults/linux-64::tensorflow==2.4.1=mkl_py38hb2083e0_0
done


  current version: 4.8.2
  latest version: 4.10.1

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/oonisim/conda/envs/python_programs

  added / updated specs:
    - transformers


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _libgcc_mutex-0.1          |      conda_forge           3 KB  conda-forge
    _openmp_mutex-4.5          |            1_gnu         

In [6]:
!python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

Downloading: 100%|██████████████████████████████| 629/629 [00:00<00:00, 355kB/s]
Downloading: 100%|███████████████████████████| 268M/268M [01:07<00:00, 3.94MB/s]
2021-05-21 15:46:40.180256: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-05-21 15:46:40.181120: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-05-21 15:46:40.183650: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2021-05-21 15:46:40.239274: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the f

---

# Tokenizer

In [10]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [21]:
sentences = [
    "We are very happy to show you the 🤗 Transformers library.", 
    "We hope you don't hate it."
]
tokens = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="tf"
)
print(tokens)

{'input_ids': <tf.Tensor: shape=(2, 14), dtype=int32, numpy=
array([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996,
          100, 19081,  3075,  1012,   102],
       [  101,  2057,  3246,  2017,  2123,  1005,  1056,  5223,  2009,
         1012,   102,     0,     0,     0]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(2, 14), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 14), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]], dtype=int32)>}


In [43]:
for sentence, words in zip(sentences, tokens['input_ids']):
    print(f"sentence:[{sentence}]\nwords:[{words}]\n")

tokenizer.decode(tokens['input_ids'][0])

sentence:[We are very happy to show you the 🤗 Transformers library.]
words:[[  101  2057  2024  2200  3407  2000  2265  2017  1996   100 19081  3075  1012   102]]

sentence:[We hope you don't hate it.]
words:[[ 101 2057 3246 2017 2123 1005 1056 5223 2009 1012  102    0    0    0]]



'[CLS] we are very happy to show you the [UNK] transformers library. [SEP]'

## Encoding

Convert the words into word indices (tokens).

In [51]:
indices = tokenizer.encode(
    sentences[0],
    padding=True,
    truncation=True,
)
for word, index in zip(sentences[0].split(), indices):
    print(f"word:[{word:16s}] index:[{index:8d}]")

word:[We              ] index:[     101]
word:[are             ] index:[    2057]
word:[very            ] index:[    2024]
word:[happy           ] index:[    2200]
word:[to              ] index:[    3407]
word:[show            ] index:[    2000]
word:[you             ] index:[    2265]
word:[the             ] index:[    2017]
word:[🤗               ] index:[    1996]
word:[Transformers    ] index:[     100]
word:[library.        ] index:[   19081]


## Decoding

Convert the token (index) into the word.

In [44]:
tokenizer.decode(indices)

'[CLS] we are very happy to show you the [UNK] transformers library. [SEP]'