In [1]:
#import bert
#import bert.run_classifier, bert.optimization, bert.tokenization
from official.nlp.tools import tokenization
import fasttext
import numpy as np
import pandas as pd
import sklearn
import sklearn.linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import sklearn.multioutput
import tensorflow_text as text  # tf registers ops on import
import tensorflow as tf
import tensorflow_hub as hub
import transformers
import official.nlp

from retrain_bert import settings
from retrain_bert.preprocessor import load_labels

In [2]:
#help("official.nlp.modeling.models.bert_classifier")

In [3]:
# ft_model = fasttext.train_unsupervised(str(settings.PROJECT_DIR / "data/train/ocr_values.txt"))
#ft_model.get_nearest_neighbors("COCA")

In [4]:
linear = sklearn.linear_model.RidgeClassifier()

In [5]:
train_data, val_data = train_test_split(pd.read_csv(settings.PROJECT_DIR / "data/train/train.csv"))
labels = load_labels()

In [6]:
train_data["OcrValue"]

8298                    GREC LLEUGER
29231    QUESO UNTAR PHILADELPHIA 20
22444               SALCHICHON DE PA
13689            MAIZ DULCE CARREFOU
7049                  P INT B SAL/SU
                    ...             
32673             NATA COCINA LIGERA
7799             PEPINILLOS PEQUEÑOS
34494          FREEWAY/ICE TEA LIMÓN
36380                 ROLLON EXTRA D
25065               SOPINSTANT POLLO
Name: OcrValue, Length: 29642, dtype: object

In [7]:
# train_vectors = train_data["OcrValue"].apply(ft_model.get_word_vector)
# train_vectors = pd.DataFrame.from_records(train_vectors.to_list()).values
# train_vectors[:5, :5]

In [8]:
def encode_label(label_ids, labels_table=labels):
    one_hot = np.zeros(len(labels_table))
    one_hot[label_ids] = 1
    return one_hot

In [9]:
train_targets = train_data.drop("OcrValue", axis="columns").apply(encode_label, axis="columns")
train_targets = pd.DataFrame.from_records(train_targets.tolist()).values

In [10]:
# cross_val_score(linear, train_vectors, train_targets, cv=3)

In [11]:
# linear.fit(train_vectors, train_targets)

In [12]:
labels_conf = []
level_start = 0
level_end = 0
for level in range(settings.DEEPEST_LEVEL):
    level_end = level_start + len(labels.loc[level + 1])
    labels_conf.append({
        "level": level + 1,
        "start": level_start,
        "end": level_end,
        "num_classes": len(labels.loc[level + 1])
    })
    level_start = level_end

In [13]:
labels_conf

[{'level': 1, 'start': 0, 'end': 5, 'num_classes': 5},
 {'level': 2, 'start': 5, 'end': 16, 'num_classes': 11},
 {'level': 3, 'start': 16, 'end': 44, 'num_classes': 28},
 {'level': 4, 'start': 44, 'end': 75, 'num_classes': 31},
 {'level': 5, 'start': 75, 'end': 94, 'num_classes': 19}]

In [14]:
def get_level_labels(labels, labels_table=labels):
    """Separates labels by levels
    
    Args:
        labels (pd.Series): one hot encoded labels to separate
    """
    level_start = 0
    level_end = 0
    for level in range(settings.DEEPEST_LEVEL):
        level_end = level_start + len(labels_table.loc[level + 1])
        #print(level, level_start, level_end, pred[:, level_start:level_end], true[:, level_start:level_end])
        yield labels[:, level_start:level_end]
        level_start = level_end

In [15]:
# def level_accuracy(pred, true, labels_table=labels):
#     level_start = 0
#     level_end = 0
#     for level in range(settings.DEEPEST_LEVEL):
#         level_end = level_start + len(labels_table.loc[level + 1])
#         #print(level, level_start, level_end, pred[:, level_start:level_end], true[:, level_start:level_end])
#         yield sklearn.metrics.accuracy_score(pred[:, level_start:level_end], true[:, level_start:level_end])
#         level_start = level_end

In [16]:
# list(level_accuracy(linear.predict(train_vectors), train_targets))

In [17]:
# linears = [sklearn.linear_model.LogisticRegression(max_iter=15000, C=1) for _ in range(settings.DEEPEST_LEVEL)]

# for model, targets in zip(linears, get_level_labels(train_targets)):
#     break
#     targets = np.argmax(targets, axis=1)
#     max_label = np.max(targets)
#     mask = targets != max_label
#     model.fit(train_vectors[mask], targets[mask])
#     print(model.score(train_vectors[mask], targets[mask]), mask.mean())

In [18]:
# BERT

# tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
# model = transformers.TFBertModel.from_pretrained("bert-base-multilingual-uncased")

In [19]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
print("Downloading preprocessor")
preprocessor = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3"
)
print("Downloading Bert")
encoder_inputs = preprocessor(text_input)
encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4",
    trainable=True
)
outputs = encoder(encoder_inputs)
pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]
embedding_model = tf.keras.Model(text_input, pooled_output)
print("Embedding model ready")

sentences = tf.constant(train_data.OcrValue)
print(embedding_model(sentences[:10]))

Downloading preprocessor


2022-08-23 12:18:20.205058: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-23 12:18:20.273559: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-23 12:18:20.273887: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-23 12:18:20.274700: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Downloading Bert
Embedding model ready
tf.Tensor(
[[ 0.425459   -0.10762922  0.17411175 ... -0.19871248  0.2508777
   0.32016873]
 [ 0.3679686   0.00953042  0.15352617 ... -0.27434888  0.20082268
   0.31470606]
 [ 0.17154132 -0.10462623 -0.00532867 ... -0.13786131  0.20510909
   0.3654451 ]
 ...
 [ 0.39986557 -0.18243173  0.14306602 ... -0.41894498  0.33462557
   0.38243207]
 [ 0.22366148 -0.12727991  0.05482608 ... -0.14929196  0.2768532
   0.35954604]
 [ 0.28140125 -0.09750124  0.06395062 ... -0.23479459  0.26293826
   0.3070255 ]], shape=(10, 768), dtype=float32)


In [20]:
class BertClassifier(tf.keras.Model):
    def __init__(self, labels_conf=labels_conf):
        super().__init__()
        self.text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
        self.tokenizer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3",
            #arguments={"seq_length": 32}
        )
        self.encoder = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4",
            trainable=True
        )
        self.labels_conf = labels_conf
        self.heads = [
            tf.keras.layers.Dense(c["num_classes"], activation="softmax") for c in labels_conf
        ]

    def __call__(self, sentences, training=False):
        X = self.tokenizer(self.text_input)
        X = self.text_input(sentences)
        X = self.tokenizer(X)
        X = self.encoder(X)
        level_0 = self.heads[0](X["pooled_output"])
        return level_0


In [26]:
text_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
# TODO: reduce seq_length (128 is an overkill)
# https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3
tokenizer = hub.KerasLayer(
   "https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3",
    #arguments={"seq_length": 32}
)
encoder_inputs = tokenizer(text_inputs)
encoder = hub.KerasLayer(
   "https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4",
    trainable=True
)
encoder_outputs = encoder(encoder_inputs)
pooled_output = encoder_outputs["pooled_output"]
level_0 = tf.keras.layers.Dense(labels_conf[0]["num_classes"], activation="softmax")(pooled_output)
model = tf.keras.Model(text_inputs, level_0)

TypeError: Cannot iterate over a Tensor with unknown first dimension.

In [None]:
level0_labels = train_targets[:, labels_conf[0]["start"]:labels_conf[0]["end"]]
model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])
model.fit(train_data.OcrValue[:512], level0_labels[:512], epochs=1)

2022-08-23 12:19:05.533846: W tensorflow/core/common_runtime/bfc_allocator.cc:479] Allocator (GPU_0_bfc) ran out of memory trying to allocate 48.00MiB (rounded to 50331648)requested by op Adam/gradients/PartitionedCall/gradients/StatefulPartitionedCall_grad/PartitionedCall/gradients/StatefulPartitionedCall_grad/PartitionedCall/gradients/bert_encoder/StatefulPartitionedCall_grad/PartitionedCall/gradients/transformer/layer_6/activation/Gelu/Pow_grad/Pow
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2022-08-23 12:19:05.533966: I tensorflow/core/common_runtime/bfc_allocator.cc:1027] BFCAllocator dump for GPU_0_bfc
2022-08-23 12:19:05.533998: I tensorflow/core/common_runtime/bfc_allocator.cc:1034] Bin (256): 	Total Chunks: 150, Chunks in use: 147. 37.5KiB allocated for chunks. 36.8KiB in use in bin. 1.1KiB client-requested in use in 

ResourceExhaustedError: Graph execution error:

Detected at node 'gradients/transformer/layer_6/activation/Gelu/Pow_grad/Pow' defined at (most recent call last):
    File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/traitlets/config/application.py", line 976, in launch_instance
      app.start()
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib/python3.10/asyncio/base_events.py", line 600, in run_forever
      self._run_once()
    File "/usr/lib/python3.10/asyncio/base_events.py", line 1896, in _run_once
      handle._run()
    File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2881, in run_cell
      result = self._run_cell(
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2936, in _run_cell
      return runner(coro)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3135, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3338, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_2242175/870469832.py", line 3, in <cell line: 3>
      model.fit(train_data.OcrValue[:512], level0_labels[:512], epochs=1)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/engine/training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/engine/training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/engine/training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/engine/training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/engine/training.py", line 893, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 537, in minimize
      grads_and_vars = self._compute_gradients(
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 590, in _compute_gradients
      grads_and_vars = self._get_gradients(tape, loss, var_list, grad_loss)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/optimizers/optimizer_v2/optimizer_v2.py", line 471, in _get_gradients
      grads = tape.gradient(loss, var_list, grad_loss)
Node: 'gradients/transformer/layer_6/activation/Gelu/Pow_grad/Pow'
failed to allocate memory
	 [[{{node gradients/transformer/layer_6/activation/Gelu/Pow_grad/Pow}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_128134]

In [None]:
labels_conf

[{'level': 1, 'start': 0, 'end': 5, 'num_classes': 5},
 {'level': 2, 'start': 5, 'end': 16, 'num_classes': 11},
 {'level': 3, 'start': 16, 'end': 44, 'num_classes': 28},
 {'level': 4, 'start': 44, 'end': 75, 'num_classes': 31},
 {'level': 5, 'start': 75, 'end': 94, 'num_classes': 19}]

In [None]:
bc = BertClassifier()
bc.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
bc.fit(train_data.OcrValue[:100], level0_labels[:100], batch_size=128, epochs=10)

Epoch 1/10


TypeError: in user code:

    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/engine/training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/engine/training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/engine/training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "/home/julius/lp/algori/retrain_bert/venv/lib/python3.10/site-packages/keras/engine/training.py", line 889, in train_step
        y_pred = self(x, training=True)
    File "/tmp/ipykernel_2033894/2948954739.py", line 19, in __call__
        X = self.text_input(sentences)

    TypeError: 'KerasTensor' object is not callable


In [None]:
raise NotImplementedError()

In [None]:
linears = [sklearn.linear_model.LogisticRegression(max_iter=15000, C=1) for _ in range(settings.DEEPEST_LEVEL)]

for model, targets in zip(linears, get_level_labels(train_targets)):
    targets = np.argmax(targets, axis=1)
    max_label = np.max(targets)
    mask = targets != max_label
    model.fit(train_vectors[mask], targets[mask])
    print(model.score(train_vectors[mask], targets[mask]), mask.mean())