In [1]:
#import bert
#import bert.run_classifier, bert.optimization, bert.tokenization
from official.nlp.tools import tokenization
import fasttext
import numpy as np
import official.nlp
import pandas as pd
import seaborn as sns
import sklearn
import sklearn.linear_model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
import sklearn.multioutput
import tensorflow_text as text  # tf registers ops on import
import tensorflow as tf
from tensorflow.keras.regularizers import L2
import tensorflow_hub as hub
import transformers

from retrain_bert import settings
from retrain_bert.preprocessor import load_labels

In [2]:
#help("official.nlp.modeling.models.bert_classifier")

In [3]:
ft_model = fasttext.load_model(str(settings.PROJECT_DIR / "models/fasttext_model.bin"))
ft_model.get_nearest_neighbors("COCA")



[(0.9224011898040771, '+COCA'),
 (0.9198992848396301, 'CCOCA'),
 (0.9144041538238525, 'COCA~COLA'),
 (0.9136794805526733, '6COCA'),
 (0.9041818976402283, '*COCA'),
 (0.9007999897003174, '&COCA'),
 (0.8999505639076233, 'MCOCA'),
 (0.8999376893043518, 'C0CA'),
 (0.8985378742218018, 'COCA_COLA'),
 (0.8983514308929443, 'COCA-0LA')]

In [4]:
# linear = sklearn.linear_model.RidgeClassifier()

In [5]:
train_data, val_data = train_test_split(pd.read_csv(settings.PROJECT_DIR / "data/train/train.csv"))
labels = load_labels()

In [6]:
train_data["OcrValue"]

15073         CHOCO NEGRO ALM
24025      AQUARIUS LLIM FRED
18719      ANILLA ROMANA 400G
17753     PALLA REUTILITZABLE
5842           ESPINACS AMB C
                 ...         
36629     SALCHICHAS FRANFURT
14725      CUÑA QUESO CYO 350
145         SARDINA MED 25/40
14053             TOMATE RAMA
17743    SABÓ REN PLATS ULTRA
Name: OcrValue, Length: 29642, dtype: object

In [7]:
# train_vectors = train_data["OcrValue"].apply(ft_model.get_word_vector)
# train_vectors = pd.DataFrame.from_records(train_vectors.to_list()).values
# train_vectors[:5, :5]

In [8]:
def encode_label(label_ids, labels_table=labels):
    one_hot = np.zeros(len(labels_table))
    one_hot[label_ids] = 1
    return one_hot

In [9]:
train_targets = train_data.drop("OcrValue", axis="columns").apply(encode_label, axis="columns")
train_targets = pd.DataFrame.from_records(train_targets.tolist()).values

val_targets = val_data.drop("OcrValue", axis="columns").apply(encode_label, axis="columns")
val_targets = pd.DataFrame.from_records(val_targets.tolist()).values

In [10]:
# cross_val_score(linear, train_vectors, train_targets, cv=3)

In [11]:
# linear.fit(train_vectors, train_targets)

In [12]:
labels_conf = []
level_start = 0
level_end = 0
for level in range(settings.DEEPEST_LEVEL):
    level_end = level_start + len(labels.loc[level + 1])
    labels_conf.append({
        "level": level + 1,
        "start": level_start,
        "end": level_end,
        "num_classes": len(labels.loc[level + 1])
    })
    level_start = level_end

In [13]:
labels_conf

[{'level': 1, 'start': 0, 'end': 5, 'num_classes': 5},
 {'level': 2, 'start': 5, 'end': 16, 'num_classes': 11},
 {'level': 3, 'start': 16, 'end': 44, 'num_classes': 28},
 {'level': 4, 'start': 44, 'end': 75, 'num_classes': 31},
 {'level': 5, 'start': 75, 'end': 94, 'num_classes': 19}]

In [14]:
def get_level_labels(labels, labels_table=labels):
    """Separates labels by levels
    
    Args:
        labels (pd.Series): one hot encoded labels to separate
    """
    level_start = 0
    level_end = 0
    for level in range(settings.DEEPEST_LEVEL):
        level_end = level_start + len(labels_table.loc[level + 1])
        #print(level, level_start, level_end, pred[:, level_start:level_end], true[:, level_start:level_end])
        yield labels[:, level_start:level_end]
        level_start = level_end

In [15]:
# def level_accuracy(pred, true, labels_table=labels):
#     level_start = 0
#     level_end = 0
#     for level in range(settings.DEEPEST_LEVEL):
#         level_end = level_start + len(labels_table.loc[level + 1])
#         #print(level, level_start, level_end, pred[:, level_start:level_end], true[:, level_start:level_end])
#         yield sklearn.metrics.accuracy_score(pred[:, level_start:level_end], true[:, level_start:level_end])
#         level_start = level_end

In [16]:
# list(level_accuracy(linear.predict(train_vectors), train_targets))

In [17]:
# linears = [sklearn.linear_model.LogisticRegression(max_iter=15000, C=1) for _ in range(settings.DEEPEST_LEVEL)]

# for model, targets in zip(linears, get_level_labels(train_targets)):
#     break
#     targets = np.argmax(targets, axis=1)
#     max_label = np.max(targets)
#     mask = targets != max_label
#     model.fit(train_vectors[mask], targets[mask])
#     print(model.score(train_vectors[mask], targets[mask]), mask.mean())

In [18]:
# BERT

# tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
# model = transformers.TFBertModel.from_pretrained("bert-base-multilingual-uncased")

In [19]:
class ReplaceWithSynonims(tf.keras.layers.Layer):
    def __init__(self, fasttext_model, replace_proba=.2, **kwargs):
        super().__init__(**kwargs)
        self.fasttext_model = fasttext_model
        self.replace_proba = replace_proba
    
    def __call__(self, sentences, training=False, **kwargs):
        if not training:
            return sentences
        sentences = tf.strings.split(sentences, sep=" ")
        sentences = tf.map_fn(
            lambda x: tf.cond(tf.random.uniform([]) < self.replace_proba,
            lambda: self.__replace_with_a_synonim(x), lambda: x),
            sentences
        )
        sentences = tf.strings.join(sentences, sep=" ")
        return sentences
        
    def __replace_with_a_synonim(self, word):
        synonyms = self.fasttext_model.get_nearest_neighbors(word)
        if len(synonyms) > 0:
            return np.random.choice(synonyms)
        return word

In [20]:
text_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
# TODO: reduce seq_length (128 is an overkill)
# https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3
data_augmentation = ReplaceWithSynonims(ft_model)
#tokenizer_inputs = data_augmentation(text_inputs)
lowercase = tf.strings.lower(text_inputs)
tokenizer_inputs = lowercase
tokenizer = hub.KerasLayer(
   "https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3",
    #arguments={"seq_length": 32}
)
encoder_inputs = tokenizer(tokenizer_inputs)
encoder = hub.KerasLayer(
   "https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4",
    trainable=False
)
encoder_outputs = encoder(encoder_inputs)
pooled_output = encoder_outputs["pooled_output"]
X = tf.keras.layers.Dense(settings.DEEPEST_LEVEL * settings.DEEPEST_LEVEL, activation="relu")(pooled_output)
X = tf.keras.layers.Dense(settings.DEEPEST_LEVEL * settings.DEEPEST_LEVEL, activation="relu")(X)
X = tf.keras.layers.Dense(settings.DEEPEST_LEVEL * settings.DEEPEST_LEVEL, activation="relu")(X)
X = tf.keras.layers.Dense(settings.DEEPEST_LEVEL * settings.DEEPEST_LEVEL, activation="relu")(X)

sequence_output = encoder_outputs["sequence_output"]
sequence_output = tf.keras.layers.Dropout(0.2)(sequence_output)
sequence_output = tf.keras.layers.Conv1D(
    128,
    7,
#    kernel_regularizer=L2(1e-3)
)(sequence_output)
sequence_output = tf.keras.layers.Flatten()(sequence_output)

X = tf.keras.layers.Concatenate()([X, sequence_output])
X = tf.keras.layers.Dropout(.2)(X)

heads = []
for level in range(settings.DEEPEST_LEVEL):
    head = tf.keras.layers.Dense(
        labels_conf[level]["num_classes"],
        activation="softmax",
        name=f"level_{level}",
        kernel_regularizer=L2(1e-5)
    )(X)
    X = tf.keras.layers.Concatenate()([X, head])
    heads.append(head)

model = tf.keras.Model(text_inputs, heads)

2022-08-30 18:10:31.099808: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-30 18:10:31.152138: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-30 18:10:31.152355: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-30 18:10:31.153127: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [21]:
vocab_file = encoder.resolved_object.vocab_file.asset_path.numpy() #The vocab file of bert for tokenizer

In [22]:
vocab_file = vocab_file.decode("utf-8")
vocab_file

'/tmp/tfhub_modules/26807cff8c00c2271e22a6b31b83988c4bfe6528/assets/vocab.txt'

In [23]:
with open(vocab_file, "r", encoding="utf-8") as f:
    vocab = [line.strip() for line in f]

hg_tokenizer = transformers.BertTokenizer(vocab_file, do_lower_case=False)

In [24]:
hg_tokenizer

PreTrainedTokenizer(name_or_path='', vocab_size=119547, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [25]:
labels = [
    train_targets[:, labels_conf[l]["start"]:labels_conf[l]["end"]]
    for l in range(settings.DEEPEST_LEVEL)
]
val_labels = [
    val_targets[:, labels_conf[l]["start"]:labels_conf[l]["end"]]
    for l in range(settings.DEEPEST_LEVEL)
]

train_sample_weigths = [np.ones(len(train_targets)) for _ in range(settings.DEEPEST_LEVEL)]
for level, level_labels in enumerate(labels):
    unknown_label = labels_conf[level]["num_classes"] - 1
    train_sample_weigths[level][level_labels[:, unknown_label] == 1] = 0

model.compile("adam", "categorical_crossentropy", metrics=["accuracy"])
model.fit(
    train_data.OcrValue,
    labels,
    batch_size=128,
    epochs=3,
    sample_weight=train_sample_weigths,
    validation_data=(val_data.OcrValue, val_labels),
)

Epoch 1/3


2022-08-30 18:10:53.696349: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8401


 40/232 [====>.........................] - ETA: 4:46 - loss: 16.2956 - level_0_loss: 2.5159 - level_1_loss: 3.6579 - level_2_loss: 4.9872 - level_3_loss: 3.7860 - level_4_loss: 1.3464 - level_0_accuracy: 0.4754 - level_1_accuracy: 0.3168 - level_2_accuracy: 0.1764 - level_3_accuracy: 0.2217 - level_4_accuracy: 0.1348

In [None]:
raise NotImplementedError()

In [None]:
for  layer in model.layers:
    layer.trainable = True

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

model.fit(train_data.OcrValue, labels, batch_size=32, epochs=3, validation_data=(val_data.OcrValue, val_labels))

Epoch 1/3


ERROR:absl:hub.KerasLayer is trainable but has zero trainable weights.
2022-08-30 17:55:42.864502: E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:288] gpu_async_0 cuMemAllocAsync failed to allocate 12582912 bytes: CUDA error: out of memory (CUDA_ERROR_OUT_OF_MEMORY)
 Reported by CUDA: Free memory/Total memory: 21954560/6222970880
2022-08-30 17:55:42.864525: E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:293] Stats: Limit:                      4857987072
InUse:                      5348182599
MaxInUse:                   5348182599
NumAllocs:                     8662956
MaxAllocSize:                367248384
Reserved:                            0
PeakReserved:                        0
LargestFreeBlock:                    0

2022-08-30 17:55:42.864568: E tensorflow/core/common_runtime/gpu/gpu_cudamallocasync_allocator.cc:56] Histogram of current allocation: (allocation_size_in_bytes, nb_allocation_of_that_sizes), ...;
2022-08-30 17:55:42.86457

ResourceExhaustedError: Graph execution error:

OOM when allocating tensor with shape[32,12,128,64] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator gpu_async_0
	 [[{{node transformer/layer_6/self_attention/einsum_1/Einsum}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_115971]

In [None]:
val_preds = model.predict(val_data.OcrValue, batch_size=128)



In [None]:
val_preds_ids = [
    np.argmax(preds, axis=1) for preds in val_preds
]

In [None]:
train_preds = model.predict(train_data.OcrValue, batch_size=128)



In [None]:
val_labels_ids = [ np.argmax(labels, axis=1) for labels in val_labels ]

In [None]:
def level_specific_accuracy(preds, true, level):
    return sklearn.metrics.accuracy_score(preds > 0.5, true, sample_weight=1 - true[:, -1])

[level_specific_accuracy(preds, true, level) for level, preds, true in zip(range(settings.DEEPEST_LEVEL), val_preds, val_labels)]

[0.7844347738083189,
 0.7338056680161943,
 0.6173252279635258,
 0.5221003301736075,
 0.5364622760534948]

In [None]:
[level_specific_accuracy(preds, true, level) for level, preds, true in zip(range(settings.DEEPEST_LEVEL), train_preds, labels)]

[0.9231495850482424,
 0.9383265856950067,
 0.9352953101770509,
 0.8644007788989202,
 0.8962358427714857]

In [None]:
raise NotImplementedError()

In [None]:
labels_conf

[{'level': 1, 'start': 0, 'end': 5, 'num_classes': 5},
 {'level': 2, 'start': 5, 'end': 16, 'num_classes': 11},
 {'level': 3, 'start': 16, 'end': 44, 'num_classes': 28},
 {'level': 4, 'start': 44, 'end': 75, 'num_classes': 31},
 {'level': 5, 'start': 75, 'end': 94, 'num_classes': 19}]

In [None]:
raise NotImplementedError()

In [None]:
linears = [sklearn.linear_model.LogisticRegression(max_iter=15000, C=1) for _ in range(settings.DEEPEST_LEVEL)]

for model, targets in zip(linears, get_level_labels(train_targets)):
    targets = np.argmax(targets, axis=1)
    max_label = np.max(targets)
    mask = targets != max_label
    model.fit(train_vectors[mask], targets[mask])
    print(model.score(train_vectors[mask], targets[mask]), mask.mean())