# __🗃️ USPPM Model Training Tensorflow__

---
### <a href='#hyperparameters'> ⚙️ Hyperparameters </a> | <a href='#data-factory'> ⚒ Data Factory </a>  | <a href='#training'> ⚡ Training </a> 

In [1]:
# Sync Notebook with VS Code #
!pip install -q transformers==4.10.0 datasets
!git clone https://github.com/sarthak-314/fast-nlp
import sys; sys.path.append('fast-nlp')

from src import *
from src.tflow import *

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.7.0 requires transformers<4.10,>=4.1, but you have transformers 4.10.0 which is incompatible.[0m
Cloning into 'fast-nlp'...
remote: Enumerating objects: 68, done.[K
remote: Counting objects: 100% (68/68), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 68 (delta 28), reused 59 (delta 19), pack-reused 0[K
Unpacking objects: 100% (68/68), done.


2022-06-15 13:22:32.664155: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-06-15 13:22:32.664349: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Notebook running on [34mKaggle[0m on [34mTPU[0m
Installing omeaconf




# ⚙️ Hyperparameters ⚙️
---
### <a href='#data-factory'> ⚒ Data Factory </a>  | <a href='#model'> 🧠 Model </a>|  <a href='#training'> ⚡ Training </a> 

<a name='hyperparameters'>

In [2]:
%%hyperparameters

## Huggingface Backbone ##
backbone_name: 'anferico/bert-for-patents'
attention_dropout_prob: 0.10
backbone_weights: ['bert_for_patents_mlm25_ep20_loss5997.h5', 'usppm_mlm/runs/17zo3zpa']

## Model Architecture ##
max_seq_len: 128
hidden_layer_units: [256, 16]
hidden_layer_dropout: 0.10
hidden_layer_activation: 'gelu'

## Model Training ##
max_epochs: 4
train_steps_multiply: 5

train_batch_size: 512
eval_batch_size: 4096

## Cosine Decay LR Scheduler ##
warmup_epochs: 1.03125
warmup_power: 1.0

max_lr: 1e-4
min_lr: 0.0
decay_epochs: 3.0
step_gamma: 1.0
lr_gamma: 1.00

## AdamW Optimizer ## 
max_weight_decay: 1e-6
max_grad_norm: 1.00
beta_1: 0.9
beta_2: 0.98
epsilon: 1e-6
average_decay: 0.90

## Random Perturbation Training ##
rpt_noise_epsilon: 1e-2
consistency_loss_weight: 1.00

## Data Factory ##
validation_anchors: 70

In [3]:
PROMPT_FORMAT = \
"""
Match the phrase {target} to the anchor {anchor} in the context of {context} in patent documents.
Targets for anchor in {context}: {target}, {common_targets_in_context}.
All targets for anchor: {all_targets}
"""

In [4]:
tqdm.pandas()

def load_model_weights(model, model_weights): 
    if model_weights is None: 
        return
    weights_file, run_name = model_weights
    print(f'Loading model weights from {green(weights_file)}')
    weights_path = wandb.restore(weights_file, run_name).name
    model.load_weights(weights_path)

def build_hidden_layer(hidden_layer_units=[], hidden_dropout=0.10, activation_str='mish', name='hidden_layer'): 
    if not hidden_layer_units:
        return tf.keras.layers.Lambda(lambda x: x)
    activation_fn = {'mish': tfa.activations.mish, None: None, 'gelu': tf.keras.activations.gelu}[activation_str]
    hidden_layers = []
    for units in hidden_layer_units: 
        hidden_layers.append(tf.keras.layers.Dropout(hidden_dropout))
        hidden_layers.append(tf.keras.layers.Dense(units=units, activation=activation_fn))
    return tf.keras.Sequential(hidden_layers, name=name)

from src.tflow.factory import CosineDecayRestarts, WarmUp
def adamw_optimizer_factory(HP, lr_scheduler): 
    optimizer = tfa.optimizers.AdamW(
        beta_1=HP.beta_1, 
        beta_2=HP.beta_2, 
        epsilon=HP.epsilon, 
        weight_decay=HP.max_weight_decay, 
        clipnorm=HP.max_grad_norm,
        learning_rate=lr_scheduler,
    )
    if HP.average_decay > 0: 
        print(f'Using EMA with decay {blue(HP.average_decay)}')
        optimizer = tfa.optimizers.MovingAverage(
            optimizer, 
            average_decay=HP.average_decay, 
            dynamic_decay=True, 
        )
    return optimizer

def lr_scheduler_factory(HP, train_steps):
    lr_scheduler = CosineDecayRestarts(
        HP.max_lr, 
        int(HP.decay_epochs*train_steps)+1, 
        HP.step_gamma, 
        HP.lr_gamma, 
        HP.min_lr/HP.max_lr, 
    )
    lr_scheduler = WarmUp(
        warmup_lr=HP.max_lr, 
        lr_scheduler=lr_scheduler, 
        warmup_steps=int(train_steps*HP.warmup_epochs)-1, 
        power=HP.warmup_power, 
    )
    return lr_scheduler

def get_model_average(model, weight_files): 
    model_weights = []
    for weight_file in tqdm(weight_files): 
        model.load_weights(weight_file)
        model_weights.append(model.get_weights())
    
    model.set_weights([
        (sum(w)/len(weight_files))
        for w in zip(*model_weights)
    ])
    del model_weights; _ = gc.collect()
    return model

In [5]:
!wandb login '3b335317f20548af7e3b941d09a6de9f1736bd8d'
STRATEGY = tf_accelerator(bfloat16=True, jit_compile=True)

with STRATEGY.scope():
    backbone = TFAutoModel.from_pretrained(
        HP.backbone_name,
        attention_probs_dropout_prob=HP.attention_dropout_prob,
        from_pt=True,
    )
    load_model_weights(backbone, HP.backbone_weights)
tokenizer = AutoTokenizer.from_pretrained(HP.backbone_name)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


2022-06-15 13:23:00.868334: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-06-15 13:23:00.871286: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-06-15 13:23:00.871326: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-06-15 13:23:00.871358: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (107098b5be62): /proc/driver/nvidia/version does not exist
2022-06-15 13:23:00.874529: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

Running on TPU: grpc://10.0.0.2:8470
Running on 8 replicas
Mixed precision enabled


Downloading:   0%|          | 0.00/327 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'bert.embeddings.position_ids', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

Loading model weights from [32mbert_for_patents_mlm25_ep20_loss5997.h5[0m


Downloading:   0%|          | 0.00/329k [00:00<?, ?B/s]

# ⚒ Data Factory 


---
### <a href='#hyperparameters'> ⚙ Hyperparameters </a>  | <a href='#model'> 🧠 Model </a> | <a href='#training'> ⚡ Training </a>

<a name='data-factory'>
   

In [6]:
df = pd.read_csv('/kaggle/input/validation-strategy-stratified-group-k-fold/folds.csv')
df['cpc_code'] = df.context

# Merge Information about CPC codes
titles = pd.read_csv('/kaggle/input/cpc-codes/titles.csv')
cpc_code_to_context = titles.set_index('code').to_dict()['title']
df['context'] = df.cpc_code.map(cpc_code_to_context)

anchor_dfs = {anchor: df[df.anchor==anchor] for anchor in df.anchor.unique()}
def _common_targets_in_context(row):
    anchor_df = anchor_dfs[row.anchor]
    ', '.join(anchor_df[anchor_df==row.context].target.apply(str).sample(frac=1.))
df['all_targets'] = df.apply(lambda row: ', '.join(anchor_dfs[row.anchor].target), axis=1)
df['common_targets_in_context'] = df.progress_apply(_common_targets_in_context, axis=1)
del anchor_dfs, titles; gc.collect()

df['text_string'] = df.apply(
    lambda row: PROMPT_FORMAT.format(
        anchor=row.anchor, 
        target=row.target, 
        context=row.context, 
        all_targets=row.all_targets,
        common_targets_in_context=row.common_targets_in_context,
    ), axis=1
)
display(df.sample(frac=1.0))

valid_anchors = random.sample(list(df.anchor.unique()), HP.validation_anchors)
valid = df[df.anchor.isin(valid_anchors)]
train = df[~df.anchor.isin(valid_anchors)]

  0%|          | 0/36473 [00:00<?, ?it/s]

8

Unnamed: 0,id,anchor,target,context,score,score_map,anchor_map,fold,cpc_code,all_targets,common_targets_in_context,text_string
2093,f1a43e47a7aedc4b,arcuate means,support means,WEAVING,0.25,1,46,3,D03,"including means, including vibrator, upwardly,...",,\nMatch the phrase support means to the anchor...
32811,45d1d9d0e38223a0,target pointer,counter pointer,COMPUTING; CALCULATING; COUNTING,0.50,2,661,1,G06,"aim laser, aim pointer, aiming pointer, circul...",,\nMatch the phrase counter pointer to the anch...
18882,00e38c37fb74faf3,machine end,unit end,EARTH DRILLING; MINING,0.75,3,387,0,E21,"appliance end, base, end plate, end user, illu...",,\nMatch the phrase unit end to the anchor mach...
2357,336deea873686250,auxiliary water,sub auxiliary water,TREATMENT OF TEXTILES OR THE LIKE; LAUNDERING;...,0.50,2,52,1,D06,"additional, auxiliary, auxiliary reservoir, hy...",,\nMatch the phrase sub auxiliary water to the ...
27302,f3917e6cedc53f18,reflection type liquid crystal display,reflection type display device,OPTICS,0.50,2,555,0,G02,"bright crystal device, bright crystal display,...",,\nMatch the phrase reflection type display dev...
...,...,...,...,...,...,...,...,...,...,...,...,...
34744,ba6c321a20c92ebd,vacuum cups,cups,BUILDING,0.50,2,702,2,E04,"use vacuum cups, air fryer, air suction, apert...",,\nMatch the phrase cups to the anchor vacuum c...
24048,c0fd977ce3e834b7,perform working operations,milling operation,MECHANICAL METAL-WORKING WITHOUT ESSENTIALLY R...,0.25,1,484,1,B21,"brain working recursive therapy, execute opera...",,\nMatch the phrase milling operation to the an...
19644,244a635ac4dedac7,mayenite,mayenite,CASTING; POWDER METALLURGY,1.00,4,402,4,B22,"alumina, aluminate, aluminate medicine tablets...",,\nMatch the phrase mayenite to the anchor maye...
14278,e326ff6c5f019dff,fused layer,skin layer,WORKING OF PLASTICS; WORKING OF SUBSTANCES IN ...,0.00,0,289,0,B29,"capacitor plate, extruder, eye layer, foam pan...",,\nMatch the phrase skin layer to the anchor fu...


In [7]:
%%time
def convert_to_features(example_batch): 
    tokenized_examples = tokenizer(
        example_batch['text_string'], 
        max_length=HP.max_seq_len, 
        padding='max_length', 
        truncation=True,
    )
    tokenized_examples['label'] = example_batch['score']
    return tokenized_examples

def df_to_dataset(df, convert_to_features): 
    raw_dataset = datasets.Dataset.from_pandas(df)
    processed_dataset = raw_dataset.map(
        convert_to_features, 
        batched=True, batch_size=4096, num_proc=1, 
        remove_columns=raw_dataset.column_names, 
        desc='Running tokenizer on raw dataset'
    )
    print(f'{blue(len(df))} examples and {blue(len(processed_dataset))} features found in')
    return processed_dataset

train_dataset = df_to_dataset(train, convert_to_features)
valid_dataset = df_to_dataset(valid, convert_to_features)

(np.array(train_dataset['attention_mask'])[:, -1] == 0).sum() / len(train_dataset)

Running tokenizer on raw dataset:   0%|          | 0/8 [00:00<?, ?ba/s]

[34m32715[0m examples and [34m32715[0m features found in


Running tokenizer on raw dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

[34m3758[0m examples and [34m3758[0m features found in
CPU times: user 1min 7s, sys: 369 ms, total: 1min 8s
Wall time: 23.2 s


0.05557083906464924

In [8]:
def convert_dataset_to_tfds(dataset):
    dataset.set_format(type='numpy')
    model_inputs = {
        'input_ids': dataset['input_ids'].astype(np.int32), 
        'attention_mask': dataset['attention_mask'].astype(np.int32),
    }
    input_ds = tf.data.Dataset.from_tensor_slices(model_inputs)
    
    model_outputs = {'label': dataset['label'].astype(np.float32)}
    output_ds = tf.data.Dataset.from_tensor_slices(model_outputs)
    ds = tf.data.Dataset.zip((input_ds, output_ds))
    return ds

def dataset_to_tfds(dataset, dataset_type, batch_size): 
    ds = convert_dataset_to_tfds(dataset)
    if dataset_type == 'train': 
        ds = ds.shuffle(len(dataset), reshuffle_each_iteration=True).repeat()
    elif dataset_type == 'valid': 
        ds = ds.cache()
    ds = ds.batch(batch_size)
    steps = len(dataset)//batch_size+1
    return ds.prefetch(tf.data.AUTOTUNE), steps

train_ds, train_steps = dataset_to_tfds(train_dataset, 'train', HP.train_batch_size)
valid_ds, valid_steps = dataset_to_tfds(valid_dataset, 'valid', HP.eval_batch_size)
train_steps *= HP.train_steps_multiply

## 🧠 Model Factory
---
#### <a href='#training'> ⚡ Training </a>

<a name='model-factory'>

In [9]:
class USPatentModel(tf.keras.Model): 
    def __init__(self, inputs, outputs): 
        super().__init__(inputs=inputs, outputs=outputs)
        
        self.metrics_tracker = {
            'pearson_coef': tf.keras.metrics.Mean(name='pearson_coef'), 
            'x': tf.keras.metrics.Mean(name='x'), 
            'dx': tf.keras.metrics.Mean(name='dx'),
            'aug_pearson_coef': tf.keras.metrics.Mean(name='aug_pearson_coef'), 
            'consistency_loss': tf.keras.metrics.Mean(name='consistency_loss'), 
            'total_loss': tf.keras.metrics.Mean(name='total_loss'),
            'gradient_norm': tf.keras.metrics.Mean(name='gradient_norm'), 
        }
    
    def mse_loss_fn(self, x, y):
        return tf.math.reduce_mean(tf.math.abs(x-y)**2)
    
    def pearson_correlation(self, x, y): 
        dx = x - tf.math.reduce_mean(x)
        dy = y - tf.math.reduce_mean(y)
        return tf.math.reduce_mean(dx*dy) / (tf.math.reduce_std(dx) * tf.math.reduce_std(dy))
    
    def forward_pass(self, input_ids, attention_mask, noise_std=1e-8): 
        batch_size = tf.shape(input_ids)[0]
        inputs_embeds = tf.cast(backbone.layers[0].embeddings(input_ids), tf.float32)
        noise = tf.random.normal(shape=(batch_size, HP.max_seq_len, backbone.config.hidden_size), stddev=noise_std)
        inputs_embeds = inputs_embeds + noise
        return self((inputs_embeds, attention_mask), training=True)
    
    @tf.function
    def train_step(self, data): 
        x, y = data
        with tf.GradientTape() as tape: 
            y_pred = self.forward_pass(x['input_ids'], x['attention_mask'])
            y_pred['label'] = tf.cast(y_pred['label'], tf.float32)
            pearson_coef = self.pearson_correlation(y_pred['label'], y['label'])
            
            # Noised Forward Pass #
            y_pred_aug = self.forward_pass(x['input_ids'], x['attention_mask'], HP.rpt_noise_epsilon)
            y_pred_aug['label'] = tf.cast(y_pred_aug['label'], tf.float32)
            aug_pearson_coef = self.pearson_correlation(y_pred_aug['label'], y['label'])
            
            consistency_pearson_loss = 1 - self.pearson_correlation(y_pred_aug['label'], y_pred['label'])
            pearson_loss = 1 - pearson_coef
            loss = pearson_loss + HP.consistency_loss_weight * consistency_pearson_loss
        
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        gradient_norm = tf.linalg.global_norm(gradients)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        
        self.metrics_tracker['x'].update_state(tf.reduce_mean(y_pred['label']))
        self.metrics_tracker['dx'].update_state(tf.math.reduce_std(y_pred['label']))
        self.metrics_tracker['pearson_coef'].update_state(pearson_coef)
        self.metrics_tracker['aug_pearson_coef'].update_state(aug_pearson_coef)
        self.metrics_tracker['consistency_loss'].update_state(consistency_pearson_loss)
        self.metrics_tracker['total_loss'].update_state(loss)
        self.metrics_tracker['gradient_norm'].update_state(gradient_norm)
        return {m.name: m.result() for m in self.metrics}
    
    
    def test_step(self, data):
        x, y = data
        y_pred = self.forward_pass(x['input_ids'], x['attention_mask'])
        y_pred['label'] = tf.cast(y_pred['label'], tf.float32)
        pearson_coef = self.pearson_correlation(y_pred['label'], y['label'])
        pearson_loss = 1 - pearson_coef
        
        self.metrics_tracker['x'].update_state(tf.reduce_mean(y_pred['label']))
        self.metrics_tracker['dx'].update_state(tf.math.reduce_std(y_pred['label']))
        self.metrics_tracker['pearson_coef'].update_state(pearson_coef)
        self.metrics_tracker['total_loss'].update_state(pearson_loss)
        return {m.name: m.result() for m in self.metrics}
        
    @property
    def metrics(self):
        return list(self.metrics_tracker.values())

def build_cross_encoder_model(backbone): 
    # Model Inputs #
    inputs_embeds = tf.keras.Input(
        shape=(HP.max_seq_len, backbone.config.hidden_size), 
        dtype=tf.float32, name='inputs_embeds',
    )
    attention_mask = tf.keras.Input(shape=(HP.max_seq_len,), dtype=tf.float32, name='attention_mask')
    model_inputs = [inputs_embeds, attention_mask]
    
    # Model Layers #
    hidden_layer = build_hidden_layer(HP.hidden_layer_units, HP.hidden_layer_dropout, HP.hidden_layer_activation)
    out_layer = tf.keras.Sequential([tf.keras.layers.Dense(1, name='label'), tf.keras.layers.Reshape(())])
    
    # Forward Pass #
    backbone_outputs = backbone(
        input_ids=None,
        attention_mask=attention_mask,
        inputs_embeds=inputs_embeds, 
    )
    x = backbone_outputs.pooler_output
    x = hidden_layer(x)
    
    model_outputs = {}
    model_outputs['label'] = out_layer(x)
    return USPatentModel(inputs=model_inputs, outputs=model_outputs)

def get_compiled_model(): 
    lr_scheduler = lr_scheduler_factory(HP, train_steps)
    with STRATEGY.scope(): 
        model = build_cross_encoder_model(backbone)
        optimizer = adamw_optimizer_factory(HP, lr_scheduler)
        spe = 4096 if HP.multi_steps_per_execution else None
        model.compile(
            optimizer=optimizer, 
            run_eagerly=HARDWARE == 'CPU',
            steps_per_execution=spe,
        )
    return model

def get_freeze_compiled_model(lr): 
    with STRATEGY.scope(): 
        model = build_cross_encoder_model(backbone)
        backbone.trainable = False
        optimizer = tf.keras.optimizers.Adam(lr)
        spe = 1024 if HP.multi_steps_per_execution else None
        model.compile(
            optimizer=optimizer, 
            steps_per_execution=spe, 
        )
    return model

with STRATEGY.scope():
    model = build_cross_encoder_model(backbone)

backbone_code = HP.backbone_name.replace('-', '_').replace('/', '_')
CHECKPOINT_SAVE_DIR = Path(f'/kaggle/tmp/{backbone_code}')
os.makedirs(CHECKPOINT_SAVE_DIR, exist_ok=True)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    str(CHECKPOINT_SAVE_DIR/'P{val_pearson_coef:.4f}_epoch{epoch}.h5'),
    monitor='val_pearson_coef', verbose=1, save_weights_only=True,
)

## ⚡ Training 
---
### <a href='#hyperparameters'> ⚙️ Hyperparameters </a> | <a href='#model'> 🧠 Model </a>

<a name='training'>

In [10]:
HP.multi_steps_per_execution = True
with STRATEGY.scope():
    backbone.trainable = True
    model = get_compiled_model()

history = model.fit(
    train_ds, steps_per_epoch=train_steps, epochs=HP.max_epochs,
    validation_data=valid_ds, validation_steps=valid_steps,
    callbacks=[checkpoint_callback],
)
pearson = history.history['val_pearson_coef'][-1]

Using EMA with decay [34m0.9[0m
Epoch 1/4

Epoch 00001: saving model to /kaggle/tmp/anferico_bert_for_patents/P0.8159_epoch1.h5
Epoch 2/4

Epoch 00002: saving model to /kaggle/tmp/anferico_bert_for_patents/P0.8174_epoch2.h5
Epoch 3/4

Epoch 00003: saving model to /kaggle/tmp/anferico_bert_for_patents/P0.8135_epoch3.h5
Epoch 4/4

Epoch 00004: saving model to /kaggle/tmp/anferico_bert_for_patents/P0.8147_epoch4.h5


### 🦾 SWA Ensembling & Model Calibration
---



In [11]:
NUM_SWA_MODELS = 2

BATCH_SIZE, LR = 1024, 1e-3
STEPS_MULTIPLY, EPOCHS = 20, 10

In [12]:
# del train, train_dataset
# gc.collect()

# model_weights = os.listdir(CHECKPOINT_SAVE_DIR)
# model_weight_files = sorted(model_weights)[-NUM_SWA_MODELS:]
# model_weight_files = [str(CHECKPOINT_SAVE_DIR/file) for file in model_weight_files]
# print(model_weight_files)

# with STRATEGY.scope():
#     model = get_model_average(model, model_weight_files)
#     model.save_weights(f'swa_model.h5')

# !rm -r $CHECKPOINT_SAVE_DIR

In [13]:
HP.multi_steps_per_execution = True
with STRATEGY.scope():
    model = get_freeze_compiled_model(LR)

train_ds, train_steps = dataset_to_tfds(valid_dataset, 'train', BATCH_SIZE)
history = model.fit(
    train_ds, steps_per_epoch=train_steps*STEPS_MULTIPLY, epochs=EPOCHS,
)
tuned_pearson = history.history['pearson_coef'][-1]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
weights_file = f'{backbone_code}_pearson{int(pearson*100000)}_tuned{int(tuned_pearson*1000)}.h5'
print(blue(weights_file))

wandb.init(project='usppm_tf')
model.save_weights(weights_file)
wandb.save(weights_file)

[34manferico_bert_for_patents_pearson81474_tuned829.h5[0m


[34m[1mwandb[0m: Currently logged in as: [33mreadoc[0m (use `wandb login --relogin` to force relogin)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.18 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2022-06-15 14:05:57.375221: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-06-15 14:05:57.375685: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.



CondaEnvException: Unable to determine environment

Please re-run this command with one of the following options:

* Provide an environment name via --name or -n
* Re-run this command inside an activated conda environment.



['/kaggle/working/wandb/run-20220615_140554-3g6p0ihz/files/anferico_bert_for_patents_pearson81474_tuned829.h5']

In [15]:
import time
time.sleep(300)

## 🎯 Inference & Visualization
---
<a name='inference'>

In [16]:
# def build_infer_model(backbone):
#     input_ids = tf.keras.Input((HP.max_seq_len,), dtype=tf.int32, name='input_ids')
#     attention_mask = tf.keras.Input((HP.max_seq_len,), dtype=tf.int32, name='attention_mask')
#     backbone_outputs = backbone(input_ids=input_ids, attention_mask=attention_mask)
#     if 'pooler_output' in backbone_outputs: 
#         x = backbone_outputs.pooler_output
#     else:
#         x = tf.math.reduce_mean(backbone_outputs.last_hidden_state, 1)
#     output = tf.keras.Sequential([
#         tf.keras.layers.Dropout(backbone.config.hidden_dropout_prob), 
#         tf.keras.layers.Dense(1, name='label'), 
#         tf.keras.layers.Reshape(()), 
#     ])(x)
#     return tf.keras.Model(inputs=[inputs_embeds, attention_mask], outputs=output)

# def build_infer_ds(dataset):
#     id_ds = tf.data.Dataset.from_tensor_slices(dataset['input_ids'])
#     mask_ds = tf.data.Dataset.from_tensor_slices(dataset['attention_mask'])
#     input_ds = tf.data.Dataset.zip((id_ds, mask_ds))
#     ds = tf.data.Dataset.zip((input_ds, input_ds))
#     return ds.batch(HP.eval_batch_size).prefetch(tf.data.AUTOTUNE)

# targets = valid.score.values
# with STRATEGY.scope(): 
#     infer_model = build_infer_model(backbone, model)
#     preds = model.predict(valid_ds, verbose=1)
#     preds = (preds-preds.min()) / (preds.max()-preds.min())
#     valid['error'] = np.abs(preds-targets)
# display(valid.sort('error', reverse=True))

# 