In [4]:
# Fail fast unless TensorFlow reports the first GPU as available.
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print(f'Found GPU at: {device_name}')
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [10]:
import os
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from transformers import BertTokenizer, BertConfig, TFBertModel
from sklearn.model_selection import GroupKFold

### custom scripts
sys.path.append("./utility_scripts/")
from ml_stratifiers import MultilabelStratifiedKFold
from custom_callbacks import EarlyStopping
from bert_embedder import compute_input_arrays_tqa, compute_sentece_pair_embedding
###

# Directory the trained Keras models (.h5 files) are saved to and loaded from.
MODELS_PATH = "./models/" 
# Local directory used to cache the pretrained BERT weights and vocabulary.
BERT_PATH = "./transformers/bert-base-uncased/"
# Length of each of the three integer input sequences fed to the BERT model.
MAX_SEQUENCE_LENGTH = 512
# Random state handed to the k-fold splitter.
SEED = 19

***

In [2]:
# Load BERT from the local cache directory; if no usable checkpoint is there yet,
# download the pretrained weights once and store them under BERT_PATH for later runs.
try:
  bert_base_uncased = TFBertModel.from_pretrained(BERT_PATH)
except (OSError, ValueError):
  # Only "checkpoint missing / unreadable / invalid path" should trigger the download;
  # a bare `except:` would also hide KeyboardInterrupt and genuine programming errors.
  bert_base_uncased = TFBertModel.from_pretrained("bert-base-uncased")
  # save_pretrained expects an existing directory on older transformers releases.
  os.makedirs(BERT_PATH, exist_ok=True)
  bert_base_uncased.save_pretrained(BERT_PATH)

I0131 19:53:33.106970 4647570880 file_utils.py:296] https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json not found in cache or force_download set to True, downloading to /var/folders/5z/msjn51117gj9gl8xkrrshhqm0000gn/T/tmpe5dy8b0k
100%|██████████| 313/313 [00:00<00:00, 68461.47B/s]
I0131 19:53:33.761574 4647570880 file_utils.py:309] copying /var/folders/5z/msjn51117gj9gl8xkrrshhqm0000gn/T/tmpe5dy8b0k to cache at /Users/martin/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I0131 19:53:33.764458 4647570880 file_utils.py:313] creating metadata file for /Users/martin/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I0131 19:53:33.772543 4647570880 file_utils.py:322] removing temp file /var/folders/5z/msjn51117gj9gl8xkrrshhqm0000gn/T/tmpe5dy8b0k
I013

In [3]:
# Load the tokenizer from the local cache directory; if its vocabulary is not there yet,
# fetch the pretrained tokenizer and write its vocabulary under BERT_PATH for later runs.
try:
  bert_tokenizer = BertTokenizer.from_pretrained(BERT_PATH)
except (OSError, ValueError):
  # Only "vocabulary missing / unreadable / invalid path" should trigger the download;
  # a bare `except:` would also hide KeyboardInterrupt and genuine programming errors.
  bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
  # Make sure the target directory exists so the vocabulary is written inside it.
  os.makedirs(BERT_PATH, exist_ok=True)
  bert_tokenizer.save_vocabulary(BERT_PATH)

I0131 19:54:38.104176 4647570880 tokenization_utils.py:306] Model name './transformers/bert-base-uncased/' not found in model shortcut name list (bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese, bert-base-german-cased, bert-large-uncased-whole-word-masking, bert-large-cased-whole-word-masking, bert-large-uncased-whole-word-masking-finetuned-squad, bert-large-cased-whole-word-masking-finetuned-squad, bert-base-cased-finetuned-mrpc, bert-base-german-dbmdz-cased, bert-base-german-dbmdz-uncased). Assuming './transformers/bert-base-uncased/' is a path or url to a directory containing tokenizer files.
I0131 19:54:38.107874 4647570880 tokenization_utils.py:317] Didn't find file ./transformers/bert-base-uncased/vocab.txt. We won't load it.
I0131 19:54:38.110166 4647570880 tokenization_utils.py:335] Didn't find file ./transformers/bert-base-uncased/added_tokens.json. We won't load it.
I0131

***

In [5]:
# Work on a 50-row random subsample of the training set. The draw is seeded so that the
# same rows (and therefore the same folds and scores) are obtained on every run.
train = pd.read_csv("./input/train.csv").sample(50, random_state=SEED).reset_index(drop=True)
# NOTE(review): assumes the first 11 columns are non-target columns — confirm against train.csv.
target_columns = list(train.columns[11:])
train_targets = train.loc[:, target_columns]

In [6]:
# Compute the "tqa" sentence-pair embeddings for every training row, then turn the
# index of the result into a regular leading column.
train_tqa_bert_encoded = compute_sentece_pair_embedding(
    train,
    which="tqa",
    bert_path=BERT_PATH,
).reset_index()
# Every column except the first one is treated as an embedding feature.
# NOTE(review): assumes reset_index() added exactly one column (single-level index) — confirm.
bert_columns = train_tqa_bert_encoded.columns[1:]

50it [00:00, 111.18it/s]
I0131 19:55:25.474032 4647570880 modeling_tf_utils.py:255] loading weights file ./transformers/bert-base-uncased/tf_model.h5




***
## Fine-tuning the output layer

In [11]:
# Hyperparameters for training the output layer on the pre-computed BERT embeddings.
SEED = 19             # random state of the k-fold splitter
NUM_FOLDS = 5         # number of cross-validation folds
DROPOUT = 0.2         # dropout rate applied to the embedding before the dense layer
ACTIVATION = "sigmoid"  # activation of the dense output layer
LEARNING_RATE = 5e-4  # Adam learning rate
EPOCHS = 100          # upper bound on epochs per fold; early stopping may end training sooner
BATCH_SIZE = 32       # batch size used for fitting and by the early-stopping callback

***

In [21]:
def get_model(input_size, output_size, activation, dropout):
    """Build the output head: dropout followed by one dense layer.

    Args:
        input_size: number of float features per example.
        output_size: number of units in the dense output layer.
        activation: activation passed to the dense output layer.
        dropout: dropout rate applied to the input features.

    Returns:
        An uncompiled ``tf.keras`` model whose input layer is named "input"
        and whose dense layer is named "output".
    """
    features = tf.keras.layers.Input(shape=(input_size,), dtype=tf.float32, name='input')
    dropped_features = tf.keras.layers.Dropout(dropout)(features)
    dense_head = tf.keras.layers.Dense(units=output_size, activation=activation, name="output")
    predictions = dense_head(dropped_features)
    return tf.keras.models.Model(inputs=features, outputs=predictions)

***

In [22]:
# Cross-validated training of the dense output head on the pre-computed BERT embeddings.
# One model is trained per fold; the fitted models and the best validation score reported
# by the early-stopping callback are collected for later use.
kf = MultilabelStratifiedKFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)
kf_split = kf.split(train, train.loc[:, target_columns])

all_models = list()
kfold_scores = list()
for fold, (train_idx, valid_idx) in enumerate(kf_split):
    print(f" fold {fold} ".center(120, "#"))
    # Layer sizes are derived from the data actually fed to the model (one input per
    # embedding column, one output per target column) instead of hard-coded 768 / 30,
    # so the cell keeps working if the embedding width or the target list changes.
    model = get_model(input_size=len(bert_columns),
                      output_size=len(target_columns),
                      activation=ACTIVATION,
                      dropout=DROPOUT)

    # Both frames carry a fresh 0..n-1 index, so the positional fold indices
    # can be used as .loc labels.
    train_inputs = train_tqa_bert_encoded.loc[train_idx, bert_columns].values
    _train_targets = train_targets.loc[train_idx, :].values

    valid_inputs = train_tqa_bert_encoded.loc[valid_idx, bert_columns].values
    _valid_targets = train_targets.loc[valid_idx, :].values

    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
    model.compile(loss="mse", optimizer=optimizer)
    # Custom callback (project module): configured with mode='max' and
    # restore_best_weights=True; its `best` attribute is recorded as the fold score.
    callback = EarlyStopping(validation_data=(valid_inputs, _valid_targets),
                             batch_size=BATCH_SIZE,
                             patience=3,
                             restore_best_weights=True,
                             mode='max',
                             verbose=1)
    model.fit(train_inputs, _train_targets, epochs=EPOCHS, batch_size=BATCH_SIZE, shuffle=True,
              validation_data=(valid_inputs, _valid_targets),
              callbacks=[callback])
    all_models.append(model)
    kfold_scores.append(callback.best)

######################################################## fold 0 ########################################################
Train on 40 samples, validate on 10 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


 - valid_spearman_rho: -0.052708599482241845
Epoch 5/100
Epoch 6/100
Restoring model weights from the end of the best epoch.
Epoch 00006: early stopping
######################################################## fold 1 ########################################################
Train on 40 samples, validate on 10 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Restoring model weights from the end of the best epoch.
Epoch 00015: early stopping
######################################################## fold 2 ########################################################
Train on 40 samples, validate on 10 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/

***

In [23]:
# Report the per-fold scores and their average.
print(kfold_scores)
print("Mean k-fold rho: {}".format(np.mean(kfold_scores)))

[-0.03379004361224979, -0.0778870915709578, 0.20947663211752804, 0.23518146790739677, 0.10702970363227535]
Mean k-fold rho: 0.08800213369479852


In [24]:
# Write each fold's trained output model to MODELS_PATH.
# NOTE(review): the loading cell below filters on "output_tqa_fold", which these
# "_1h_" file names do not contain — confirm the tag is intentional.
for fold in range(len(all_models)):
    model = all_models[fold]
    model.save(filepath=MODELS_PATH + f"output_tqa_1h_fold{fold}.h5")

In [11]:
# Alternative to training above: reload previously saved output models from disk.
# File names are sorted so that list position matches the fold number in the name.
all_models = []
for model_fname in sorted(filter(lambda name: "output_tqa_fold" in name, os.listdir(MODELS_PATH))):
  print(model_fname)
  all_models.append(tf.keras.models.load_model(MODELS_PATH + model_fname))

output_tqa_fold0.h5
output_tqa_fold1.h5
output_tqa_fold2.h5
output_tqa_fold3.h5
output_tqa_fold4.h5


***
## Fine-tuning the BERT layer

In [26]:
def get_model(output_model, dropout=0.2, output_layer_name="output"):
    """Build the end-to-end model: BERT followed by an already trained output layer.

    Note: this definition replaces the earlier ``get_model(input_size, ...)``
    defined above in the notebook.

    Args:
        output_model: Keras model from which the layer named
            ``output_layer_name`` is taken and reused as the final layer.
        dropout: dropout rate applied to the first-position hidden state.
        output_layer_name: name of the layer to fetch from ``output_model``.

    Returns:
        An uncompiled ``tf.keras`` model taking three int32 inputs of length
        ``MAX_SEQUENCE_LENGTH`` (``input_word_ids``, ``input_masks``,
        ``input_segments``) and returning the reused output layer's result.
    """
    def _int_sequence_input(name):
        # One int32 input of MAX_SEQUENCE_LENGTH entries per example.
        return tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name=name)

    input_word_ids = _int_sequence_input('input_word_ids')
    input_masks = _int_sequence_input('input_masks')
    input_segments = _int_sequence_input('input_segments')
    bert_inputs = [input_word_ids, input_masks, input_segments]

    # Weights come from BERT_PATH; the architecture from a default BertConfig.
    bert_layer = TFBertModel.from_pretrained(BERT_PATH, config=BertConfig())
    sequence_output, _ = bert_layer(bert_inputs)

    # Keep only position 0 of every sequence and flatten it to 768 features.
    first_position = tf.reshape(sequence_output[:, 0], (-1, 768))
    first_position_dropped = tf.keras.layers.Dropout(dropout)(first_position)

    trained_head = output_model.get_layer(output_layer_name)
    return tf.keras.models.Model(inputs=bert_inputs,
                                 outputs=trained_head(first_position_dropped))

***

In [27]:
# Hyperparameters for fine-tuning the full BERT model. These assignments replace the
# values of the same names that were set for the output-layer training above.
SEED = 19             # random state of the k-fold splitter
NUM_FOLDS = 5         # number of cross-validation folds
DROPOUT = 0.1         # dropout rate applied before the output layer
LEARNING_RATE = 2e-5  # Adam learning rate
EPOCHS = 5            # epochs requested per call to fit
BATCH_SIZE = 12       # batch size used for fitting and by the early-stopping callback

In [28]:
# Build a lower-casing tokenizer straight from the cached vocabulary file, then
# convert the training rows into the model's input arrays.
tokenizer = BertTokenizer(BERT_PATH+'vocab.txt', do_lower_case=True)
train_inputs = compute_input_arrays_tqa(train, tokenizer, MAX_SEQUENCE_LENGTH)

50it [00:00, 95.60it/s]


***
### Generate the splits for each fold

In [29]:
# Pre-compute the (train, validation) index pair of every fold so that each fold
# can be fine-tuned in its own cell.
kf = MultilabelStratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
kf_split = kf.split(train, train[target_columns])

fold_split = {}
for fold, (train_idx, valid_idx) in enumerate(kf_split):
  fold_split.update({fold: (train_idx, valid_idx)})

***
### Load saved BERT models

In [30]:
# Reload previously saved fine-tuned models: all_bert_models maps fold -> (model, epoch).
#
# File names are expected to contain a "fold<N>" and an "epoch<N>" token separated by
# underscores (e.g. "bert_tqa_fold0_epoch5.h5"). The tokens are located by prefix rather
# than by position, so names with an extra tag (e.g. "bert_tqa_1h_fold0_epoch5.h5") are
# parsed instead of crashing on int(""). When several checkpoints exist for one fold,
# only the one with the highest epoch is loaded; iterating in sorted order makes the
# choice deterministic when epochs tie (os.listdir returns entries in arbitrary order).
all_bert_models = dict()

latest_checkpoints = dict()  # fold -> (epoch, file name)
for model_fname in sorted(os.listdir(MODELS_PATH)):
  if "bert_tqa" not in model_fname:
    continue
  parsed = dict()
  for token in model_fname.rsplit(".", 1)[0].split("_"):
    for key in ("fold", "epoch"):
      number = token[len(key):]
      if token.startswith(key) and number.isdigit():
        parsed[key] = int(number)
  if "fold" not in parsed or "epoch" not in parsed:
    print(f"skipping {model_fname}: no fold/epoch found in file name")
    continue
  if parsed["fold"] not in latest_checkpoints or parsed["epoch"] > latest_checkpoints[parsed["fold"]][0]:
    latest_checkpoints[parsed["fold"]] = (parsed["epoch"], model_fname)

# Load only the selected checkpoint of each fold.
for fold, (epoch, model_fname) in sorted(latest_checkpoints.items()):
  all_bert_models[fold] = (tf.keras.models.load_model(MODELS_PATH + model_fname), epoch)

***
### fold 0

In [32]:
# Start with an empty score list; each fold cell below appends its best validation score.
kfold_scores = []

In [None]:
fold = 0
print(f" fold {fold} ".center(120, "#"))

# Continue from an already loaded checkpoint of this fold when there is one; otherwise
# build a fresh BERT model topped with this fold's trained output layer.
if fold not in all_bert_models:
  model = get_model(output_model=all_models[fold],
                    dropout=DROPOUT,
                    output_layer_name="output")
  epoch = 0
else:
  model, epoch = all_bert_models[fold]

train_idx, valid_idx = fold_split[fold]

# train_inputs is indexed 0..2 (presumably word ids, masks and segments — confirm);
# every one of the three arrays is restricted to the rows of this fold.
_train_inputs = [train_inputs[i][train_idx] for i in (0, 1, 2)]
_valid_inputs = [train_inputs[i][valid_idx] for i in (0, 1, 2)]
_train_targets = train_targets.loc[train_idx, :].values
_valid_targets = train_targets.loc[valid_idx, :].values

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, loss="mse")
callback = EarlyStopping(validation_data=(_valid_inputs, _valid_targets),
                         mode='max',
                         patience=2,
                         restore_best_weights=True,
                         batch_size=BATCH_SIZE,
                         verbose=1)
model.fit(x=_train_inputs,
          y=_train_targets,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          shuffle=True,
          validation_data=(_valid_inputs, _valid_targets),
          callbacks=[callback])

# Record the model with its cumulative requested epoch count, keep the best score
# reported by the callback, and write the model to disk.
all_bert_models[fold] = (model, epoch+EPOCHS)
kfold_scores.append(callback.best)
model.save(filepath=MODELS_PATH + f"bert_tqa_1h_fold{fold}_epoch{epoch+EPOCHS}.h5")

I0131 20:02:05.051416 4647570880 modeling_tf_utils.py:255] loading weights file ./transformers/bert-base-uncased/tf_model.h5


######################################################## fold 0 ########################################################
Train on 40 samples, validate on 10 samples
Epoch 1/5


W0131 20:02:22.832198 4647570880 optimizer_v2.py:1043] Gradients do not exist for variables ['tf_bert_model_2/bert/pooler/dense/kernel:0', 'tf_bert_model_2/bert/pooler/dense/bias:0'] when minimizing the loss.
W0131 20:02:31.987792 4647570880 optimizer_v2.py:1043] Gradients do not exist for variables ['tf_bert_model_2/bert/pooler/dense/kernel:0', 'tf_bert_model_2/bert/pooler/dense/bias:0'] when minimizing the loss.


In [20]:
# Save the current fold's model again under the untagged file name.
# NOTE(review): the recorded run of this cell ended in NotImplementedError — confirm that
# this model can be saved to a single .h5 file.
model.save(filepath=MODELS_PATH + f"bert_tqa_fold{fold}_epoch{epoch+EPOCHS}.h5")

NotImplementedError: ignored

***
### fold 1

In [21]:
fold = 1
print(f" fold {fold} ".center(120, "#"))

# Continue from an already loaded checkpoint of this fold when there is one; otherwise
# build a fresh BERT model topped with this fold's trained output layer.
if fold not in all_bert_models:
  model = get_model(output_model=all_models[fold],
                    dropout=DROPOUT,
                    output_layer_name="output")
  epoch = 0
else:
  model, epoch = all_bert_models[fold]

train_idx, valid_idx = fold_split[fold]

# train_inputs is indexed 0..2 (presumably word ids, masks and segments — confirm);
# every one of the three arrays is restricted to the rows of this fold.
_train_inputs = [train_inputs[i][train_idx] for i in (0, 1, 2)]
_valid_inputs = [train_inputs[i][valid_idx] for i in (0, 1, 2)]
_train_targets = train_targets.loc[train_idx, :].values
_valid_targets = train_targets.loc[valid_idx, :].values

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, loss="mse")
callback = EarlyStopping(validation_data=(_valid_inputs, _valid_targets),
                         mode='max',
                         patience=2,
                         restore_best_weights=True,
                         batch_size=BATCH_SIZE,
                         verbose=1)
model.fit(x=_train_inputs,
          y=_train_targets,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          shuffle=True,
          validation_data=(_valid_inputs, _valid_targets),
          callbacks=[callback])

# Record the model with its cumulative requested epoch count, keep the best score
# reported by the callback, and write the model to disk.
all_bert_models[fold] = (model, epoch+EPOCHS)
kfold_scores.append(callback.best)
model.save(filepath=MODELS_PATH + f"bert_tqa_fold{fold}_epoch{epoch+EPOCHS}.h5")

######################################################## fold 1 ########################################################
Train on 4863 samples, validate on 1216 samples
Epoch 1/5
  12/4863 [..............................] - ETA: 2:37:14 - valid_spearman_rho: 0.3086365662599288
  12/4863 [..............................] - ETA: 8:50:53

ResourceExhaustedError: ignored

*** 
### fold 2


In [22]:
fold = 2
print(f" fold {fold} ".center(120, "#"))

# Continue from an already loaded checkpoint of this fold when there is one; otherwise
# build a fresh BERT model topped with this fold's trained output layer.
if fold not in all_bert_models:
  model = get_model(output_model=all_models[fold],
                    dropout=DROPOUT,
                    output_layer_name="output")
  epoch = 0
else:
  model, epoch = all_bert_models[fold]

train_idx, valid_idx = fold_split[fold]

# train_inputs is indexed 0..2 (presumably word ids, masks and segments — confirm);
# every one of the three arrays is restricted to the rows of this fold.
_train_inputs = [train_inputs[i][train_idx] for i in (0, 1, 2)]
_valid_inputs = [train_inputs[i][valid_idx] for i in (0, 1, 2)]
_train_targets = train_targets.loc[train_idx, :].values
_valid_targets = train_targets.loc[valid_idx, :].values

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, loss="mse")
callback = EarlyStopping(validation_data=(_valid_inputs, _valid_targets),
                         mode='max',
                         patience=2,
                         restore_best_weights=True,
                         batch_size=BATCH_SIZE,
                         verbose=1)
model.fit(x=_train_inputs,
          y=_train_targets,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          shuffle=True,
          validation_data=(_valid_inputs, _valid_targets),
          callbacks=[callback])

# Record the model with its cumulative requested epoch count, keep the best score
# reported by the callback, and write the model to disk.
all_bert_models[fold] = (model, epoch+EPOCHS)
kfold_scores.append(callback.best)
model.save(filepath=MODELS_PATH + f"bert_tqa_fold{fold}_epoch{epoch+EPOCHS}.h5")

######################################################## fold 2 ########################################################
Train on 4863 samples, validate on 1216 samples
Epoch 1/5
  12/4863 [..............................] - ETA: 2:42:51 - valid_spearman_rho: 0.30770025986814714
  12/4863 [..............................] - ETA: 8:53:15

ResourceExhaustedError: ignored

*** 
### fold 3

In [None]:
fold = 3
print(f" fold {fold} ".center(120, "#"))

# Continue from an already loaded checkpoint of this fold when there is one; otherwise
# build a fresh BERT model topped with this fold's trained output layer.
if fold not in all_bert_models:
  model = get_model(output_model=all_models[fold],
                    dropout=DROPOUT,
                    output_layer_name="output")
  epoch = 0
else:
  model, epoch = all_bert_models[fold]

train_idx, valid_idx = fold_split[fold]

# train_inputs is indexed 0..2 (presumably word ids, masks and segments — confirm);
# every one of the three arrays is restricted to the rows of this fold.
_train_inputs = [train_inputs[i][train_idx] for i in (0, 1, 2)]
_valid_inputs = [train_inputs[i][valid_idx] for i in (0, 1, 2)]
_train_targets = train_targets.loc[train_idx, :].values
_valid_targets = train_targets.loc[valid_idx, :].values

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, loss="mse")
callback = EarlyStopping(validation_data=(_valid_inputs, _valid_targets),
                         mode='max',
                         patience=2,
                         restore_best_weights=True,
                         batch_size=BATCH_SIZE,
                         verbose=1)
model.fit(x=_train_inputs,
          y=_train_targets,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          shuffle=True,
          validation_data=(_valid_inputs, _valid_targets),
          callbacks=[callback])

# Record the model with its cumulative requested epoch count, keep the best score
# reported by the callback, and write the model to disk.
all_bert_models[fold] = (model, epoch+EPOCHS)
kfold_scores.append(callback.best)
model.save(filepath=MODELS_PATH + f"bert_tqa_fold{fold}_epoch{epoch+EPOCHS}.h5")

***
### fold 4

In [None]:
fold = 4
print(f" fold {fold} ".center(120, "#"))

# Continue from an already loaded checkpoint of this fold when there is one; otherwise
# build a fresh BERT model topped with this fold's trained output layer.
if fold not in all_bert_models:
  model = get_model(output_model=all_models[fold],
                    dropout=DROPOUT,
                    output_layer_name="output")
  epoch = 0
else:
  model, epoch = all_bert_models[fold]

train_idx, valid_idx = fold_split[fold]

# train_inputs is indexed 0..2 (presumably word ids, masks and segments — confirm);
# every one of the three arrays is restricted to the rows of this fold.
_train_inputs = [train_inputs[i][train_idx] for i in (0, 1, 2)]
_valid_inputs = [train_inputs[i][valid_idx] for i in (0, 1, 2)]
_train_targets = train_targets.loc[train_idx, :].values
_valid_targets = train_targets.loc[valid_idx, :].values

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, loss="mse")
callback = EarlyStopping(validation_data=(_valid_inputs, _valid_targets),
                         mode='max',
                         patience=2,
                         restore_best_weights=True,
                         batch_size=BATCH_SIZE,
                         verbose=1)
model.fit(x=_train_inputs,
          y=_train_targets,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          shuffle=True,
          validation_data=(_valid_inputs, _valid_targets),
          callbacks=[callback])

# Record the model with its cumulative requested epoch count, keep the best score
# reported by the callback, and write the model to disk.
all_bert_models[fold] = (model, epoch+EPOCHS)
kfold_scores.append(callback.best)
model.save(filepath=MODELS_PATH + f"bert_tqa_fold{fold}_epoch{epoch+EPOCHS}.h5")

***

***