In [2]:
# !pip install tensorflow-gpu
# !pip install bert-for-tf2
# !pip install --ignore-installed graphviz
# !pip install --ignore-installed pydot
# !pip install pydotplus
# !pip install kaggle
# !pip install keras_lr_finder
# !pip install tf-models-nightly
# !pip uninstall tf-nightly
# !pip uninstall tf-estimator-nightly

# Librerias

In [3]:
import tensorflow as tf
import os
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime
from tqdm import tqdm
import math
import kaggle
from sklearn.model_selection import StratifiedKFold
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.utils import compute_class_weight
from keras_lr_finder import LRFinder
import random

logs_base_dir = ".\logs"
os.makedirs(logs_base_dir, exist_ok=True)
%load_ext tensorboard
%tensorboard --logdir={logs_base_dir}

# Hacemos que el modelo sea 100% deterministico incluso corriendo con aceleracion GPU
SEED = 12
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

plt.rc('figure', figsize=(20,10))
plt.rc('axes', labelsize=18, titlesize=20, titleweight = 'bold')    # tamaño de label y titulo 
plt.rc('xtick', labelsize=14)    # tamaño de los indicadores de variacion eje x
plt.rc('ytick', labelsize=14)    # tamaño de los indicadores de variacion eje y
plt.rc('legend', fontsize=14)    # tamaño del indicador (por ej, verdadero o falso)

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 
tf.get_logger().setLevel('ERROR')

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
  except RuntimeError as e:
    print(e)

%run ../0_Data/0_DataLoader.ipynb

Reusing TensorBoard on port 6006 (pid 6096), started 4:13:52 ago. (Use '!kill 6096' to kill it.)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\germa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\germa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\germa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Hiperparametros y parametros

In [4]:
adapter_size = None #64
learning_rate = 0.00001
max_seq_len = 96
batch_size = 24
epochs = 3

# Clase para tokenizar los inputs

In [5]:
class Tweets:
    DATA_COLUMN = "text"
    LABEL_COLUMN = "target"

    def __init__(self, tokenizer: FullTokenizer, max_seq_len=1024, train=None, test=None):
        self.tokenizer = tokenizer
        self.max_seq_len = 0
        self.train = train
        self.test = test
        train = train[['text', 'target']].reset_index(drop=True)
        test = test[['text', 'target']].reset_index(drop=True)
        
        ((self.train_x, self.train_y),
         (self.test_x, self.test_y)) = map(self._prepare, [train, test])
        
        print("max seq_len", self.max_seq_len)
#         self.max_seq_len = min(self.max_seq_len, max_seq_len)
        self.max_seq_len = max_seq_len
        ((self.train_x, self.train_x_token_types),
         (self.test_x, self.test_x_token_types)) = map(self._pad, [self.train_x, self.test_x])

    def _prepare(self, df):
        x, y = [], []
        with tqdm(total=df.shape[0], unit_scale=True) as pbar:
            for ndx, row in df.iterrows():
                text, label = row[Tweets.DATA_COLUMN], row[Tweets.LABEL_COLUMN]
                tokens = self.tokenizer.tokenize(text)
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
#                 self.max_seq_len = max(self.max_seq_len, len(token_ids))
                x.append(token_ids)
                y.append(int(label))
                pbar.update()
        return np.array(x), np.array(y)
    
    def _pad(self, ids):
        x, t = [], []
        token_type_ids = [0] * self.max_seq_len
        for input_ids in ids:
            input_ids = input_ids[:min(len(input_ids), self.max_seq_len - 2)]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
            t.append(token_type_ids)
        return np.array(x), np.array(t)

# Funciones para compilar el modelo

In [8]:
bert_ckpt_dir=".models/uncased_L-12_H-768_A-12/"
bert_ckpt_file = bert_ckpt_dir + "bert_model.ckpt"
bert_config_file = bert_ckpt_dir + "bert_config.json"

def flatten_layers(root_layer):
    if isinstance(root_layer, tf.keras.layers.Layer):
        yield root_layer
    for layer in root_layer._layers:
        for sub_layer in flatten_layers(layer):
            yield sub_layer

def freeze_bert_layers(l_bert):
    """
    Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.
    """
    for layer in flatten_layers(l_bert):
        if layer.name in ["LayerNorm", "adapter-down", "adapter-up"]:
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False
        l_bert.embeddings_layer.trainable = False


def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):

    def lr_scheduler(epoch):
        if epoch < warmup_epoch_count:
            res = (max_learn_rate/warmup_epoch_count) * (epoch + 1)
        else:
            res = max_learn_rate*math.exp(math.log(end_learn_rate/max_learn_rate)*(epoch-warmup_epoch_count+1)/(total_epoch_count-warmup_epoch_count+1))
        return float(res)
    learning_rate_scheduler = tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)

    return learning_rate_scheduler

def create_model(max_seq_len, adapter_size=64):
  # create the bert layer
  with tf.io.gfile.GFile(bert_config_file, "r") as reader:
      bc = StockBertConfig.from_json_string(reader.read())
      bert_params = map_stock_config_to_params(bc)
      bert_params.adapter_size = adapter_size
      bert = BertModelLayer.from_params(bert_params, name="bert")
        
  input_ids      = tf.keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
  output         = bert(input_ids)

  print("bert shape", output.shape)
  cls_out = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
#   cls_out1 = tf.keras.layers.Dropout(0.5)(cls_out)
#   logits0 = tf.keras.layers.Dense(units=768, activation="tanh")(cls_out1)
  middle = tf.keras.layers.Dropout(0.5)(cls_out)
  logits = tf.keras.layers.Dense(units=1, activation=None)(middle)

  model = tf.keras.Model(inputs=input_ids, outputs=logits)
  model.build(input_shape=(None, max_seq_len))

  # load the pre-trained model weights
  load_stock_weights(bert, bert_ckpt_file)

  if adapter_size is not None:
      freeze_bert_layers(bert)

  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),#SGD(learning_rate=learning_rate, momentum=0.9, nesterov=True),
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=[tf.keras.metrics.BinaryAccuracy()])

  model.summary()
        
  return model

# Entrenamiento y validación del modelo

In [9]:
%%time

tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

results = []
history = 0
def entrenar(train_df, val_df):
    
    tf.keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()
    gc.collect()

    global max_seq_len
    
    data = Tweets(tokenizer, max_seq_len=max_seq_len, train=train_df, test=val_df)    
    
    model = create_model(max_seq_len, adapter_size=adapter_size)
        
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logs_base_dir)

    class_weights = compute_class_weight('balanced', np.unique(train_df.target), train_df.target) 
    class_weights_dict = dict(enumerate(class_weights))
    
    history = model.fit(x=data.train_x, y=data.train_y, batch_size=batch_size, epochs=epochs,
                validation_data=(data.test_x, data.test_y),
                class_weight = class_weights_dict,                        
                callbacks=[tensorboard_callback, 
                           tf.keras.callbacks.EarlyStopping(patience=1, restore_best_weights=True)])

#           callbacks=[create_learning_rate_scheduler(max_learn_rate=1e-5,
#                                                     end_learn_rate=1e-7,
#                                                     warmup_epoch_count=20,
#                                                     total_epoch_count=epochs),
#                      tf.keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True),
#                      tensorboard_callback])
    
    _, train_acc = model.evaluate(data.train_x, data.train_y)
    _, test_acc = model.evaluate(data.test_x , data.test_y)
    print("train acc", train_acc)
    print(" test acc", test_acc)
    results.append(test_acc)
    model.save_weights('./real_or_not.h5', overwrite=True)
    tf.keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()
    gc.collect()

datos = get_data_original_as_database()

test_data = datos.test
train_df = datos.train
val_data = datos.validation
predict = datos.predict

entrenar(train_df, val_data)

# split, df_train, test_data = get_k_folded_data_original_as_database()

# #Mantenemos los pesos de cada clase en cada K-Fold
# for train_index, val_index in split:
#     train_df = df_train.iloc[train_index]
#     val_df = df_train.iloc[val_index]
#     entrenar(train_df, val_df)

    
print("results",results)
print(f"Mean-Precision: {sum(results) / len(results)}")


100%|██████████| 4.87k/4.87k [00:02<00:00, 1.77kit/s]
100%|██████████| 1.22k/1.22k [00:00<00:00, 1.81kit/s]


max seq_len 0
bert shape (None, 96, 768)
Done loading 196 BERT weights from: .models/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x000001F328E992E0> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 96)]              0         
_________________________________________________________________
bert (BertModelLayer)       

3774    0
203     1
4778    1
3879    1
4692    1
       ..
1607    0
3745    1
5002    0
1726    0
3933    1
Name: target, Length: 4872, dtype: int64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


Epoch 1/3
Epoch 2/3
Epoch 3/3
train acc 0.8879310488700867
 test acc 0.802955687046051
results [0.802955687046051]
Mean-Precision: 0.802955687046051
Wall time: 11min 22s


# Testeo del modelo

In [11]:
%%time 
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

data = Tweets(tokenizer, max_seq_len=max_seq_len, train=train_df, test=test_data)    
model = create_model(max_seq_len, adapter_size=adapter_size)
model.load_weights("real_or_not.h5")


_, test_acc = model.evaluate(data.test_x, data.test_y)

print("train acc", test_acc)

100%|██████████| 4.87k/4.87k [00:02<00:00, 1.75kit/s]
100%|██████████| 1.52k/1.52k [00:00<00:00, 1.76kit/s]


max seq_len 0
bert shape (None, 96, 768)
Done loading 196 BERT weights from: .models/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x000001F37ACDC3D0> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 96)]              0         
_________________________________________________________________
bert (BertModelLayer)       

# Predecimos los datos y submiteamos a Kaggle

In [None]:
x, y = [], []
for ndx, row in predict.iterrows():
    text = row['text']
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    x.append(token_ids)
ids = np.array(x)

x, t = [], []
token_type_ids = [0] * max_seq_len
for input_ids in ids:
    input_ids = input_ids[:min(len(input_ids), max_seq_len - 2)]
    input_ids = input_ids + [0] * (max_seq_len - len(input_ids))
    x.append(np.array(input_ids))
    t.append(token_type_ids)
test_x, test_x_token_types = np.array(x), np.array(t)

model = create_model(max_seq_len, adapter_size=adapter_size)
model.load_weights("real_or_not.h5")

y_test = (model.predict(test_x, batch_size=batch_size, verbose=1) > 0.5).astype("int32")
submission = pd.read_csv('../dataset/sample_submission.csv')
submission['target'] = y_test
submission.to_csv("submission3.csv", index=False)

!kaggle competitions submit nlp-getting-started -f submission3.csv -m 'BERT'

x, y = [], []
for ndx, row in predict.iterrows():
    text = row['text']
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    x.append(token_ids)
ids = np.array(x)

x, t = [], []
token_type_ids = [0] * max_seq_len
for input_ids in ids:
    input_ids = input_ids[:min(len(input_ids), max_seq_len - 2)]
    input_ids = input_ids + [0] * (max_seq_len - len(input_ids))
    x.append(np.array(input_ids))
    t.append(token_type_ids)
test_x, test_x_token_types = np.array(x), np.array(t)

model = create_model(max_seq_len, adapter_size=adapter_size)
model.load_weights("real_or_not.h5")

y_test = (model.predict(test_x, batch_size=batch_size, verbose=1) > 0.5).astype("int32")
submission = pd.read_csv('../dataset/sample_submission.csv')
submission['target'] = y_test
submission.to_csv("submission3.csv", index=False)

!kaggle competitions submit nlp-getting-started -f submission3.csv -m 'BERT'

In [19]:
x, y = [], []
for ndx, row in predict.iterrows():
    text = row['text']
    tokens = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    x.append(token_ids)
ids = np.array(x)

x, t = [], []
token_type_ids = [0] * max_seq_len
for input_ids in ids:
    input_ids = input_ids[:min(len(input_ids), max_seq_len - 2)]
    input_ids = input_ids + [0] * (max_seq_len - len(input_ids))
    x.append(np.array(input_ids))
    t.append(token_type_ids)
test_x, test_x_token_types = np.array(x), np.array(t)

model = create_model(max_seq_len, adapter_size=adapter_size)
model.load_weights("real_or_not.h5")

y_test = (model.predict(test_x, batch_size=batch_size, verbose=1) > 0.5).astype("int32")
submission = pd.read_csv('../dataset/sample_submission.csv')
submission['target'] = y_test
submission.to_csv("submission3.csv", index=False)

!kaggle competitions submit nlp-getting-started -f submission3.csv -m 'BERT'

bert shape (None, 96, 768)
Done loading 196 BERT weights from: .models/uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x00000287248611F0> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights
Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 96)]              0         
_________________________________________________________________
bert (BertModelLayer)        (None, 96, 76


  0%|          | 0.00/25.4k [00:00<?, ?B/s]
 31%|###1      | 8.00k/25.4k [00:00<00:00, 51.6kB/s]
100%|##########| 25.4k/25.4k [00:02<00:00, 10.1kB/s]
