In [None]:
# !pip install --ignore-installed tensorflow-gpu --user
# !pip install bert-for-tf2
# !pip install --ignore-installed graphviz
# !pip install --ignore-installed pydot
# !pip install pydotplus
# !pip install tf-models-nightly
# !pip uninstall tf-nightly
# !pip uninstall tf-estimator-nightly
# !pip install kaggle
# !pip install keras_lr_finder

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import os
import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import datetime
from tqdm import tqdm
import math
import kaggle
from sklearn.model_selection import StratifiedKFold
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.utils import compute_class_weight
from keras_lr_finder import LRFinder


# Global matplotlib defaults applied to every figure drawn in this notebook.
plt.rc('figure', figsize=(20,10))
plt.rc('axes', labelsize=18, titlesize=20, titleweight = 'bold')    # axis-label and title size
plt.rc('xtick', labelsize=14)    # x-axis tick label size
plt.rc('ytick', labelsize=14)    # y-axis tick label size
plt.rc('legend', fontsize=14)    # legend font size (e.g. true / false)


# Turn on memory growth for every visible GPU; a RuntimeError raised while
# doing so is printed instead of aborting the notebook.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
  except RuntimeError as e:
    print(e)

# Execute the data-loader notebook in this kernel's namespace.
# NOTE(review): presumably this is what defines
# get_data_sentiment_analysis_as_database() used further down - confirm.
%run ../0_Data/0_DataLoader.ipynb

# Hiperparámetros y parámetros

In [None]:
# Adapter size forwarded to create_model(); with None, create_model() skips
# freeze_bert_layers(), so no BERT layer is frozen. (64 was the previous value.)
adapter_size = None #64
# NOTE(review): not referenced by any cell visible in this notebook - the
# optimizer's learning rate is chosen inside create_model(). Confirm whether
# this value was meant to be wired in.
learning_rate = 1e-5
# Upper bound on the tokenised sequence length; entrenar() overwrites this
# global with the (possibly smaller) length it actually observed.
max_seq_len = 20

In [None]:
class Tweets:
    """Tokenise a train/test pair of tweet DataFrames into padded BERT inputs.

    Both frames must contain the columns named by DATA_COLUMN and
    LABEL_COLUMN. After construction the instance exposes:

    * ``train_x`` / ``test_x`` - int arrays of shape (n_rows, max_seq_len)
      holding token ids, zero-padded on the right.
    * ``train_y`` / ``test_y`` - 1-D int arrays of labels.
    * ``train_x_token_types`` / ``test_x_token_types`` - all-zero arrays with
      the same shape as the corresponding ``*_x``.
    * ``max_seq_len`` - the padded width: the longest tokenised row seen in
      either frame, capped by the ``max_seq_len`` constructor argument.
    """
    DATA_COLUMN = "text"
    LABEL_COLUMN = "target"

    def __init__(self, tokenizer: "FullTokenizer", sample_size=None, max_seq_len=1024, train=None, test=None):
        """Tokenise and pad ``train`` and ``test``.

        Args:
            tokenizer: object providing ``tokenize(text)`` and
                ``convert_tokens_to_ids(tokens)``.
            sample_size: stored on the instance; not read by this class.
            max_seq_len: cap applied to the observed maximum sequence length.
            train: DataFrame with the text and label columns.
            test: DataFrame with the text and label columns.

        Raises:
            ValueError: if ``train`` or ``test`` is missing.
        """
        if train is None or test is None:
            raise ValueError("Tweets requires both a `train` and a `test` DataFrame")
        self.tokenizer = tokenizer
        self.sample_size = sample_size
        self.max_seq_len = 0  # grows inside _prepare() as rows are tokenised
        self.train = train
        self.test = test
        columns = [Tweets.DATA_COLUMN, Tweets.LABEL_COLUMN]
        train = train[columns].reset_index(drop=True)
        test = test[columns].reset_index(drop=True)

        ((self.train_x, self.train_y),
         (self.test_x, self.test_y)) = map(self._prepare, [train, test])

        print("max seq_len", self.max_seq_len)
        self.max_seq_len = min(self.max_seq_len, max_seq_len)
        ((self.train_x, self.train_x_token_types),
         (self.test_x, self.test_x_token_types)) = map(self._pad, [self.train_x, self.test_x])

    def _prepare(self, df):
        """Tokenise every row of ``df``.

        Each text is wrapped as ``[CLS] ... [SEP]`` and converted to ids, and
        ``self.max_seq_len`` is raised to the longest id sequence seen.

        Returns:
            (x, y): ``x`` is a plain list of variable-length id lists, ``y`` a
            1-D int array of labels.
        """
        x, y = [], []
        with tqdm(total=df.shape[0], unit_scale=True) as pbar:
            for ndx, row in df.iterrows():
                text, label = row[Tweets.DATA_COLUMN], row[Tweets.LABEL_COLUMN]
                tokens = self.tokenizer.tokenize(text)
                tokens = ["[CLS]"] + tokens + ["[SEP]"]
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                self.max_seq_len = max(self.max_seq_len, len(token_ids))
                x.append(token_ids)
                y.append(int(label))
                pbar.update()
        # The id sequences are ragged, so they stay a list of lists: building
        # an ndarray from them raises ValueError on NumPy >= 1.24.
        return x, np.array(y)

    def _pad(self, ids):
        """Truncate and right-pad id sequences to ``self.max_seq_len``.

        Args:
            ids: iterable of id sequences (lists or 1-D arrays).

        Returns:
            (x, t): the padded id matrix and an all-zero token-type matrix.
        """
        x, t = [], []
        token_type_ids = [0] * self.max_seq_len
        for input_ids in ids:
            # list() makes the `+` below a concatenation even when a row
            # arrives as an ndarray (where `+` would try to broadcast).
            # NOTE(review): rows already include [CLS]/[SEP], so this cap
            # leaves the last two positions as padding and can cut [SEP];
            # kept unchanged because the prediction cell mirrors it.
            input_ids = list(input_ids)[:self.max_seq_len - 2]
            input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
            x.append(np.array(input_ids))
            t.append(token_type_ids)
        return np.array(x), np.array(t)

In [None]:
# Location of the pre-trained BERT checkpoint and the files read from it.
bert_ckpt_dir = ".models/uncased_L-12_H-768_A-12/"
bert_ckpt_file = os.path.join(bert_ckpt_dir, "bert_model.ckpt")
bert_config_file = os.path.join(bert_ckpt_dir, "bert_config.json")

In [None]:
def flatten_layers(root_layer):
    """Yield `root_layer` (when it is a Keras Layer) followed, depth-first,
    by everything reachable through the `_layers` attribute of each node."""
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for child in root_layer._layers:
        yield from flatten_layers(child)


def freeze_bert_layers(l_bert):
    """
    Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.

    Walks every layer yielded by flatten_layers(l_bert): layers named
    "LayerNorm", "adapter-down" or "adapter-up" are marked trainable, any
    other layer without sub-layers is frozen, and finally the embeddings
    layer is frozen.

    Args:
        l_bert: the BERT layer; must expose `embeddings_layer`.
    """
    trainable_names = ("LayerNorm", "adapter-down", "adapter-up")
    for layer in flatten_layers(l_bert):
        if layer.name in trainable_names:
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False
    # Loop-invariant, so it runs once after the walk rather than on every
    # iteration. Being last, it still wins over any "LayerNorm" inside the
    # embeddings that the loop marked trainable (assuming Keras propagates
    # `trainable` to sub-layers - confirm).
    l_bert.embeddings_layer.trainable = False


def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):
    """Build a Keras LearningRateScheduler callback with warm-up then decay.

    For epochs below `warmup_epoch_count` the rate rises linearly up to
    `max_learn_rate`; afterwards it decays exponentially towards
    `end_learn_rate`, which is reached at `total_epoch_count`.

    Returns:
        tf.keras.callbacks.LearningRateScheduler (verbose=1).
    """

    def lr_scheduler(epoch):
        # Linear warm-up phase.
        if epoch < warmup_epoch_count:
            return float((max_learn_rate/warmup_epoch_count) * (epoch + 1))
        # Exponential decay phase.
        log_ratio = math.log(end_learn_rate/max_learn_rate)
        epochs_into_decay = epoch - warmup_epoch_count + 1
        decay_span = total_epoch_count - warmup_epoch_count + 1
        return float(max_learn_rate*math.exp(log_ratio*epochs_into_decay/decay_span))

    return tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)

In [None]:
def create_model(max_seq_len, adapter_size=64, learning_rate=2e-5):
  """Build and compile a BERT binary classifier.

  The model maps int32 token ids of shape (batch, max_seq_len) to a single
  logit per example, taken from a Dense(1) head on the first sequence
  position of the BERT output. Pre-trained weights are loaded from
  `bert_ckpt_file`.

  Args:
    max_seq_len: width of the token-id input.
    adapter_size: forwarded to the BERT params; when not None, BERT is
      frozen via freeze_bert_layers().
    learning_rate: Adam learning rate (default keeps the previous 2e-5).

  Returns:
    The compiled keras.Model.
  """
  # create the bert layer
  with tf.io.gfile.GFile(bert_config_file, "r") as reader:
    bc = StockBertConfig.from_json_string(reader.read())
  bert_params = map_stock_config_to_params(bc)
  bert_params.adapter_size = adapter_size
  # Named l_bert so the imported `bert` module is not shadowed.
  l_bert = BertModelLayer.from_params(bert_params, name="bert")

  input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
  output = l_bert(input_ids)

  print("bert shape", output.shape)
  cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
  logits = keras.layers.Dense(units=1, activation=None)(cls_out)

  model = keras.Model(inputs=input_ids, outputs=logits)
  model.build(input_shape=(None, max_seq_len))

  # load the pre-trained model weights
  load_stock_weights(l_bert, bert_ckpt_file)

  if adapter_size is not None:
    freeze_bert_layers(l_bert)

  # The head emits raw logits, so the accuracy threshold has to sit at 0.0
  # (probability 0.5); the metric's default of 0.5 would be applied to the
  # logits themselves and misreport accuracy.
  model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                loss=keras.losses.BinaryCrossentropy(from_logits=True),
                metrics=[keras.metrics.BinaryAccuracy(name="acc", threshold=0.0)])

  model.summary()

  return model

In [None]:
%%time

# Accumulators filled while training: one accuracy per entrenar() call,
# and `history`, which starts as the placeholder 0.
history = 0
results = []

# WordPiece tokenizer built from the checkpoint's vocabulary file.
vocab_path = os.path.join(bert_ckpt_dir, "vocab.txt")
tokenizer = FullTokenizer(vocab_file=vocab_path)

# Stratified 5-fold splitter.
kf = StratifiedKFold(shuffle=True, n_splits=5, random_state=1)
def entrenar(train_df, val_df):
    """Train one classifier on `train_df` and validate it on `val_df`.

    Side effects:
      * overwrites the global `max_seq_len` with the length actually used;
      * stores the Keras History of the fit in the global `history`;
      * appends the validation accuracy to the global `results`;
      * writes the trained weights to './real_or_not.h5'.

    Args:
        train_df: DataFrame with 'text' and 'target' columns used for fitting.
        val_df: DataFrame with the same columns used for validation.

    Returns:
        The Keras History object returned by model.fit().
    """
    # `history` must be declared global: assigning it as a local would leave
    # the module-level placeholder untouched for plot_history().
    global max_seq_len, history

    tf.keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()
    gc.collect()

    # Two passes. The first feeds the training frame into both slots so that
    # max_seq_len is derived from the training data only; the second builds
    # the real train/validation arrays padded to that length.
    data = Tweets(tokenizer, sample_size=None, max_seq_len=max_seq_len, train=train_df, test=train_df)

    max_seq_len = data.max_seq_len

    data = Tweets(tokenizer, sample_size=None, max_seq_len=max_seq_len, train=train_df, test=val_df)

    model = create_model(max_seq_len, adapter_size=adapter_size)

    # model is a Keras model
    lr_finder = LRFinder(model)

    # Optional learning-rate range test (disabled):
    # lr_finder.find(data.train_x, data.train_y, start_lr=5.14e-6, end_lr=5.16e-6, batch_size=16, epochs=10)
    # lr_finder.plot_loss()

    log_dir = ".log/real_or_not/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

    # `classes` and `y` are keyword-only in current scikit-learn releases.
    class_weights = compute_class_weight('balanced',
                                         classes=np.unique(train_df.target),
                                         y=train_df.target)
    class_weights_dict = dict(enumerate(class_weights))

    history = model.fit(
        x=data.train_x, y=data.train_y, batch_size=32, epochs=2,
        validation_data=(data.test_x, data.test_y),
        # class_weight=class_weights_dict,
        callbacks=[tensorboard_callback,
                   keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)])

    # Optional extra callback (disabled):
    # create_learning_rate_scheduler(max_learn_rate=2e-5,
    #                                end_learn_rate=1e-5,
    #                                warmup_epoch_count=20,
    #                                total_epoch_count=total_epoch_count),

    _, train_acc = model.evaluate(data.train_x, data.train_y)
    _, test_acc = model.evaluate(data.test_x, data.test_y)
    print("train acc", train_acc)
    print(" test acc", test_acc)
    results.append(test_acc)
    model.save_weights('./real_or_not.h5', overwrite=True)
    tf.keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()
    gc.collect()
    return history

# Load the dataset splits.
# NOTE(review): get_data_sentiment_analysis_as_database() is not defined in
# this notebook - presumably it comes from the %run of 0_DataLoader.ipynb.
datos = get_data_sentiment_analysis_as_database()

test_data = datos.test
train_df = datos.train
val_data = datos.validation
predict = datos.predict

# Single training run: fit on the train split, validate on the validation split.
entrenar(train_df, val_data)

# Disabled alternative: K-fold training.
# split, df_train, test_data = get_k_folded_data_original_as_database()

# # Keep the class proportions in every K-Fold
# for train_index, val_index in split:
#     train_df = df_train.iloc[train_index]
#     val_df = df_train.iloc[val_index]
#     entrenar(train_df, val_df)


# Each entry of `results` is the validation accuracy appended by one
# entrenar() call; the value labelled "Mean-Precision" is their mean.
print("results",results)
print(f"Mean-Precision: {sum(results) / len(results)}")


In [None]:
%%time 
# model.save_weights('./real_or_not.h5', overwrite=True)
# Evaluate the saved weights on the held-out test split.
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_ckpt_dir, "vocab.txt"))

# train_df is passed again only so Tweets can build its train arrays;
# just the test arrays are used below.
data = Tweets(tokenizer, sample_size=None, max_seq_len=max_seq_len, train=train_df, test=test_data)
model = create_model(max_seq_len, adapter_size=adapter_size)
model.load_weights("real_or_not.h5")

_, test_acc = model.evaluate(data.test_x, data.test_y)

# This is the accuracy on the test split (it was previously printed under
# the label "train acc").
print("test acc", test_acc)

In [None]:
def plot_history(history):
    """Draw the per-epoch training accuracy and training loss of a fit.

    Args:
        history: object whose `.history` dict has 'acc' and 'loss' series.
    """
    series = history.history
    acc_values = series['acc']
    loss_values = series['loss']
    epochs = range(1, len(acc_values) + 1)

    plt.figure(figsize=(12, 5))
    curves = (
        (acc_values, 'r', 'Training acc'),
        (loss_values, 'b', 'Training loss'),
    )
    for values, fmt, label in curves:
        plt.plot(epochs, values, fmt, label=label)
    plt.title('Training accuracy vs loss')
    plt.legend()


plot_history(history)

In [None]:
predict = datos.predict

# Tokenise every tweet to be predicted as [CLS] ... [SEP] token ids.
# The sequences are ragged, so they stay in a plain list (building an
# ndarray from them raises ValueError on NumPy >= 1.24).
encoded = []
for ndx, row in predict.iterrows():
    tokens = ["[CLS]"] + tokenizer.tokenize(row['text']) + ["[SEP]"]
    encoded.append(list(tokenizer.convert_tokens_to_ids(tokens)))

# Truncate / right-pad every sequence to max_seq_len, the exact input width
# of the model built below, using the same rule as Tweets._pad().
x, t = [], []
token_type_ids = [0] * max_seq_len
for input_ids in encoded:
    input_ids = input_ids[:max_seq_len - 2]
    input_ids = input_ids + [0] * (max_seq_len - len(input_ids))
    x.append(np.array(input_ids))
    t.append(token_type_ids)
test_x, test_x_token_types = np.array(x), np.array(t)

model = create_model(max_seq_len, adapter_size=adapter_size)
model.load_weights("real_or_not.h5")

# The model outputs raw logits, so the decision boundary is 0
# (sigmoid(0) == 0.5), not 0.5.
y_test = (model.predict(test_x, batch_size=16, verbose=1) > 0.0).astype("int32")
submission = pd.read_csv('../dataset/sample_submission.csv')
# predict() returns shape (n, 1); flatten it to a 1-D column.
submission['target'] = y_test.ravel()
submission.to_csv("submission3.csv", index=False)

# !kaggle competitions submit nlp-getting-started -f submission3.csv -m 'BERT'

In [None]:
# Fraction of the test split that falls in each target class.
test_data.target.value_counts().div(len(test_data.target))