In [1]:
!pip install tensorflow-gpu==1.15
# @title Preparation
!pip install -q keras-bert keras-rectified-adam
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import tensorflow as tf
import keras
from keras_radam import RAdam
from keras_bert import get_custom_objects
import numpy as np
from tqdm import tqdm
from keras_bert import Tokenizer
import pandas as pd
import tensorflow.keras.backend as K
import sys
from sklearn.metrics import classification_report
from google.colab import drive

Using TensorFlow backend.


In [0]:
# @title Constants

np.random.seed(42)
SEQ_LEN = 128
BATCH_SIZE = 8
EPOCHS = 5
LR = 1e-5

In [0]:
# @title Environment
import os
pretrained_path = '/content/drive/My Drive/codiesp/alberto_tweets_uncased_L-12_H-768_A-12/'
config_path = os.path.join(pretrained_path, 'bert_config.json')
checkpoint_path = os.path.join(pretrained_path, 'model.ckpt-1000000')
vocab_path = os.path.join(pretrained_path, 'vocabulary_lower_case_128.txt')

In [5]:
# @title Load Basic Model
import sys

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
  sys.path += ['bert_repo']

# import python modules defined by BERT
from run_classifier import *
import modeling
import optimization
import tokenization

import codecs
from keras_bert import load_trained_model_from_checkpoint

token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)





In [6]:
# @title Load Data

from keras import Sequential
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer

!pip install ekphrasis
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

import joblib
import pandas as pd
import numpy as np
import random
import re

import keras
from keras.layers import Input
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint

!pip install ndjson
import ndjson



def _pad(input_ids, max_seq_len):
    x = []
    input_ids = input_ids[:min(len(input_ids), max_seq_len - 2)]
    input_ids = input_ids + [0] * (max_seq_len - len(input_ids))
    return np.array(input_ids)

#LOADING DATASET
#Load the dataset
dataframe = pd.DataFrame()

with open('ate_absita_training.ndjson') as f:
            reader = ndjson.reader(f)

            for post in reader:
                df = pd.DataFrame([post], columns=post.keys())
                dataframe = pd.concat([dataframe, df],
                                           axis=0,
                                           ignore_index=True)
sentences = dataframe['sentence']
examples_test = []


#Inizialize Text preprocessor
text_processor = TextPreProcessor (
    # terms that will be normalized
    normalize=[ 'url' , 'email', 'user', 'percent', 'money', 'phone', 'time', 'date', 'number'] ,
    # terms that will be annotated
    annotate={"hashtag"} ,
    fix_html=True ,  # fix HTML tokens

    unpack_hashtags=True ,  # perform word segmentation on hashtags

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts = [ emoticons ]
)

examples_test = []
i = 0
for s in sentences:
    s = s.lower()
    s = str(" ".join(text_processor.pre_process_doc(s)))
    s = re.sub(r"[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]", ' ', s)
    s = re.sub(r"\s+", ' ', s)
    s = re.sub(r'(\w)\1{2,}',r'\1\1', s)
    s = re.sub ( r'^\s' , '' , s )
    s = re.sub ( r'\s$' , '' , s )
    #print("Processing:---> "+s)
    examples_test.append(s)
    i = i+1

#Tokenization
#Inizialize the tokenizer
tokenizer = tokenization.FullTokenizer(vocab_path, do_lower_case=True)
indices_train = []

for text in examples_test:
  tk = tokenizer.tokenize(text)
  tokens = ["[CLS]"] + tk + ["[SEP]"]
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  token_ids = _pad(token_ids,SEQ_LEN)
  indices_train.append(token_ids)

indices_train = [indices_train, np.zeros_like(indices_train)]

train_labels = dataframe['score']

Reading english - 1grams ...
Reading english - 2grams ...
Reading english - 1grams ...



In [7]:
len(train_labels)

3054

In [8]:
bert = load_trained_model_from_checkpoint(
    config_file=config_path,
    checkpoint_file=checkpoint_path,
    training=True,
    trainable=True,
    seq_len=128
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
# @title Build Custom Model

inputs = bert.inputs[:2]
dense = bert.get_layer('NSP-Dense').output
dense1 = keras.layers.Dense(units=500, activation='relu') (dense)
outputs = keras.layers.Dense(units=1, activation='linear')(dense1)

modelk = keras.models.Model(inputs, outputs)

def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 

modelk.compile(
    optimizer='adam',
    #optimizer='sgd',
    loss=root_mean_squared_error,
    #loss=tf.keras.losses.MeanAbsoluteError(),
    metrics=[root_mean_squared_error]
)

In [0]:
# @title Initialize Variables
sess = K.get_session()
uninitialized_variables = set([i.decode('ascii') for i in sess.run(tf.report_uninitialized_variables())])
init_op = tf.variables_initializer(
    [v for v in tf.global_variables() if v.name.split(':')[0] in uninitialized_variables]
)
sess.run(init_op)

In [0]:
# @title Fit

filepath="/content/drive/My Drive/codiesp/ate_absita/alberto_sa.{epoch:05d}-{root_mean_squared_error:.5f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='root_mean_squared_error', verbose=1, save_best_only=True, mode='min')

#RMSE function
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

import datetime
import numpy as np

class MyCustomCallback(tf.keras.callbacks.Callback):
  def on_train_begin(self, logs=None):
    # Initialize the best as infinity.
    self.best = np.Inf

  def on_epoch_end(self, batch, logs=None):
    # @title Predict
    predicts = self.model.predict(indices_train, verbose=True)
    predictions= []
    for a in predicts:
      predictions.append(a[0])
    rmse_val = rmse(train_labels, predictions)
    if rmse_val < self.best:
      self.best = rmse_val
      self.model.save('alberto_best_model_'+str(rmse_val)+'.h5')
    print("RMSE is: "+str(rmse_val))
    print('Evaluating: batch {} ends at {}'.format(batch, datetime.datetime.now().time()))


callbacks_list = [
    MyCustomCallback()
]

modelk.fit(
    indices_train,
    train_labels,
    epochs=10,
    batch_size=32,
    callbacks=callbacks_list
)

In [0]:
#modelk.save('/content/drive/My Drive/codiesp (1)/ate_absita/alberto_final_model_01.h5')
#modelk.load_weights('/content/drive/My Drive/codiesp/ate_absita/alberto_sa_00009-0_95063_1_0334.hdf5')

In [0]:
# @title Predict
predicts = modelk.predict(indices_train, verbose=True)

In [0]:
#RMSE function
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())



In [0]:
predictions= []
for a in predicts:
  predictions.append(a[0])

In [0]:
rmse_val = rmse(train_labels, predictions)
print("RMS error is: " + str(rmse_val))

#RMS error is: 1.03338