In [None]:
!pip install transformers

In [9]:
# All libraries
import re
import os
import logging
import random
import math
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from transformers import RobertaTokenizerFast, TFRobertaModel
import tensorflow as tf
tf.get_logger().setLevel(logging.ERROR)
from tensorflow.keras import backend as K
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Embedding, Reshape, Flatten, Dropout, GRU, Dense, RepeatVector, Dense, Activation, Lambda, Softmax, Conv1D, LayerNormalization, Softmax, Multiply
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping ,LearningRateScheduler
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError

In [11]:
class Model():
  """Template for fine-tuning a pretrained RoBERTa encoder on a text regression task.

  Pipeline: split a dataframe into feature/label arrays (`process_df`), tokenize
  the text column (`tokenize_texts`), wrap the encoder in a Keras model
  (`build_model`) and fit it with 5-fold cross-validation (`train_model`).

  NOTE: this class is itself called `Model`, which shadows any Keras `Model`
  imported into the same namespace. Inside the class the Keras functional model
  is therefore always referenced as `tf.keras.Model`.
  """

  def __init__(self, pretrained_name="roberta-base"):
    """Remember which checkpoint to load.

    Args:
      pretrained_name: Hugging Face model identifier, such as "roberta-base"
        or "roberta-large".
    """
    self.pretrained_name = pretrained_name


  def process_df(self, dataframe, label_names):
    """Split the columns of `dataframe` into feature arrays and label arrays.

    Args:
      dataframe: mapping-like table (e.g. a pandas DataFrame) supporting
        `.keys()` and `dataframe[key]`.
      label_names: collection of column names to treat as labels.

    Returns:
      A `(features, labels)` tuple of lists. Each list element is a numpy array
      holding one column for all datapoints; columns keep the dataframe order.
    """
    features = []
    labels = []
    for key in dataframe.keys():
      column = np.array(dataframe[key])
      # list.append mutates in place and returns None, so its result must not
      # be assigned back to the list.
      if key in label_names:
        labels.append(column)
      else:
        features.append(column)
    return (features, labels)


  def load_pretrained(self):
    """Load the pretrained tokenizer and encoder named by `self.pretrained_name`.

    Returns:
      A `(tokenizer, roberta)` tuple: the tokenizer turns text into numerical
      inputs, the encoder consumes those inputs.
    """
    tokenizer = RobertaTokenizerFast.from_pretrained(self.pretrained_name)
    roberta = TFRobertaModel.from_pretrained(self.pretrained_name)
    return tokenizer, roberta


  def tokenize_texts(self, texts, tokenizer, max_tweet_len):
    """Convert raw texts into fixed-length numerical model inputs.

    Args:
      texts: the texts to encode; a numpy array / pandas Series (anything with
        `.tolist()`) or a plain iterable of strings.
      tokenizer: object exposing `batch_encode_plus`, e.g. the tokenizer
        returned by `load_pretrained`.
      max_tweet_len: every text is padded or truncated to this many tokens.

    Returns:
      `(input_ids, attention_mask)` as returned by the tokenizer under those
      two keys.
    """
    text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    # The tokenizer returns a dict-like encoding holding several arrays.
    tokenized = tokenizer.batch_encode_plus(text_list, padding='max_length', max_length=max_tweet_len, truncation=True, return_tensors='np')

    # Only these two entries are used for learning.
    return tokenized['input_ids'], tokenized['attention_mask']


  @staticmethod
  def _compile(model, learning_rate, decay):
    """Compile `model` in place with a fresh Adam optimizer, MSE loss and RMSE metric."""
    model.compile(optimizer=Adam(learning_rate=learning_rate, decay=decay), loss=MeanSquaredError(), metrics=[RootMeanSquaredError()])


  @staticmethod
  def _checkpoint_exists(path):
    """Return True if weights were saved under `path`.

    Single-file formats live at `path` itself; TensorFlow-format checkpoints
    are written as `<path>.index` plus data shards.
    """
    return os.path.exists(path) or os.path.exists(path + ".index")


  def build_model(self, featrues, labels, max_tweet_len, learning_rate, decay, roberta=None):
    """Build and compile the rough fine-tuning template around the encoder.

    The model takes `[ids, mask, extra_1, ..., extra_k]` as inputs. The encoder
    output at the first (classification) token goes through dropout and a
    linear head with one unit per label.

    Args:
      featrues: (sic; the misspelled name is kept so keyword callers keep
        working) list of the extra, non-text features. Only its length is
        used: one scalar input is created per element.
      labels: list of label arrays, one per target. Only its length is used,
        to size the output head (at least one unit).
      max_tweet_len: token length of the `ids` and `mask` inputs.
      learning_rate: learning rate passed to Adam.
      decay: decay passed to Adam.
      roberta: optional already-loaded encoder; when None it is loaded with
        `load_pretrained`.

    Returns:
      The compiled Keras model.
    """
    extra_features = featrues
    if roberta is None:
      _, roberta = self.load_pretrained()

    # Inputs: token IDs and attention mask, then one scalar per extra feature.
    ids = Input(shape=(max_tweet_len,), dtype='int64')
    mask = Input(shape=(max_tweet_len,), dtype='int64')
    inputs = [ids, mask]
    for _ in range(len(extra_features)):
      inputs.append(Input(shape=(1,)))

    # Input goes through the pretrained encoder.
    x = roberta(input_ids=ids, attention_mask=mask)['last_hidden_state']
    # Use only the classification token (position 0) of the output.
    x = x[:, 0, :]
    # Regularization. Drop rate ([0, 1]) is usually shared among many layers.
    drop_rate = 0.2
    x = Dropout(drop_rate)(x)

    # Fine-tuning
    # Add more layers on top of the encoder here: ...
    # Example: z = Dense(10, ...)(inputs[2])
    # The extra feature inputs are part of the model signature but are not
    # connected to the head until such layers are added.

    # Final output: one linear unit per label. A single head keeps the metric
    # name "root_mean_squared_error", which the training callbacks monitor.
    n_outputs = max(len(labels), 1)
    prediction = Dense(n_outputs, activation='linear')(x)

    # `Model` here would resolve to this very class, so use the qualified name.
    model = tf.keras.Model(inputs=inputs, outputs=prediction)
    self._compile(model, learning_rate, decay)
    return model


  @staticmethod
  def seed_everything(seed):
    """Seed Python's `random`, numpy and TensorFlow, and export PYTHONHASHSEED.

    A staticmethod, so it can be called on the class or on an instance.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)


  def train_model(self, dataframe, label_names, max_tweet_len, learning_rate, decay, batch, epoch_size, cp_path, seed, text_name=None):
    """Combine all the previous steps and train the model with 5-fold CV.

    Args:
      dataframe: table holding one text column, optional extra numeric feature
        columns and the label columns.
      label_names: names of the label columns.
      max_tweet_len: token length used for padding/truncation.
      learning_rate: learning rate passed to Adam.
      decay: decay passed to Adam.
      batch: batch size for `fit`.
      epoch_size: maximum number of epochs per fold.
      cp_path: checkpoint path prefix; fold k uses `cp_path + str(k)`.
      seed: seed for all random generators and for the fold shuffling.
      text_name: name of the text column. Defaults to the first non-label
        column.

    Returns:
      The model, holding the best checkpointed weights of the last fold.

    Raises:
      ValueError: if there is no feature column, no label column, or
        `text_name` is not a non-label column. Extra feature or label columns
        that cannot be converted to float32 also raise ValueError (from numpy).
    """
    # Seed first, so that the weights created by build_model are reproducible.
    self.seed_everything(seed)

    # Process information from dataframe. process_df keeps column order, so
    # feature_names[i] names features[i].
    feature_names = [key for key in dataframe.keys() if key not in label_names]
    features, labels = self.process_df(dataframe, label_names)
    if not features:
      raise ValueError("dataframe has no feature column; a text column is required")
    if not labels:
      raise ValueError("none of label_names is a column of dataframe")

    if text_name is None:
      text_index = 0
    elif text_name in feature_names:
      text_index = feature_names.index(text_name)
    else:
      raise ValueError("text_name %r is not a feature column of dataframe" % (text_name,))

    # Text becomes (ids, mask); each remaining feature becomes an (n, 1) float
    # column matching the scalar inputs created by build_model.
    extra_features = [np.asarray(ft, dtype='float32').reshape(-1, 1) for i, ft in enumerate(features) if i != text_index]
    targets = np.column_stack(labels).astype('float32')

    tokenizer, roberta = self.load_pretrained()
    input_ids, attention_mask = self.tokenize_texts(features[text_index], tokenizer, max_tweet_len)
    model_inputs = [input_ids, attention_mask] + extra_features

    # Create model
    model = self.build_model(extra_features, labels, max_tweet_len, learning_rate, decay, roberta=roberta)
    model.summary()
    # Snapshot of the untrained weights, restored at the start of every fold so
    # that no fold is validated on data the model has already been trained on.
    initial_weights = model.get_weights()

    # Use 5-fold CV
    kfold = KFold(n_splits = 5, shuffle = True, random_state = seed)

    # KFold.split needs the samples themselves (array-like), not their count.
    for fold, (train_indexes, val_indexes) in enumerate(kfold.split(input_ids)):
      # Verbosity
      print('\nFold', fold+1, '*'*50)
      fold_path = cp_path + str(fold+1)

      # Fresh weights and optimizer state for this fold.
      model.set_weights(initial_weights)
      self._compile(model, learning_rate, decay)
      # Resume from an earlier checkpoint of this fold when one exists.
      if self._checkpoint_exists(fold_path):
        model.load_weights(fold_path)

      # Callbacks are created per fold: the checkpoint is written to the same
      # path it is later read from, and no "best so far" carries over.
      checkpoint = ModelCheckpoint(fold_path, monitor = "val_root_mean_squared_error", verbose = 2, save_best_only = True, save_weights_only = True, mode = 'min')
      reduce_lr = ReduceLROnPlateau(monitor="val_root_mean_squared_error", factor=0.8, patience=5, min_lr=1e-8)
      early_stopping = EarlyStopping(monitor="val_root_mean_squared_error", min_delta=0, patience=5, verbose=2, mode="min", restore_best_weights=True)

      # Extract train/validation sets
      train_inputs = [arr[train_indexes] for arr in model_inputs]
      val_inputs = [arr[val_indexes] for arr in model_inputs]
      train_targets = targets[train_indexes]
      val_targets = targets[val_indexes]

      # Training and Validating
      model.fit(train_inputs, train_targets, batch_size=batch, epochs=epoch_size, validation_data = (val_inputs, val_targets), callbacks = [checkpoint, reduce_lr, early_stopping])
      # Reload the best weights of this fold if a checkpoint was written.
      if self._checkpoint_exists(fold_path):
        model.load_weights(fold_path)

    return model