#**Installs & Imports**

In [1]:
!pip install transformers
#!pip install tensorflow-addons

import re
import os
import logging
import random
import math

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold

from transformers import RobertaTokenizerFast, TFRobertaModel

import tensorflow as tf
tf.get_logger().setLevel(logging.ERROR)
#import tensorflow_addons as tfa
from tensorflow.keras import backend as K
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Embedding, Reshape, Flatten, Dropout, GRU, Dense, RepeatVector, Dense, Activation, Lambda, Softmax, Conv1D, LayerNormalization, Softmax, Multiply, BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.callbacks import ReduceLROnPlateau,ModelCheckpoint, EarlyStopping ,LearningRateScheduler



#**Transformers:** RoBERTa-Large & RoBERTa-Base

In [2]:
# Local directories that hold the pretrained tokenizer files
TOKENIZER_LARGE_DIR = '../input/initial-tokenizer-large/tokenizer-large'
TOKENIZER_BASE_DIR = '../input/initial-tokenizer-base/tokenizer-base'

# RoBERTa-Large tokenizer first, then RoBERTa-Base, each loaded from its directory
tokenizer_large = RobertaTokenizerFast.from_pretrained(TOKENIZER_LARGE_DIR)
tokenizer_base = RobertaTokenizerFast.from_pretrained(TOKENIZER_BASE_DIR)

#**Preprocessing**

Features:
1. RoBERTa-Large tokenizers
2. RoBERTa-Base tokenizers
3. Rarity per word (GloVe)
4. Characters per word
5. Words per sentence

Labels:
1. Target score - Readability

In [3]:
# Feature 1+2: 
# RoBERTa tokenizers

def tokenize_texts(texts, tokenizer, max_len):
    """Encode a batch of texts into fixed-length id and mask arrays.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series) that
            yields the raw strings to encode.
        tokenizer: Tokenizer providing ``batch_encode_plus``.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` read from the tokenizer output.
    """
    encode_options = dict(padding='max_length',
                          max_length=max_len,
                          truncation=True,
                          return_tensors='np')
    encoded = tokenizer.batch_encode_plus(texts.tolist(), **encode_options)

    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

In [4]:
# Combine all features

def prepare_sets(address, max_len):
    """Load a CSV of excerpts and tokenize it for both RoBERTa variants.

    Args:
        address: Path of the CSV file to read; it must contain an
            ``excerpt`` column.
        max_len: Sequence length forwarded to ``tokenize_texts``.

    Returns:
        Tuple ``(ids_large, msk_large, ids_base, msk_base, targets, df)``.
        ``targets`` is a NumPy array built from the ``target`` column when
        the file has one, otherwise ``None``. ``df`` is the loaded dataframe.
    """
    df = pd.read_csv(address)
    excerpts = df['excerpt']

    # Feature 1: RoBERTa-Large ids and attention mask
    ids_large, msk_large = tokenize_texts(excerpts, tokenizer_large, max_len)

    # Feature 2: RoBERTa-Base ids and attention mask
    ids_base, msk_base = tokenize_texts(excerpts, tokenizer_base, max_len)

    # Labels: decide by column presence. Comparing the path against the bare
    # string 'train.csv' never matched when a directory-qualified path was
    # passed, so the training targets were silently dropped.
    targets = np.array(df['target']) if 'target' in df.columns else None

    return ids_large, msk_large, ids_base, msk_base, targets, df

In [5]:
# Features + Labels:
# Combine all features and labels into tensors to feed into model
# It is important that the input order remains the same until this point

def transform_to_tensors(x_trn, y_trn, x_val, y_val, seed, batch_size):
    """Wrap the training and validation arrays in batched ``tf.data`` pipelines.

    Args:
        x_trn: Indexable of four arrays in the order (RoBERTa-Large ids,
            RoBERTa-Large mask, RoBERTa-Base ids, RoBERTa-Base mask).
        y_trn: Training labels aligned with the rows of ``x_trn``.
        x_val: Validation inputs, same layout as ``x_trn``.
        y_val: Validation labels aligned with the rows of ``x_val``.
        seed: Random seed used for shuffling the training set.
        batch_size: Number of examples per batch.

    Returns:
        Tuple ``(train_dataset, valid_dataset)``. Only the training dataset
        is shuffled; both are batched and prefetched.
    """

    def _as_dataset(features, labels):
        # The dict keys must match the names of the model's Input layers.
        named_inputs = {'input_roberta_large_ids': features[0],
                        'input_roberta_large_msk': features[1],
                        'input_roberta_base_ids':  features[2],
                        'input_roberta_base_msk':  features[3]}
        return tf.data.Dataset.from_tensor_slices((named_inputs, labels))

    # The first positional argument of Dataset.shuffle is the buffer size, not
    # the seed. Use a buffer covering the whole training set for a uniform
    # shuffle and pass the seed by keyword so the order is reproducible.
    buffer_size = max(len(x_trn[0]), 1)

    train_dataset = (_as_dataset(x_trn, y_trn)
                     .shuffle(buffer_size, seed=seed)
                     .batch(batch_size)
                     .prefetch(tf.data.experimental.AUTOTUNE))

    valid_dataset = (_as_dataset(x_val, y_val)
                     .batch(batch_size)
                     .prefetch(tf.data.experimental.AUTOTUNE))

    return train_dataset, valid_dataset

#**Modeling**

In [6]:
def create_model(max_len, learning_rate, dc):
    """Build and compile the RoBERTa-Large + RoBERTa-Base regression ensemble.

    Each of the two encoders is applied twice: once to tokens ``[:128]`` and
    once to tokens ``[128:]`` of its input. For every segment the hidden state
    at sequence position 0 is reduced to a single value by a linear Dense
    layer. The four resulting values are blended with softmax coefficients
    that are themselves computed from those four values.

    Args:
        max_len: Length of every id / mask input sequence.
        learning_rate: Learning rate given to the Adam optimizer.
        dc: Not used anywhere in this function body.

    Returns:
        A compiled ``tf.keras`` ``Model`` with four int64 inputs named
        ``input_roberta_large_ids``, ``input_roberta_large_msk``,
        ``input_roberta_base_ids`` and ``input_roberta_base_msk``, and one
        scalar output per example. Loss is mean squared error; the tracked
        metric is root mean squared error.

    NOTE(review): the split point 128 is hard-coded, so the two segments are
    equal halves only when ``max_len`` is 256 -- confirm before using another
    ``max_len``.
    NOTE(review): layers are created without explicit names; presumably their
    creation order has to stay as it is for previously saved weights to load
    -- verify before restructuring.
    """
    
    ################
    #    Input     #
    ################
    # Both pretrained encoders are read from local directories.
    roberta_large = TFRobertaModel.from_pretrained('../input/initial-roberta-large/roberta-large')
    roberta_base = TFRobertaModel.from_pretrained('../input/initial-roberta-base/roberta-base')

    # RoBERTa-Large
    ids_large = Input(shape=(max_len,), dtype='int64', name='input_roberta_large_ids')
    msk_large = Input(shape=(max_len,), dtype='int64', name='input_roberta_large_msk')

    # RoBERTa-Base
    ids_base = Input(shape=(max_len,), dtype='int64', name='input_roberta_base_ids')
    msk_base = Input(shape=(max_len,), dtype='int64', name='input_roberta_base_msk')

    ################
    #  Processing  #
    ################

    # A rate of 0 makes every Dropout layer below a pass-through.
    dropout_rate = 0

    # RoBERTa-Large
    
    # Segment 1: tokens [0, 128). Only the hidden state at position 0 is kept
    # and mapped to one value by an L2-regularized linear layer.
    ids_large_1 = ids_large[:, :128]
    msk_large_1 = msk_large[:, :128]
    x_large_1 = roberta_large(input_ids = ids_large_1, attention_mask = msk_large_1)['last_hidden_state']
    x_large_1 = x_large_1[:, 0, :]
    x_large_1 = Dropout(dropout_rate)(x_large_1)
    x_large_1 = Dense(1, activation='linear', kernel_regularizer=regularizers.l2(1e-2))(x_large_1)
    
    # Segment 2: tokens [128, max_len), run through the same encoder object
    # (shared encoder weights) but with its own Dense head.
    ids_large_2 = ids_large[:, 128:]
    msk_large_2 = msk_large[:, 128:]
    x_large_2 = roberta_large(input_ids = ids_large_2, attention_mask = msk_large_2)['last_hidden_state']
    x_large_2 = x_large_2[:, 0, :]
    x_large_2 = Dropout(dropout_rate)(x_large_2)
    x_large_2 = Dense(1, activation='linear', kernel_regularizer=regularizers.l2(1e-2))(x_large_2)
    
    # RoBERTa-Base
    # Same two-segment treatment as for RoBERTa-Large.
    ids_base_1 = ids_base[:, :128]
    msk_base_1 = msk_base[:, :128]
    x_base_1 = roberta_base(input_ids = ids_base_1, attention_mask = msk_base_1)['last_hidden_state']
    x_base_1 = x_base_1[:, 0, :]
    x_base_1 = Dropout(dropout_rate)(x_base_1)
    x_base_1 = Dense(1, activation='linear', kernel_regularizer=regularizers.l2(1e-2))(x_base_1)

    ids_base_2 = ids_base[:, 128:]
    msk_base_2 = msk_base[:, 128:]
    x_base_2 = roberta_base(input_ids = ids_base_2, attention_mask = msk_base_2)['last_hidden_state']
    x_base_2 = x_base_2[:, 0, :]
    x_base_2 = Dropout(dropout_rate)(x_base_2)
    x_base_2 = Dense(1, activation='linear', kernel_regularizer=regularizers.l2(1e-2))(x_base_2)
    
    ################
    #    Output    #
    ################

    # Stack the four per-segment values, derive four softmax coefficients from
    # them, and weight each value by its coefficient.
    pred = Concatenate(axis=-1)([x_large_1, x_large_2, x_base_1, x_base_2])
    coef = Dense(4, activation='softmax', kernel_regularizer=regularizers.l2(1e-2))(pred)
    pred = Multiply()([pred, coef])
    # Frozen layer whose kernel is initialized to all ones: it adds up the
    # four weighted values (plus the layer's bias term) into the final output.
    pred = Dense(1, activation='linear', kernel_initializer=Constant(1), trainable=False)(pred)

    ################
    #     Model    #
    ################

    model = Model(inputs=[ids_large, msk_large, ids_base, msk_base], outputs=pred)

    #opt = tfa.optimizers.AdamW(weight_decay=0.1, learning_rate=learning_rate)

    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss = [tf.keras.losses.MeanSquaredError()],
                  metrics = [tf.keras.metrics.RootMeanSquaredError()])
  
    return model

#**Training & Evaluating**

In [7]:
# Function to seed everything
def seed_everything(seed):
    """Apply one seed to Python's, NumPy's and TensorFlow's random generators.

    Also stores the seed, as a string, in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Seed value handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

In [8]:
def train_model(max_len, lr, dc, batch, epoch_size, cp_path, seed):
    """Build the ensemble, restore its saved weights and predict the test set.

    Despite the name, no fitting happens here: the model is created, weights
    are loaded from a fixed checkpoint file, and the competition test file is
    scored.

    Args:
        max_len: Sequence length used for tokenization and the model inputs.
        lr: Learning rate forwarded to ``create_model``.
        dc: Forwarded to ``create_model``.
        batch: Accepted for interface compatibility; not used here.
        epoch_size: Accepted for interface compatibility; not used here.
        cp_path: Accepted for interface compatibility; not used here. The
            weights are read from the hard-coded checkpoint path below.
        seed: Seed applied to all random generators before anything else.

    Returns:
        Tuple ``(test_pred, model)``: the predictions for the test file and
        the model that produced them.
    """
    # Seed first so model construction is reproducible; the seed argument
    # was previously accepted but never applied.
    seed_everything(seed)

    # Only the test set is needed for inference. Tokenizing the whole training
    # set with both tokenizers produced a result that was never read.
    ids_large, msk_large, ids_base, msk_base, _, _ = prepare_sets(
        '../input/commonlitreadabilityprize/test.csv', max_len)

    # Create model and restore the trained weights
    model = create_model(max_len, lr, dc)
    model.summary()
    model.load_weights('../input/ckpt-clrp-1h5/ckpt_clrp.h5')

    # Predict test; list order matches the order of the model's inputs
    test_pred = model.predict([ids_large, msk_large, ids_base, msk_base])

    return test_pred, model

In [9]:
# Run configuration
MAX_LEN = 256              # sequence length used for tokenization and model inputs
LR = 1e-5                  # learning rate handed to the optimizer
DECAY = 0                  # passed through as `dc`
BATCH_SIZE = 4             # passed through as `batch`
EPOCHS = 5                 # passed through as `epoch_size`
CP_PATH = 'ckpt_clrp.h5'   # passed through as `cp_path`
SEED = 123                 # passed through as `seed`

# output[0] holds the test predictions, output[1] the model
output = train_model(max_len=MAX_LEN,
                     lr=LR,
                     dc=DECAY,
                     batch=BATCH_SIZE,
                     epoch_size=EPOCHS,
                     cp_path=CP_PATH,
                     seed=SEED)

All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/initial-roberta-large/roberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.
All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/initial-roberta-base/roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_roberta_large_ids (InputL [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_roberta_large_msk (InputL [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_roberta_base_ids (InputLa [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_roberta_base_msk (InputLa [(None, 256)]        0                                            
______________________________________________________________________________________________

In [10]:
# Pair every test id with its flattened prediction and write the submission file
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
flat_pred = output[0].reshape(-1)
pred_df = dict(id=test_df['id'], target=flat_pred)
pd.DataFrame(pred_df).to_csv('submission.csv', index=False)

In [11]:
#output[1].save_weights(CP_PATH)