In [1]:
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import logging
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tensorflow.keras import backend as K
from transformers import RobertaTokenizerFast, TFXLMRobertaModel
from kaggle_datasets import KaggleDatasets
tf.get_logger().setLevel(logging.ERROR)
from kaggle_datasets import KaggleDatasets
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Embedding, Reshape, Flatten, Dropout, GRU, Dense
from keras.layers import RepeatVector, Dense, Activation, Lambda, Softmax, Conv1D
from keras.optimizers import Adam, SGD
from keras.utils import to_categorical
from keras.models import load_model, Model
from tensorflow.keras.callbacks import ReduceLROnPlateau,ModelCheckpoint, EarlyStopping ,LearningRateScheduler

In [2]:
# Load the tokenizer and the pretrained transformer backbone from local
# dataset directories (no download is attempted for a local path).
# Both are module-level globals: `tokenizer` is read by train_model and
# `roberta` is the shared backbone layer that create_model wires into the
# Keras graph.
tokenizer = RobertaTokenizerFast.from_pretrained('../input/initial-tokenizer-large/tokenizer-large')
# NOTE(review): the checkpoint directory is named "roberta-large" while the
# class is TFXLMRobertaModel -- confirm the tokenizer and checkpoint match.
roberta = TFXLMRobertaModel.from_pretrained('../input/initial-xlmroberta-large/roberta-large')

All model checkpoint layers were used when initializing TFXLMRobertaModel.

All the layers of TFXLMRobertaModel were initialized from the model checkpoint at ../input/initial-xlmroberta-large/roberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.


In [3]:
# Tokenize a bunch of texts
def tokenize_texts(texts, tokenizer, max_len):
    """Encode a collection of texts into fixed-length id / mask arrays.

    Args:
        texts: The texts to encode. May be a pandas Series or NumPy array
            (anything exposing ``tolist()``), any other iterable of strings
            (list, tuple, generator), or a single string, which is treated
            as a collection of one text.
        tokenizer: Object exposing ``batch_encode_plus`` (a Hugging Face
            tokenizer in this notebook).
        max_len: Every sequence is padded / truncated to this many tokens.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output,
        requested as NumPy arrays (``return_tensors='np'``).
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into characters by list().
        text_list = [texts]
    elif hasattr(texts, 'tolist'):
        # Series / ndarray path: identical to the original behaviour.
        text_list = texts.tolist()
    else:
        text_list = list(texts)

    tok = tokenizer.batch_encode_plus(text_list,
                                      padding='max_length',
                                      max_length=max_len,
                                      truncation=True,
                                      return_tensors='np')
    return tok['input_ids'], tok['attention_mask']

# Prepare sets
def prepare_sets(is_train_csv, tokenizer, max_len):
    """Read one competition CSV and tokenize its ``excerpt`` column.

    Args:
        is_train_csv: File name inside the competition input directory.
            Despite the name it is a file name, not a flag; targets are only
            extracted when it equals ``'train.csv'``.
        tokenizer: Tokenizer forwarded to ``tokenize_texts``.
        max_len: Padded / truncated sequence length.

    Returns:
        Tuple ``(ids, mask, targets, df)`` where ``targets`` is a NumPy array
        of the ``target`` column for ``'train.csv'`` and ``None`` otherwise,
        and ``df`` is the DataFrame that was read.
    """
    frame = pd.read_csv('../input/commonlitreadabilityprize/' + is_train_csv)

    # Model inputs: token ids and the matching attention mask
    input_ids, attention_mask = tokenize_texts(frame['excerpt'], tokenizer, max_len)

    # Labels exist only in the training file
    labels = np.array(frame['target']) if is_train_csv == 'train.csv' else None

    return input_ids, attention_mask, labels, frame

In [4]:
# Function to transform arrays to tensors
def transform_to_tensors(x_train, x_val, y_train, y_val, batch_size=8):
    """Wrap train / validation arrays into batched ``tf.data`` pipelines.

    Args:
        x_train: Indexable pair ``(ids, mask)`` for the training split.
        x_val: Indexable pair ``(ids, mask)`` for the validation split.
        y_train: Training targets, sliced together with ``x_train``.
        y_val: Validation targets, sliced together with ``x_val``.
        batch_size: Examples per batch for both datasets. Defaults to 8,
            the value that used to be hard-coded.

    Returns:
        Tuple ``(train_dataset, valid_dataset)``. Each element is a
        ``(features, target)`` pair whose features are a dict keyed
        ``'input_1'`` (ids) and ``'input_2'`` (mask). Only the training
        dataset is shuffled (buffer of 2048 examples).
    """
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices(({'input_1': x_train[0], 'input_2': x_train[1]}, y_train))
        .shuffle(2048)
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    # No shuffle here: validation predictions must stay in input order.
    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices(({'input_1': x_val[0], 'input_2': x_val[1]}, y_val))
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    return train_dataset, valid_dataset


In [5]:
def create_model(max_len, learning_rate):
    """Build and compile the regression model on top of the global backbone.

    The module-level ``roberta`` model is used as the encoder, so every model
    returned by this function shares (and trains) the same backbone weights.

    Args:
        max_len: Length of the token id / attention mask sequences.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` taking ``[ids, mask]`` and producing one
        linear output per example, trained with MSE and reporting RMSE
        (metric name ``root_mean_squared_error``).
    """
    # Inputs: IDs and mask. The names are set explicitly because the datasets
    # feed a dict keyed 'input_1' / 'input_2'; auto-generated names would
    # become 'input_3' / 'input_4' the second time this function is called
    # in the same kernel.
    ids = Input(shape=(max_len,), dtype='int64', name='input_1')
    mask = Input(shape=(max_len,), dtype='int64', name='input_2')

    # Processing: keep only the hidden state at sequence position 0
    x = roberta(input_ids=ids, attention_mask=mask)['last_hidden_state']
    x = x[:, 0, :]
    x = Dropout(0.2)(x)
    x = Dense(1, activation='linear')(x)

    # Output
    pred = x

    # Model
    model = Model(inputs=[ids, mask], outputs=pred)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate),
                  loss = [tf.keras.losses.MeanSquaredError()],
                  metrics = [tf.keras.metrics.RootMeanSquaredError()])
    return model

In [6]:
# Function to seed everything
def seed_everything(seed):
    """Apply ``seed`` to Python's ``random``, NumPy and TensorFlow.

    Also exports the seed as the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed shared by all generators.
    """
    seed_text = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    tf.random.set_seed(seed)

In [7]:
def _build_callbacks(cp_path):
    """Return fresh [checkpoint, reduce_lr, early_stopping] callbacks for one fold."""
    checkpoint = tf.keras.callbacks.ModelCheckpoint(cp_path,
                                                    monitor = 'val_root_mean_squared_error',
                                                    verbose = 2,
                                                    save_best_only = True,
                                                    save_weights_only = True,
                                                    mode = 'min')

    reduce_lr = ReduceLROnPlateau(monitor="val_root_mean_squared_error",
                                  factor=0.2,
                                  patience=5,
                                  min_lr=1e-8,
                                  mode="min")

    early_stopping = EarlyStopping(monitor="val_root_mean_squared_error",
                                   min_delta=0,
                                   patience=5,
                                   verbose=2,
                                   mode="min",
                                   restore_best_weights=True)

    return [checkpoint, reduce_lr, early_stopping]


def _rebatch(dataset, batch):
    """Re-group an already batched dataset into batches of `batch` examples."""
    return dataset.unbatch().batch(batch).prefetch(tf.data.experimental.AUTOTUNE)


def train_model(max_len, lr, batch, epoch_size, cp_path, seed, n_splits=5):
    """Run K-fold cross-validated fine-tuning and predict the test set.

    Every fold starts from the same initial weights with a new optimizer and
    new callbacks, so no fold is validated on rows its model was already
    trained on, and the checkpoint loaded after training always belongs to
    the current fold.

    Args:
        max_len: Token sequence length used for tokenization and the model.
        lr: Learning rate passed to ``create_model``.
        batch: Examples per batch for training, validation and test
            prediction.
        epoch_size: Maximum number of epochs per fold.
        cp_path: File path the best weights of the current fold are saved to
            (overwritten by each fold).
        seed: Seed for ``seed_everything`` and the ``KFold`` shuffle.
        n_splits: Number of cross-validation folds. Defaults to 5.

    Returns:
        NumPy array of test predictions, averaged over the fold models, with
        the same shape as a single ``model.predict`` call on the test inputs.
    """
    import gc

    # Seed first so that model head initialisation is reproducible too.
    seed_everything(seed)

    # Prepare sets
    train_ids, train_mask, train_targets, train_df = prepare_sets('train.csv', tokenizer, max_len)
    test_ids, test_mask, _, _ = prepare_sets('test.csv', tokenizer, max_len)

    kfold = KFold(n_splits = n_splits, shuffle = True, random_state = seed)
    out_of_fold_pred = np.zeros(len(train_targets))
    test_pred = None
    initial_weights = None

    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train_df)):
        print('\nFold', fold+1, '*'*50)

        # New model => new optimizer state and learning rate for this fold.
        model = create_model(max_len, lr)
        if initial_weights is None:
            model.summary()
            # Host-side snapshot of the untouched weights (backbone + head).
            initial_weights = model.get_weights()
        else:
            # create_model reuses the global backbone, which still holds the
            # previous fold's fine-tuned weights; put the initial ones back.
            model.set_weights(initial_weights)

        train_ds, valid_ds = transform_to_tensors(x_train = (train_ids[trn_ind], train_mask[trn_ind]),
                                                  x_val = (train_ids[val_ind], train_mask[val_ind]),
                                                  y_train = train_targets[trn_ind],
                                                  y_val = train_targets[val_ind])
        # transform_to_tensors is called with its built-in batch size, so
        # re-batch here to make `batch` take effect. `batch_size` must not be
        # passed to fit() when the input is a tf.data.Dataset.
        train_ds = _rebatch(train_ds, batch)
        valid_ds = _rebatch(valid_ds, batch)

        model.fit(train_ds,
                  epochs = epoch_size,
                  validation_data = valid_ds,
                  callbacks = _build_callbacks(cp_path))

        # Best epoch of this fold (the checkpoint tracker starts fresh per fold)
        model.load_weights(cp_path)
        out_of_fold_pred[val_ind] = model.predict(valid_ds).reshape(-1)

        # Average the fold models' test predictions.
        fold_test_pred = model.predict([test_ids, test_mask], batch_size = batch) / n_splits
        test_pred = fold_test_pred if test_pred is None else test_pred + fold_test_pred

        # Release this fold's model / optimizer before the next one is built.
        del model
        gc.collect()

    cv_rmse = np.sqrt(mean_squared_error(train_targets, out_of_fold_pred))
    print('Out-of-fold Root Mean Square Error is:', cv_rmse)

    return test_pred

In [8]:
# Run configuration
MAX_LEN = 300                          # token sequence length (padding / truncation)
LR = 1e-5                              # Adam learning rate
BATCH_SIZE = 8                         # batch size passed to train_model
EPOCHS = 8                             # maximum epochs per fold
CP_PATH = 'cp_xml_roberta_large.h5'    # where the best weights are checkpointed
SEED = 2021                            # seed for RNGs and the K-fold shuffle

# Pass SEED rather than repeating the literal so the constant cannot drift
# from the value actually used.
test_pred = train_model(MAX_LEN, LR, BATCH_SIZE, EPOCHS, CP_PATH, SEED)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 300)]        0                                            
__________________________________________________________________________________________________
tfxlm_roberta_model (TFXLMRober TFBaseModelOutputWit 355359744   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici (None, 1024)         0           tfxlm_roberta_model[0][0]    

In [9]:
# Write the submission file: one row per test id with its predicted target.
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
pred_df = dict(id=test_df['id'], target=test_pred.reshape(-1))  # flatten predictions to 1-D
pd.DataFrame(pred_df).to_csv('submission.csv', index=False)