In [1]:
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import logging
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tensorflow.keras import backend as K
from transformers import RobertaTokenizerFast, TFRobertaModel
from kaggle_datasets import KaggleDatasets
tf.get_logger().setLevel(logging.ERROR)
from kaggle_datasets import KaggleDatasets
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Embedding, Reshape, Flatten, Dropout, GRU, Dense
from keras.layers import RepeatVector, Dense, Activation, Lambda, Softmax, Conv1D, LayerNormalization, Softmax, Multiply
from keras import regularizers
from keras.optimizers import Adam, SGD
from keras.utils import to_categorical
from keras.initializers import Constant
from keras.models import load_model, Model
from tensorflow.keras.callbacks import ReduceLROnPlateau,ModelCheckpoint, EarlyStopping ,LearningRateScheduler

In [2]:
# Load the tokenizer and the RoBERTa encoder from local directories under
# ../input (relative paths are used here, not hub model identifiers).
tokenizer = RobertaTokenizerFast.from_pretrained('../input/initial-tokenizer-large/tokenizer-large')
# `roberta` is a module-level object: create_model() calls it directly as the
# encoder, so this cell must run before any model is built.
roberta = TFRobertaModel.from_pretrained('../input/initial-roberta-large/roberta-large')

All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/initial-roberta-large/roberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [3]:
def tokenize_texts(texts, tokenizer, max_len):
    """Tokenize a batch of texts into fixed-length IDs and attention masks.

    Args:
        texts: The texts to encode. May be any object exposing ``.tolist()``
            (e.g. a pandas Series or numpy array) or any plain iterable of
            strings (list, tuple, generator).
        tokenizer: Object providing ``batch_encode_plus``.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer's
        output, which is requested as numpy arrays (``return_tensors='np'``).
    """
    # Containers such as pandas Series convert themselves via .tolist();
    # anything else is materialized with list() so plain sequences and
    # generators are accepted too instead of raising AttributeError.
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)

    tok = tokenizer.batch_encode_plus(text_list,
                                      padding='max_length',
                                      max_length=max_len,
                                      truncation=True,
                                      return_tensors='np')
    return tok['input_ids'], tok['attention_mask']

def prepare_sets(is_train_csv, tokenizer, max_len):
    """Read one competition CSV and tokenize its ``excerpt`` column.

    Args:
        is_train_csv: File name appended to the
            ``../input/commonlitreadabilityprize/`` directory. Despite the
            name this is a string, not a flag; labels are only extracted
            when it equals ``'train.csv'``.
        tokenizer: Tokenizer forwarded to ``tokenize_texts``.
        max_len: Sequence length forwarded to ``tokenize_texts``.

    Returns:
        Tuple ``(ids, mask, targets, df)``: the two tokenizer outputs, the
        ``target`` column as a numpy array (``None`` unless the file is
        ``'train.csv'``), and the loaded DataFrame.
    """
    frame = pd.read_csv('../input/commonlitreadabilityprize/' + is_train_csv)

    # Model inputs: token IDs and attention mask for every excerpt.
    token_ids, attention_mask = tokenize_texts(frame['excerpt'], tokenizer, max_len)

    # Labels are read only for the training file.
    labels = np.array(frame['target']) if is_train_csv == 'train.csv' else None

    return token_ids, attention_mask, labels, frame

In [4]:
def transform_to_tensors(x_train, x_val, y_train, y_val, batch_size=8, shuffle_buffer=2048):
    """Wrap training and validation arrays in batched ``tf.data`` datasets.

    Args:
        x_train: Indexable pair; element 0 is fed as ``'input_1'`` and
            element 1 as ``'input_2'``.
        x_val: Same layout as ``x_train``, for validation.
        y_train: Training labels, sliced alongside ``x_train``.
        y_val: Validation labels, sliced alongside ``x_val``.
        batch_size: Batch size used for both datasets (default 8, the
            previously hard-coded value).
        shuffle_buffer: Shuffle buffer size for the training dataset
            (default 2048, the previously hard-coded value).

    Returns:
        Tuple ``(train_dataset, valid_dataset)``. Only the training dataset
        is shuffled; both are batched and prefetched.
    """
    def _as_features(x):
        # NOTE(review): these keys are presumably matched by Keras against
        # the model's input layer names -- verify against create_model.
        return {'input_1': x[0], 'input_2': x[1]}

    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((_as_features(x_train), y_train))
        .shuffle(shuffle_buffer)
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    # Validation data keeps its original order (no shuffle).
    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((_as_features(x_val), y_val))
        .batch(batch_size)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

    return train_dataset, valid_dataset


In [5]:
def create_model(max_len, learning_rate, dc):
    """Build and compile the RoBERTa-based regression model.

    The module-level ``roberta`` encoder is applied to the token IDs and
    attention mask; its ``last_hidden_state`` at sequence position 0 goes
    through dropout and a single linear output unit.

    Args:
        max_len: Sequence length of both integer inputs.
        learning_rate: Learning rate passed to Adam.
        dc: Value passed to Adam's ``decay`` argument.

    Returns:
        A compiled Keras ``Model`` taking ``[ids, mask]`` and producing one
        value per example, with mean-squared-error loss and an RMSE metric.
    """
    # Inputs: IDs and mask. They are named explicitly because
    # transform_to_tensors feeds dicts keyed 'input_1' / 'input_2'; Keras
    # auto-generated names only match those keys for the first model built
    # in a session (later ones get a different numeric suffix).
    ids = Input(shape=(max_len,), dtype='int64', name='input_1')
    mask = Input(shape=(max_len,), dtype='int64', name='input_2')

    # Processing: keep only the hidden state of the first sequence position.
    x = roberta(input_ids=ids, attention_mask=mask)['last_hidden_state']
    x = x[:, 0, :]
    x = Dropout(0.2)(x)

    # Output: single linear unit with L1/L2 kernel regularization.
    pred = Dense(1, activation='linear',
                 kernel_regularizer=regularizers.l1_l2(l1=1e-4, l2=1e-3))(x)

    # Model
    model = Model(inputs=[ids, mask], outputs=pred)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, decay=dc),
                  loss=[tf.keras.losses.MeanSquaredError()],
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model

In [6]:
# The model's input length and the tokenizer's padding length must agree,
# so both are taken from one constant.
MAX_LEN = 256

# Rebuild the architecture and restore the saved weights. The learning rate
# and decay only configure the optimizer set up in create_model.
r_model = create_model(MAX_LEN, 1e-5, 2e-6)
r_model.load_weights('../input/roberta-large-ckpt/ckpt_roberta_large.h5')

test_set = prepare_sets('test.csv', tokenizer, MAX_LEN)
# Only the inputs are passed: predict()'s second positional parameter is
# batch_size, not labels. The previous call passed test_set[2] there, which
# worked only because prepare_sets returns None targets for 'test.csv'.
test_pred = r_model.predict([test_set[0], test_set[1]])

In [7]:
# Pair each test id with its prediction (flattened to 1-D) and write the
# submission file without the DataFrame index.
test_df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
pred_df = dict(id=test_df['id'], target=test_pred.reshape(-1))
pd.DataFrame(pred_df).to_csv('submission.csv', index=False)