In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import re
import os
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig, DebertaTokenizer, BertTokenizer, TFAutoModel
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

tokenizers.__version__: 0.12.1
transformers.__version__: 4.20.1


In [2]:
# Pick the tf.distribute strategy: TPU when one can be resolved, otherwise the
# default strategy.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu is not None:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # `tf.distribute.experimental.TPUStrategy` is a deprecated alias; use the
    # stable `tf.distribute.TPUStrategy` instead.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [3]:
# Dataset folders. TEST_DIR is read for test.csv, sample_submission.csv and the
# per-essay .txt folders; TRAIN_FOLDS_DIR is read for train_folds.csv.
_INPUT_ROOT = "../input"
TEST_DIR = f"{_INPUT_ROOT}/feedback-prize-effectiveness"
TRAIN_FOLDS_DIR = f"{_INPUT_ROOT}/feedbacktrainfolds"

In [4]:
# Load the training-folds table, the test table and the sample submission.
train = pd.read_csv(TRAIN_FOLDS_DIR + "/train_folds.csv")
test = pd.read_csv(TEST_DIR + "/test.csv")
submission = pd.read_csv(TEST_DIR + "/sample_submission.csv")

In [5]:
def get_essay_text(essay_id, directory, base_dir=None):
    """Return the full text of the essay stored as ``<essay_id>.txt``.

    Args:
        essay_id: Essay identifier; used as the file name stem.
        directory: Sub-folder holding the essay files (the notebook passes
            ``'train'`` or ``'test'``).
        base_dir: Folder that contains ``directory``. When omitted, the
            module-level ``TEST_DIR`` is used, as before.

    Returns:
        The file contents as one string.

    Raises:
        OSError: If the essay file cannot be opened.
    """
    if base_dir is None:
        base_dir = TEST_DIR
    essay_path = os.path.join(base_dir, directory, f"{essay_id}.txt")
    # This function is applied once per row; the context manager closes every
    # handle immediately instead of leaving it open until garbage collection.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()

In [6]:
# Attach the full essay to every row, reading each frame from its own sub-folder.
for _frame, _subdir in ((train, 'train'), (test, 'test')):
    _frame['essay_text'] = _frame['essay_id'].apply(get_essay_text, directory=_subdir)

In [7]:
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the offending slice encoded as UTF-8.

    Returns the replacement bytes together with ``error.end``, the position
    at which the codec resumes.
    """
    offending_text = error.object[error.start : error.end]
    resume_at = error.end
    return offending_text.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending byte slice as cp1252.

    Returns the replacement text together with ``error.end``, the position
    at which the codec resumes.
    """
    offending_bytes = error.object[error.start : error.end]
    resume_at = error.end
    return offending_bytes.decode("cp1252"), resume_at

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
# Each handler is registered under its own function name, which is the string
# passed as `errors=...` when encoding/decoding.
for _handler in (replace_encoding_with_utf8, replace_decoding_with_cp1252):
    codecs.register_error(_handler.__name__, _handler)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters.

    The text is round-tripped through bytes (raw_unicode_escape -> utf-8 ->
    cp1252 -> utf-8) using the custom error handlers registered with `codecs`,
    and the result is passed through `unidecode`.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_decode = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_decode.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired_text = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired_text)

# Clean both free-text columns of both frames (train first, then test).
for _frame in (train, test):
    for _column in ('discourse_text', 'essay_text'):
        _frame[_column] = _frame[_column].apply(resolve_encodings_and_normalize)

In [8]:
# Model input = discourse type, discourse text and the whole essay, joined by a
# literal " [SEP] " marker.
# NOTE(review): the tokenizer is loaded from "../input/robertalarge"; if that is a
# RoBERTa vocabulary, "[SEP]" is presumably tokenized as ordinary text rather than
# as a special token - confirm. The string must stay unchanged for inference to
# match whatever the saved fold weights were trained on.
_SEPARATOR = " [SEP] "
for _frame in (train, test):
    _frame['text'] = (
        _frame['discourse_type']
        + _SEPARATOR + _frame['discourse_text']
        + _SEPARATOR + _frame['essay_text']
    )

## Labels

In [9]:
# Integer class id for each effectiveness label.
label_mapping = {"Adequate":0, "Effective":1, "Ineffective":2}
# Encode only while the column still holds the raw labels. `Series.map` turns
# every value that is not a key of the mapping into NaN, so re-running this cell
# on an already-encoded column would wipe out the labels.
if not pd.api.types.is_numeric_dtype(train['discourse_effectiveness']):
    train['discourse_effectiveness'] = train['discourse_effectiveness'].map(label_mapping)

In [10]:
# Number of rows per encoded class.
label_counts = train['discourse_effectiveness'].value_counts()
label_counts

0    20977
1     9326
2     6462
Name: discourse_effectiveness, dtype: int64

## Tokenizer

In [11]:
# Tokenizer files are read from a local dataset directory.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="../input/robertalarge",
)

In [12]:
# tokenizer.encode_plus(train['text'][0], add_special_tokens=True,
#                               padding='max_length',max_length=512).keys()

In [13]:
# def tokenizer_encode(texts, tokenizer):
    
#     input_ids = np.zeros((train.shape[0], 512))
#     attention_mask_ids = np.zeros((train.shape[0], 512))
    
#     for e,text in enumerate(texts):
#         tokenized_text = tokenizer.encode_plus(text, add_special_tokens=True,
#                               padding='max_length',max_length=512,truncation= True)
#         input_ids[e,:] = tokenized_text['input_ids']
#         attention_mask_ids[e,:] = tokenized_text['attention_mask']

#     return input_ids, attention_mask_ids

In [14]:
# X_train = tokenizer_encode(train['text'].astype(str), tokenizer)

In [15]:
# Sentinel passed as `num_parallel_calls` to `.map()` and as the buffer size to
# `.prefetch()` in the dataset pipelines of this notebook.
AUTO = tf.data.AUTOTUNE

In [16]:
# def build_model(fold, roberta_model, max_len=512):  
#     callbacks = tf.keras.callbacks.ModelCheckpoint(f'./roberta{fold}_weights.h5',
#                                                    save_weights_only=True, save_best_only=True)
#     input_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
#     attention_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

#     sequence_output = roberta_model(input_ids, attention_mask=attention_mask)[0]
#     clf_output = sequence_output[:, 0, :]
#     clf_output = tf.keras.layers.Dropout(.1)(clf_output)
#     out = tf.keras.layers.Dense(3, activation='softmax')(clf_output)
    
#     model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=out)
#     model.compile(tf.keras.optimizers.Adam(lr=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
#     return model, callbacks

In [17]:
# roberta_model = (transformers.TFAutoModel.from_pretrained('../input/robertalarge',from_pt=True))

## Model Training

In [18]:
@tf.function
def map_function(encodings , target):
    """Turn one (encodings, target) pair into a (features, label) pair.

    Only `input_ids` and `attention_mask` are kept from `encodings`; `target`
    is cast to `tf.int8`.
    """
    features = {key: encodings[key] for key in ('input_ids', 'attention_mask')}
    label = tf.cast(target, tf.int8)
    return features, label

In [19]:
def create_model(max_len=512, model_path='../input/robertalarge'):
    """Build the classifier: a pretrained transformer plus a 3-way softmax head.

    Args:
        max_len: Fixed token length of both inputs. It has to match the
            `max_length` used when tokenizing (512 in this notebook).
        model_path: Location handed to `TFAutoModel.from_pretrained`; the
            weights are loaded from a PyTorch checkpoint (`from_pt=True`).

    Returns:
        An uncompiled `tf.keras` model that takes `[input_ids, attention_mask]`
        and outputs softmax scores over 3 classes.
    """
    # A shape must be a tuple: the previous `(512)` was just the integer 512.
    input_shape = (max_len,)
    input_id = tf.keras.layers.Input(shape = input_shape , dtype = tf.int32, name = 'input_ids')
    attention_mask = tf.keras.layers.Input(shape = input_shape, dtype = tf.int32, name = 'attention_mask')

    transformer_model = transformers.TFAutoModel.from_pretrained(model_path, from_pt=True)
    # First model output, sliced to position 0 of every sequence.
    # NOTE(review): presumably the last hidden state of the first token - confirm
    # against the transformers output documentation.
    cls_token = transformer_model(input_ids = input_id , attention_mask = attention_mask)[0][:,0,:]

    prediction = tf.keras.layers.Dense(3 , activation = "softmax")(cls_token)

    return tf.keras.models.Model(inputs = [input_id, attention_mask] , outputs = prediction)

In [20]:
from sklearn.metrics import log_loss
def competition_metrics(y_true, y_preds):
    """Score predictions with scikit-learn's `log_loss`.

    Args:
        y_true: Ground-truth labels.
        y_preds: Predictions, forwarded unchanged as the second argument.

    Returns:
        The value returned by `log_loss(y_true, y_preds)`.
    """
    score = log_loss(y_true, y_preds)
    return score

In [21]:
import tensorflow.keras.backend as K

In [22]:
# Independent (deep) copy of the prepared training frame.
df_train = train.copy(deep=True)

In [23]:
# histories = []
# scores = []
# for fold in range(4,5):
#     print(f"====== FOLD RUNNING {fold}======")
    
#     X_train = df_train.loc[df_train['kfold'] != fold]['text']
#     y_train = df_train.loc[df_train['kfold'] != fold]['discourse_effectiveness']
    
#     X_test = df_train.loc[df_train['kfold'] == fold]['text']
#     y_test = df_train.loc[df_train['kfold'] == fold]['discourse_effectiveness']
    
#     print(" Train Generating Tokens")
#     train_embeddings = tokenizer(
#         X_train.tolist(),
#         truncation = True, 
#         padding = 'max_length',
#         max_length =512    
#     )
    
#     print(" Validation Generating Tokens")
#     validation_embeddings = tokenizer(
#         X_test.tolist(),
#         truncation = True, 
#         padding = 'max_length',
#         max_length =512  
#     )
    
#     print("Train Generating Dataset")
#     train = tf.data.Dataset.from_tensor_slices((train_embeddings , y_train))
#     train = (
#                 train
#                 .map(map_function, num_parallel_calls= AUTO)
#                 .batch(24)
#                 .prefetch(AUTO)
#             )
        
#     print("Validation Generating Dataset")
#     val = tf.data.Dataset.from_tensor_slices((validation_embeddings , y_test))
#     val = (
#                 val
#                 .map(map_function, num_parallel_calls= AUTO)
#                 .batch(24)
#                 .prefetch(AUTO)
#             )
    
#     #Clearing backend session
#     K.clear_session()
#     print("Backend Cleared")

#     print("Model Creation")
#     with strategy.scope():
#         model = create_model()
#         model.compile(
#           optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-5), 
#           metrics = ['accuracy'],
#           loss = tf.keras.losses.SparseCategoricalCrossentropy()
#       )    
#     early_stopping= tf.keras.callbacks.EarlyStopping(monitor="val_loss",min_delta=0,patience=4,verbose=1,mode="min",restore_best_weights=True)
#     modelchkpt = tf.keras.callbacks.ModelCheckpoint(f'./roberta{fold}_weights.h5',
#                                                    save_weights_only=True, save_best_only=True)
#     hist = model.fit(train , validation_data = val , epochs = 12, callbacks = [early_stopping,modelchkpt])
    
#     # prediction on val
#     print("prediction on validation data")
#     preds = model.predict(val , verbose = 1)
#     score = competition_metrics(y_test.values,preds)
#     scores.append(score)
#     print(f"Log Loss for Fold {fold} is {score}")

#     #saving model
#     #print("saving model")
    
#     #localhost_save_option = tf.saved_model.SaveOptions(experimental_io_device="/job:localhost")
#     #model.save(f'./model_{fold}', options=localhost_save_option)
    
#     del model, X_train , y_train, X_test, y_test, val , train , train_embeddings , validation_embeddings

#     histories.append(hist)

# print("the final average Log Loss is ", np.mean(scores))

In [24]:
# Same label -> class id mapping that was used to encode the training labels,
# restated for the inference section.
label_mapping = dict(Adequate=0, Effective=1, Ineffective=2)

In [25]:
# Independent (deep) copy of the prepared test frame; predictions are written to it.
df_test = test.copy(deep=True)

In [26]:
@tf.function
def test_map_function(encodings):
    """Keep only `input_ids` and `attention_mask` from one encoded example."""
    return {key: encodings[key] for key in ('input_ids', 'attention_mask')}

In [27]:
# Inference settings. MAX_LEN has to equal the input length create_model() builds.
N_FOLDS = 4
MAX_LEN = 512
BATCH_SIZE = 24

histories = []
scores = []
preds_list = []

# Tokenizing and batching do not depend on the fold, so they are done once here
# instead of being repeated identically on every iteration.
print(" Test Generating Tokens")
test_embeddings = tokenizer(
    df_test['text'].tolist(),
    truncation = True,
    padding = 'max_length',
    max_length = MAX_LEN
)

print("Test Generating Dataset")
# Named `test_dataset` so that the `test` DataFrame is no longer overwritten and
# then deleted, which made earlier cells impossible to re-run.
test_dataset = (
    tf.data.Dataset.from_tensor_slices(test_embeddings)
    .map(test_map_function, num_parallel_calls= AUTO)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

for fold in range(N_FOLDS):
    print(f"====== FOLD RUNNING {fold}======")

    # Clear the Keras backend state before building the next fold's model.
    K.clear_session()
    print("Backend Cleared")

    print("Model Creation")
    with strategy.scope():
        model = create_model()
        model.load_weights(f'../input/roberta-essay-feedback/roberta{fold}_weights.h5')

    print("prediction on test data")
    preds = model.predict(test_dataset , verbose = 1)
    preds_list.append(preds)

    # Drop the reference so only one fold's model is referenced at a time.
    del model

del test_dataset, test_embeddings

 Test Generating Tokens
Test Generating Dataset


2022-08-10 04:52:38.541750: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-10 04:52:38.542956: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-10 04:52:38.543691: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-10 04:52:38.544594: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Backend Cleared
Model Creation


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


prediction on test data


2022-08-10 04:53:20.589805: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


 Test Generating Tokens
Test Generating Dataset
Backend Cleared
Model Creation


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


prediction on test data
 Test Generating Tokens
Test Generating Dataset
Backend Cleared
Model Creation


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


prediction on test data
 Test Generating Tokens
Test Generating Dataset
Backend Cleared
Model Creation


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


prediction on test data


In [28]:
# Average the per-fold predictions element-wise. Averaging over the whole list
# (rather than summing four hard-coded entries and dividing by 4) stays correct
# when the number of folds changes.
if not preds_list:
    raise ValueError("preds_list is empty: run the fold inference cell first")
predictions = np.mean(preds_list, axis=0)
predictions

array([[0.4265421 , 0.5348702 , 0.03858773],
       [0.52945906, 0.44388467, 0.02665626],
       [0.43301445, 0.5315902 , 0.03539534],
       [0.44010103, 0.5229849 , 0.03691405],
       [0.44322604, 0.51865023, 0.03812374],
       [0.42284942, 0.54812944, 0.02902113],
       [0.3497251 , 0.61841923, 0.03185569],
       [0.41266453, 0.54774857, 0.03958687],
       [0.33842966, 0.6246992 , 0.03687121],
       [0.47892803, 0.48834836, 0.0327236 ]], dtype=float32)

In [29]:
# One probability column per label; the column index is taken from label_mapping
# (Adequate -> 0, Effective -> 1, Ineffective -> 2).
for _label_name, _class_idx in label_mapping.items():
    df_test[_label_name] = predictions[:, _class_idx]

In [30]:
# Preview the test frame restricted to the sample submission's columns, in its order.
df_test.loc[:, submission.columns]

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.038588,0.426542,0.53487
1,5a88900e7dc1,0.026656,0.529459,0.443885
2,9790d835736b,0.035395,0.433014,0.53159
3,75ce6d68b67b,0.036914,0.440101,0.522985
4,93578d946723,0.038124,0.443226,0.51865
5,2e214524dbe3,0.029021,0.422849,0.548129
6,84812fc2ab9f,0.031856,0.349725,0.618419
7,c668ff840720,0.039587,0.412665,0.547749
8,739a6d00f44a,0.036871,0.33843,0.624699
9,bcfae2c9a244,0.032724,0.478928,0.488348


In [31]:
# Keep only the sample submission's columns (in its order) and write the CSV
# without the index column.
submission_frame = df_test[submission.columns]
submission_frame.to_csv("submission.csv", index=False)