In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import transformers
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import json
from math import log10, floor

In [2]:
# Read the training table (vocabulary.csv) and the competition test set,
# then preview the first training rows as the cell output.
train_csv_path = '/kaggle/input/data-cleaning/vocabulary.csv'
test_csv_path = '../input/feedback-prize-english-language-learning/test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))
df_train.head()

Unnamed: 0,full_text,vocabulary
0,I think that students would benefit from learn...,3.0
1,When a problem is a change you have to let it ...,3.0
2,"Dear, Principal If u change the school policy ...",3.0
3,The best time in life is when you become yours...,4.5
4,Small act of kindness can impact in other peop...,3.0


In [3]:
# Replace every whitespace run, and every literal backslash-n sequence, in the
# essay text with one space.
#
# The cleaned Series is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on the `df[...]` selection: that chained
# in-place form emits a FutureWarning on pandas 2.2 and leaves the DataFrame
# unchanged once Copy-on-Write is active, silently skipping the clean-up.
whitespace_pattern = r'\s+|\\n'
df_train['full_text'] = df_train['full_text'].replace(whitespace_pattern, ' ', regex=True)
df_test['full_text'] = df_test['full_text'].replace(whitespace_pattern, ' ', regex=True)

In [4]:
# Show the first five essays to eyeball the text after the replacement cell.
df_train['full_text'].head(n=5)

0    I think that students would benefit from learn...
1    When a problem is a change you have to let it ...
2    Dear, Principal If u change the school policy ...
3    The best time in life is when you become yours...
4    Small act of kindness can impact in other peop...
Name: full_text, dtype: object

In [5]:
# Load the fast DistilBERT tokenizer from the locally attached checkpoint
# directory (no hub download is requested here).
distilbert_tokenizer_dir = '../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased'
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(distilbert_tokenizer_dir)

In [6]:
# Regression model: frozen DistilBERT encoder -> average over the sequence
# axis -> single linear unit, trained with mean squared error.

# Number of token positions per example fed to the encoder.
MAX_SEQ_LEN = 512

# `shape` is passed as a tuple (batch axis excluded), which is the documented
# form for tf.keras.Input; a bare int is not accepted by every Keras release.
input_ids_layer = tf.keras.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32)
attention_mask_layer = tf.keras.Input(shape=(MAX_SEQ_LEN,), dtype=tf.int32)

base_model = transformers.TFDistilBertModel.from_pretrained('../input/huggingface-bert-variants/distilbert-base-uncased/distilbert-base-uncased')
# Freeze the encoder so that only the Dense head below has trainable weights.
base_model.trainable = False
base_model_output = base_model(input_ids=input_ids_layer, attention_mask=attention_mask_layer)

# NOTE(review): the pooling layer is called without a mask, so padded
# positions presumably contribute to the average -- confirm whether the
# attention mask should be forwarded here.
pooling = tf.keras.layers.GlobalAveragePooling1D()(base_model_output.last_hidden_state)
output = tf.keras.layers.Dense(1, activation="linear")(pooling)

model = tf.keras.Model(inputs=[input_ids_layer, attention_mask_layer], outputs=output)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.003),
    loss='mse',
)

2022-11-29 09:11:50.903953: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 09:11:50.905050: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 09:11:50.905725: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 09:11:50.906731: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [7]:
# Print the layer-by-layer summary (output shapes, parameter counts and
# connections) of the compiled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model (TFDistilB TFBaseModelOutput(la 66362880    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 768)          0           tf_distil_bert_model[0][0]   

In [8]:
# Tokenizer settings shared by the train and test encodings: no special
# tokens are added, and every sequence is truncated or padded to exactly
# 512 positions, with the attention mask returned alongside the ids.
encode_options = dict(
    add_special_tokens=False,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
)

encoded = tokenizer.batch_encode_plus(df_train['full_text'].tolist(), **encode_options)
encoded_test = tokenizer.batch_encode_plus(df_test['full_text'].tolist(), **encode_options)

# Training encodings are held as Series (one list per essay) so that they can
# later be sliced positionally with `.iloc`.
input_ids, attention_mask = (
    pd.Series(encoded[field]) for field in ('input_ids', 'attention_mask')
)

# Test encodings go straight to tensors.
input_ids_test, attention_mask_test = (
    tf.convert_to_tensor(encoded_test[field]) for field in ('input_ids', 'attention_mask')
)

In [9]:
# Train the regression head over 5 shuffled folds and print the hold-out RMSE
# of each fold.

# Seed for the fold shuffle so the row-to-fold assignment is reproducible.
FOLD_SEED = 42
# Learning rate to switch to when the given fold index starts; folds that are
# not listed keep whatever rate is currently set on the optimizer.
FOLD_LEARNING_RATES = {1: 0.001, 3: 0.0001}

# NOTE(review): the same `model` object is fitted in every iteration and is
# never re-initialised, so from fold 1 onward the hold-out rows were part of
# an earlier fold's training data. The printed RMSE values for later folds are
# therefore not clean out-of-fold estimates. The cross-fold learning-rate
# schedule suggests the continued training is deliberate -- confirm.
kf = KFold(n_splits=5, shuffle=True, random_state=FOLD_SEED)
for fold, (train_id, test_id) in enumerate(kf.split(df_train['vocabulary'])):
    input_ids_train, attention_mask_train = input_ids.iloc[train_id], attention_mask.iloc[train_id]
    input_ids_val, attention_mask_val = input_ids.iloc[test_id], attention_mask.iloc[test_id]
    y_train, y_test = df_train['vocabulary'].iloc[train_id], df_train['vocabulary'].iloc[test_id]

    # `.tolist()` already yields one list of values per row, so it can be
    # handed to convert_to_tensor directly.
    input_ids_train_tensor = tf.convert_to_tensor(input_ids_train.tolist())
    attention_mask_train_tensor = tf.convert_to_tensor(attention_mask_train.tolist())
    input_ids_val_tensor = tf.convert_to_tensor(input_ids_val.tolist())
    attention_mask_val_tensor = tf.convert_to_tensor(attention_mask_val.tolist())

    if fold in FOLD_LEARNING_RATES:
        tf.keras.backend.set_value(model.optimizer.learning_rate, FOLD_LEARNING_RATES[fold])

    model.fit(
        x=(input_ids_train_tensor, attention_mask_train_tensor),
        # Targets as a column vector, one row per training example.
        y=tf.convert_to_tensor(y_train.to_numpy().reshape(-1, 1)),
        epochs=5,
        shuffle=True,
        batch_size=16,)

    pred = model.predict((input_ids_val_tensor, attention_mask_val_tensor))
    # RMSE computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    print(float(np.sqrt(mean_squared_error(y_test, pred))))

2022-11-29 09:12:12.010068: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.44396388108566465
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.46192535417835257
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.44039286615345374
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.43726735859507454
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.4066165524026658


In [10]:
# Write the model, as left by the last training iteration, to
# 'vocabulary.h5' (path is relative to the current working directory).
model.save(filepath='vocabulary.h5')