In [10]:
from overfero.data_modules.data_modules import TextClassificationDataModule
from overfero.models.transformations import HuggingFaceTokenizationTransformation
import tensorflow as tf
from overfero.models.models import BinaryTextClassificationModel
from overfero.models.backbones import HuggingFaceBackbone
from overfero.models.adapters import DenseAdapter
from overfero.models.heads import SigmoidHead
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam

In [2]:
# Tokenization step used both by the data module and by `collate_fn` further down.
# NOTE(review): "trained_tokenizer" looks like a local tokenizer directory — confirm.
# NOTE(review): max_sequence_length is presumably an upper bound rather than a fixed
# width; the batches printed in this notebook are 126 and 50 tokens wide.
transformation = HuggingFaceTokenizationTransformation(
    pretrained_tokenizer_name_or_path="trained_tokenizer",
    max_sequence_length=128,
)

In [3]:
# Data module over the processed parquet splits. Every argument is positional, so
# the meanings noted below are inferred from the values, not from the signature.
data_modules = TextClassificationDataModule(
    "data/processed/train.parquet",  # train split file
    "data/processed/dev.parquet",  # dev split file
    "data/processed/test.parquet",  # test split file
    transformation,  # the tokenization transformation object
    "cleaned_text",  # presumably the text column name — TODO confirm
    "label",  # presumably the label column name — TODO confirm
    32,  # presumably the batch size (printed batches have 32 rows) — TODO confirm
    True  # NOTE(review): the meaning of this flag is not visible here — confirm
)

In [4]:
data_modules.setup("test")

In [5]:
# Build the batched dataset from the *test* split; `dataset` is what the later
# cells iterate over and pass to `model.fit`.
dataset = data_modules.initialize_dataloader(data_modules.test_dataset)

In [6]:
data_modules.test_dataset

Unnamed: 0,cleaned_text,label
0,wwwyoutubecomwatch vkacwpkaktak a talk natural...,0
1,very nice i tend get tired constant stream rid...,0
2,watch today circumcision viacom,0
3,thinking venues first color layer blocking fig...,0
4,what death penalty perpetrators expelling rema...,0
...,...,...
33930,that game nuts as kid i dropped copy save corr...,1
39433,and that leave could instead pin first tweet f...,1
26695,the head call yesterday bully ask till saliva ...,1
36871,people bagging tall girl movie look see bullyi...,1


In [19]:
inp, out = next(iter(dataset)) # a batch from the test-split dataloader (`dataset`)
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(32, 126), dtype=int32, numpy=
array([[ 3467,  1461,  1774, ...,     3,     3,     3],
       [  802,   347,   223, ...,     3,     3,     3],
       [ 5888,   309,   639, ...,     3,     3,     3],
       ...,
       [  893,  1015,  4806, ...,     3,     3,     3],
       [  241,  1123, 11502, ...,     3,     3,     3],
       [  971, 25980,  1220, ...,     3,     3,     3]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(32, 126), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(32, 126), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [11]:
# NOTE(review): a later cell calls HuggingFaceBackbone("prajjwal1/bert-tiny", transformation)
# with a second argument — confirm which call matches the current signature.
backbone = HuggingFaceBackbone("prajjwal1/bert-tiny")

In [12]:
adapter = DenseAdapter(256,"relu")

In [13]:
head = SigmoidHead()

In [14]:
model = BinaryTextClassificationModel(backbone, adapter, head)

In [15]:
model.compile(loss=BinaryCrossentropy(), optimizer=Adam(), metrics=["accuracy"])

In [16]:
model.summary()

In [17]:
# NOTE(review): `dataset` was built from the test split, so this fits on test data.
# As recorded in the output below, the call currently fails with a ValueError: the
# backbone's TFBaseModelOutputWithPoolingAndCrossAttentions result is passed on as a
# positional argument inside BinaryTextClassificationModel.call().
model.fit(dataset, epochs=5)

Epoch 1/5


2024-04-09 15:18:43.859118: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 15627264 exceeds 10% of free system memory.
2024-04-09 15:18:43.901353: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 15627264 exceeds 10% of free system memory.
2024-04-09 15:18:44.036030: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 15627264 exceeds 10% of free system memory.
1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''Only input tensors may be passed as positional arguments. The following argument value should be passed as a k

ValueError: Exception encountered when calling BinaryTextClassificationModel.call().

[1mOnly input tensors may be passed as positional arguments. The following argument value should be passed as a keyword argument: TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor 'binary_text_classification_model_1/hugging_face_backbone_1/tf_bert_model/bert/encoder/layer_._1/output/LayerNorm/batchnorm/add_1:0' shape=(None, 126, 128) dtype=float32>, pooler_output=<tf.Tensor 'binary_text_classification_model_1/hugging_face_backbone_1/tf_bert_model/bert/pooler/dense/Tanh:0' shape=(None, 128) dtype=float32>, past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None) (of type <class 'transformers.modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions'>)[0m

Arguments received by BinaryTextClassificationModel.call():
  • x={'input_ids': 'tf.Tensor(shape=(None, 126), dtype=int32)', 'token_type_ids': 'tf.Tensor(shape=(None, 126), dtype=int32)', 'attention_mask': 'tf.Tensor(shape=(None, 126), dtype=int32)'}

In [18]:
dataset

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 126), dtype=tf.int32, name=None), 'token_type_ids': TensorSpec(shape=(None, 126), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 126), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [33]:
import pandas as pd

In [80]:
# NOTE(review): the variable is named `train_dataset` but the file read is the
# *test* split — confirm whether "data/processed/train.parquet" was intended.
train_dataset = pd.read_parquet("data/processed/test.parquet")

In [81]:
train_dataset = train_dataset[["cleaned_text", "label"]]

In [82]:
def collate_fn(texts):
    """Tokenize a pandas-style collection of texts.

    ``texts`` must provide ``to_list()`` (e.g. a ``pandas.Series``). The resulting
    list is handed to the notebook-level ``transformation`` and whatever it
    returns is passed back unchanged.
    """
    return transformation(texts.to_list())

In [83]:
# Tokenize only the first 100 texts (presumably to keep this experiment quick).
train_encodings = collate_fn(train_dataset["cleaned_text"][:100])

In [84]:
train_encodings

{'input_ids': <tf.Tensor: shape=(100, 50), dtype=int32, numpy=
array([[ 3252,    27,    16, ...,     3,     3,     3],
       [ 2013,  1127,    14, ...,     3,     3,     3],
       [  860,  1149,  7296, ...,     3,     3,     3],
       ...,
       [   32,   366,   639, ...,     3,     3,     3],
       [ 2737,  5349,  1595, ...,     3,     3,     3],
       [ 1467, 12621,   456, ...,     3,     3,     3]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(100, 50), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(100, 50), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [85]:
# train_input_ids = tf.data.Dataset.from_tensor_slices(train_encodings.input_ids)

In [86]:
# train_attention_mask = tf.data.Dataset.from_tensor_slices(train_encodings.attention_mask)

In [87]:
# Turn the encodings mapping into a dataset that yields one dict per example.
# NOTE: this rebinds `train_encodings` from the tokenizer output to a
# tf.data.Dataset, so the cell should not be re-run without re-running the
# tokenization cell first.
train_encodings = tf.data.Dataset.from_tensor_slices(dict(train_encodings))

In [88]:
# NOTE(review): labels are taken from the whole frame while only the first 100
# texts were tokenized; the pairing in the next step presumably relies on
# tf.data.Dataset.zip stopping at the shorter dataset — confirm this is intended.
train_label = tf.data.Dataset.from_tensor_slices(train_dataset.label)

In [89]:
# Pair every encoded example with its label; elements are (encodings, label).
train_dataset = tf.data.Dataset.zip((train_encodings, train_label))

In [90]:
# Shuffle, then batch, then prefetch the paired examples (order matters:
# shuffling happens on individual examples, before they are grouped into batches).
train_dataset = (
    train_dataset
    .shuffle(buffer_size=2000)
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

In [91]:
inp, out = next(iter(train_dataset)) # a batch from train_dataset
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(32, 50), dtype=int32, numpy=
array([[ 1947, 10971,   363, ...,     3,     3,     3],
       [  399,   617,    14, ...,     3,     3,     3],
       [10499, 20622,   147, ...,     3,     3,     3],
       ...,
       [   46,    46,   981, ...,     3,     3,     3],
       [    6,   402,  1051, ...,     3,     3,     3],
       [  420,  3175, 10035, ...,     3,     3,     3]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(32, 50), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(32, 50), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>} 



In [92]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score
from overfero.models.backbones import HuggingFaceBackbone
from tensorflow.keras import Model
# from transformers import AutoConfig, TFBertModel

In [93]:
# NOTE(review): the earlier cell builds HuggingFaceBackbone("prajjwal1/bert-tiny")
# without `transformation` — confirm what the second argument is used for.
backbone = HuggingFaceBackbone("prajjwal1/bert-tiny", transformation)

In [96]:
class BERTForClassification(Model):
    """Binary classifier: a backbone followed by a small stack of dense layers.

    The element at index 1 of the backbone's result is passed through three
    ReLU dense layers (256, 64 and 16 units) and a final single-unit sigmoid
    layer.
    """

    def __init__(self, bert_model):
        """Store the backbone and create the dense layers.

        Args:
            bert_model: Callable backbone whose result can be indexed with ``[1]``.
        """
        super().__init__()
        self.bert = bert_model
        self.dense1 = Dense(units=256, activation="relu")
        self.dense2 = Dense(units=64, activation="relu")
        self.dense3 = Dense(units=16, activation="relu")
        # NOTE(review): `outputs` is also an attribute name used by Keras models —
        # confirm this assignment cannot clash with it.
        self.outputs = Dense(units=1, activation="sigmoid")

    def call(self, inputs):
        """Run the backbone and the dense stack; return the sigmoid layer's output."""
        # Index 1 of the backbone result — presumably the pooled output; TODO confirm.
        hidden = self.bert(inputs)[1]
        for layer in (self.dense1, self.dense2, self.dense3):
            hidden = layer(hidden)
        return self.outputs(hidden)

In [97]:
# Wrap the backbone in the classifier and configure training: binary
# cross-entropy loss, Adam with its default settings, accuracy as the metric.
model = BERTForClassification(backbone)
model.compile(
    loss=BinaryCrossentropy(),
    optimizer=Adam(),
    metrics=["accuracy"],
)

In [102]:
model.summary()

In [103]:
history = model.fit(train_dataset, epochs=3)

Epoch 1/3
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 109ms/step - accuracy: 0.5904 - loss: 0.6888
Epoch 2/3
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 0.8614 - loss: 0.6029 
Epoch 3/3
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.8885 - loss: 0.5418


In [104]:
len(train_dataset)

4