In [1]:
import numpy as np
import tensorflow as tf
from transformers import TFDistilBertModel, DistilBertConfig

In [2]:
tf.config.experimental.list_physical_devices('GPU')

[]

In [3]:
with open('xids.npy', 'rb') as f:
    Xids = np.load(f)

In [4]:
Xids

array([[  101,   138,  1326, ...,  1104,  1134,   102],
       [  101,   138,  1326, ...,     0,     0,     0],
       [  101,   138,  1326, ...,     0,     0,     0],
       ...,
       [  101,   170, 25247, ...,     0,     0,     0],
       [  101,   170, 25247, ...,     0,     0,     0],
       [  101, 22572, 12148, ...,     0,     0,     0]])

In [5]:
with open('xmask.npy', 'rb') as f:
    Xmask = np.load(f)
    
with open('labels.npy', 'rb') as f:
    labels = np.load(f)

In [6]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

2023-04-16 18:28:06.765010: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
for item in dataset.take(1):
    print(item)

(<tf.Tensor: shape=(30,), dtype=int64, numpy=
array([  101,   138,  1326,  1104, 13936, 25265, 16913, 15107,  1103,
        8050,  2553,  1115,  1184,  1110,  1363,  1111,  1103, 20398,
        1110,  1145,  1363,  1111,  1103,   176,  9900,   117,  1199,
        1104,  1134,   102])>, <tf.Tensor: shape=(30,), dtype=int64, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1])>, <tf.Tensor: shape=(3,), dtype=float64, numpy=array([1., 0., 0.])>)


In [8]:
def map_func(input_ids, mask, label):
    return{'input_ids': input_ids, 'attention_mask': mask}, label

In [9]:
dataset = dataset.map(map_func)

In [10]:
for item in dataset.take(1):
    print(item)

({'input_ids': <tf.Tensor: shape=(30,), dtype=int64, numpy=
array([  101,   138,  1326,  1104, 13936, 25265, 16913, 15107,  1103,
        8050,  2553,  1115,  1184,  1110,  1363,  1111,  1103, 20398,
        1110,  1145,  1363,  1111,  1103,   176,  9900,   117,  1199,
        1104,  1134,   102])>, 'attention_mask': <tf.Tensor: shape=(30,), dtype=int64, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1])>}, <tf.Tensor: shape=(3,), dtype=float64, numpy=array([1., 0., 0.])>)


In [11]:
dataset = dataset.shuffle(10000).batch(64)        

In [12]:
for size in range(len(dataset)):
    pass

In [13]:
size

2438

In [17]:
split = 0.9

train = dataset.take(round(size*split))
val = dataset.skip(round(size*split))

In [18]:
split*size

2194.2000000000003

In [19]:
from transformers import TFDistilBertForSequenceClassification

In [21]:
config = DistilBertConfig(num_labels = 3)

In [24]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)
model.summary()

Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_projector', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_19', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,955,779
Trainable params: 66,955,779
Non-trainable params: 0
_________________________________________________________________


In [26]:
model.layers[0].trainable = False
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,955,779
Trainable params: 592,899
Non-trainable params: 66,362,880
_________________________________________________________________


In [29]:
optimizer = tf.keras.optimizers.Adam(0.02)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [31]:
history = model.fit(
    train,
    validation_data=val,
    epochs=2
)

Epoch 1/2

KeyboardInterrupt: 