In [2]:
# Load the tokenizer and the sequence-classification model from one checkpoint.
# The tokenizer does the preprocessing: it splits the given text into tokens,
# converts those tokens into integer ids, and can return the ids as tensors
# that are ready to be passed to the model.
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)


Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Tokenize a single sentence with the default arguments. The printed result
# is dictionary-like: each string key maps to a plain list of ints.
sample_sentence = "New York is the capital of the whole world!"
text_1 = tokenizer(sample_sentence)
print(text_1)


{'input_ids': [101, 2047, 2259, 2003, 1996, 3007, 1997, 1996, 2878, 2088, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [8]:
# tokenizer() also accepts a batch of sentences plus extra arguments such as
# padding, truncation, max_length and return_tensors.
batch_sentences = [
    "New York is the capital of the whole world!",
    "Washington is the capital of states.",
]
text_2 = tokenizer(
    batch_sentences,
    padding=True,         # shorter sentence is filled with 0s (see the output below)
    truncation=True,
    max_length=256,
    return_tensors='tf',  # values come back as tf.Tensor instead of Python lists
)

print(text_2)


{'input_ids': <tf.Tensor: shape=(2, 12), dtype=int32, numpy=
array([[ 101, 2047, 2259, 2003, 1996, 3007, 1997, 1996, 2878, 2088,  999,
         102],
       [ 101, 2899, 2003, 1996, 3007, 1997, 2163, 1012,  102,    0,    0,
           0]])>, 'attention_mask': <tf.Tensor: shape=(2, 12), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])>}


In [9]:
# Show every field of the batch encoding as nested Python lists, which is
# easier to read than the raw tensor repr.
for key, value in text_2.items():
    as_lists = value.numpy().tolist()
    print(f'{key}: {as_lists}')
    

input_ids: [[101, 2047, 2259, 2003, 1996, 3007, 1997, 1996, 2878, 2088, 999, 102], [101, 2899, 2003, 1996, 3007, 1997, 2163, 1012, 102, 0, 0, 0]]
attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]


In [10]:
# The batch has been preprocessed into text_2, so it can now be passed to the model.
# NOTE: despite its name, model_1 holds the model's output for text_2
# (it prints as a TFSequenceClassifierOutput), not a model.
model_1 = tf_model(text_2)
print(model_1)


TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-4.054861 ,  4.363096 ],
       [-3.7506948,  4.004222 ]], dtype=float32)>, hidden_states=None, attentions=None)


In [11]:
import tensorflow as tf




In [14]:
# Convert the raw logits into per-class probabilities with a softmax over the
# last axis (one row per sentence, one column per class).
# The logits are read by field name instead of model_1[0]: positional indexing
# only lands on the logits while the output's `loss` field is None.
tf_predictions = tf.nn.softmax(model_1.logits, axis=-1)
print(tf_predictions)

tf.Tensor(
[[2.2081658e-04 9.9977916e-01]
 [4.2844625e-04 9.9957150e-01]], shape=(2, 2), dtype=float32)


In [16]:
# Labels can be supplied to the model together with the inputs, one label per
# sentence in the batch. The earlier call without labels printed loss=None;
# presumably passing labels fills in that field (confirm by inspecting
# tf_outputs). The result is stored here, not displayed.
sentence_labels = tf.constant([1, 0])
tf_outputs = tf_model(text_2, labels=sentence_labels)

In [18]:
# Save the tokenizer files to disk so they can be reloaded later.
# Only the tokenizer is written by this cell; the model (tf_model) is not saved here.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the notebook anywhere else.
save_dir = r'C:\Users\User\Desktop\Github\Github\Github_publish'
tokenizer.save_pretrained(save_directory=save_dir)


('C:\\Users\\User\\Desktop\\Github\\Github\\Github_publish\\tokenizer_config.json',
 'C:\\Users\\User\\Desktop\\Github\\Github\\Github_publish\\special_tokens_map.json',
 'C:\\Users\\User\\Desktop\\Github\\Github\\Github_publish\\vocab.txt',
 'C:\\Users\\User\\Desktop\\Github\\Github\\Github_publish\\added_tokens.json',
 'C:\\Users\\User\\Desktop\\Github\\Github\\Github_publish\\tokenizer.json')