<a href="https://colab.research.google.com/github/mmgyorke/distilbert-base-sequence-classification/blob/main/huggingface_distilbert_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
!pip install transformers



In [60]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [61]:
# Checkpoint used for the English sentiment examples below.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the tokenizer and the model once; later cells reuse both objects directly.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Hand the already-loaded model object to the pipeline instead of the checkpoint
# name, so the weights are not loaded a second time.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
results = classifier(["He saved the drowning child.",
                      "She filed a lawsuit.",
                      "Marcel won the gold medal.",
                      "The store was way too crowded."])

# One result per input sentence, each carrying a 'label' and a 'score'.
for result in results:
    print(result)

{'label': 'POSITIVE', 'score': 0.9972121119499207}
{'label': 'NEGATIVE', 'score': 0.9983137845993042}
{'label': 'POSITIVE', 'score': 0.9997240900993347}
{'label': 'NEGATIVE', 'score': 0.9997655749320984}


In [62]:
# Follow a single sentence through the individual tokenizer steps.
sentence = "He saved the drowning child."

# Step 1: split the text into token strings.
tokens = tokenizer.tokenize(sentence)
# Step 2: map each token string to its vocabulary id.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
# Calling the tokenizer directly returns the full encoding: per the output below,
# the ids are wrapped in 101 / 102 and an attention mask is included.
input_ids = tokenizer(sentence)

print(f'  Tokens: {tokens}')
print(f'Token IDs: {token_ids}')
print(f'Input IDs: {input_ids}')

  Tokens: ['he', 'saved', 'the', 'drowning', 'child', '.']
Token IDs: [2002, 5552, 1996, 14759, 2775, 1012]
Input IDs: {'input_ids': [101, 2002, 5552, 1996, 14759, 2775, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [63]:
# Sentences that are encoded together as one batch.
X_train = [
    "He saved the drowning child.",
    "She filed a lawsuit.",
    "Marcel won the gold medal.",
    "There were many books in the library.",
]

# Pad the batch to a common length, cap sequences at 512 tokens, and return
# PyTorch tensors so the result can be fed straight into the model.
tokenizer_options = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")
batch = tokenizer(X_train, **tokenizer_options)
print(batch)

{'input_ids': tensor([[  101,  2002,  5552,  1996, 14759,  2775,  1012,   102,     0,     0],
        [  101,  2016,  6406,  1037,  9870,  1012,   102,     0,     0,     0],
        [  101, 13389,  2180,  1996,  2751,  3101,  1012,   102,     0,     0],
        [  101,  2045,  2020,  2116,  2808,  1999,  1996,  3075,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [64]:
# Inference only: run the whole block with gradient tracking disabled.
with torch.no_grad():
    # Raw model output; the classification scores live in `outputs.logits`.
    outputs = model(**batch)
    print(outputs)

    # Softmax over dim 1 (the class dimension) turns each row of logits into
    # probabilities that sum to 1.
    predictions = F.softmax(outputs.logits, dim=1)
    print(predictions)

    # Index of the most probable class for every sentence.
    label_ids = torch.argmax(predictions, dim=1)
    print(label_ids)

    # Translate class indices into the label names stored in the model config.
    id2label = model.config.id2label
    labels = [id2label[idx] for idx in label_ids.tolist()]
    print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.9233,  2.9563],
        [ 3.5349, -2.8486],
        [-4.0332,  4.1618],
        [ 0.7429, -0.6320]]), hidden_states=None, attentions=None)
tensor([[2.7879e-03, 9.9721e-01],
        [9.9831e-01, 1.6862e-03],
        [2.7595e-04, 9.9972e-01],
        [7.9817e-01, 2.0183e-01]])
tensor([1, 0, 1, 0])
['POSITIVE', 'NEGATIVE', 'POSITIVE', 'NEGATIVE']


In [65]:
# Write the tokenizer and the model into the same local directory
# (tokenizer first, then model).
save_directory = "saved"
for artifact in (tokenizer, model):
    artifact.save_pretrained(save_directory)

# Load both back from that directory, rebinding `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

## Let's try a model trained on a Spanish language corpus

In [68]:
# Switch checkpoints. Note that this rebinds `model_name`, `tokenizer`, `model`
# and `batch`, replacing the objects created for the English example.
model_name = "dpalominop/spanish-bert-apoyo"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Spanish sentences to classify as one batch.
X_train_spanish = [
    "ayer gané la carrera.",
    "ella lloró cuando se cayó.",
    "gracias por la planta que me diste.",
    "la guerra cobró muchas vidas.",
]

# Same encoding options as the English batch: pad, truncate at 512 tokens,
# return PyTorch tensors.
tokenizer_options = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")
batch = tokenizer(X_train_spanish, **tokenizer_options)
print(batch)

# Inference only: run the whole block with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**batch)
    print(outputs)

    # Softmax over dim 1 (the class dimension) gives per-class probabilities.
    predictions = F.softmax(outputs.logits, dim=1)
    print(predictions)

    # Index of the most probable class for every sentence.
    label_ids = torch.argmax(predictions, dim=1)
    print(label_ids)

    # Translate class indices into the label names stored in the model config.
    id2label = model.config.id2label
    labels = [id2label[idx] for idx in label_ids.tolist()]
    print(labels)

{'input_ids': tensor([[    4,  5023, 19593,  1032,  3630,  1008,     5,     1,     1,     1],
        [    4,  1512,  4878, 30957,  1351,  1057, 24959,  1008,     5,     1],
        [    4,  1542,  1076,  1032,  5340,  1041,  1094,  9669,  1008,     5],
        [    4,  1032,  2274, 22181,  2458,  5394,  1008,     5,     1,     1]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}
SequenceClassifierOutput(loss=None, logits=tensor([[-2.6419, -0.2875,  1.2996],
        [ 0.2926,  1.2322, -2.0869],
        [-2.9871,  0.0106,  1.0275],
        [-0.9706,  0.3721, -0.5588]]), hidden_states=None, attentions=None)
tensor([[0.0159, 0.1671, 0.8170],
        [0.2739, 0.7008, 0.0254],
      

## Sentence translations (Spanish → English)

"ayer gané la carrera." = "i won the race yesterday."

"ella lloró cuando se cayó." = "she cried when she fell."

"gracias por la planta que me diste." = "thank you for the plant you gave me."

"la guerra cobró muchas vidas." = "the war cost many lives."

## Labels

The config file of this model ("dpalominop/spanish-bert-apoyo") on the Hugging Face Hub lists these labels:

- "0": "LABEL_0"
- "1": "LABEL_1"
- "2": "LABEL_2"

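The config does not say what each label means. Judging by the predictions above (LABEL_0 is never the top class for these four sentences), I will assume this mapping:

- LABEL_1 = Negative
- LABEL_2 = Positive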

## ¡Salud!