# model example
* model https://huggingface.co/docs/transformers/main_classes/model

# Question Answering Model

The goal is to provide answers to questions based on a given context. 
The context contains the information needed to answer the questions.



In [40]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the tokenizer and the question-answering model from the same checkpoint.
model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

question = "Who released the first version of Linux?"
context = """Linux  is a family of open-source Unix-like operating systems based on the Linux kernel,
an operating system kernel first released on September 17, 1991, by Linus Torvalds."""
# Encode question and context together as a single sequence pair;
# return_tensors="pt" makes the tokenizer return PyTorch tensors.
inputs = tokenizer(question, context, return_tensors="pt")
# Display the encoding (input_ids and attention_mask).
inputs

{'input_ids': tensor([[  101,  2627,  1308,  1103,  1148,  1683,  1104, 11735,   136,   102,
         11735,  1110,   170,  1266,  1104,  1501,   118,  2674, 27272,   118,
          1176,  3389,  2344,  1359,  1113,  1103, 11735, 18670,   117,  1126,
          3389,  1449, 18670,  1148,  1308,  1113,  1347,  1542,   117,  1984,
           117,  1118, 12221,  1361, 19928,  7501,  3680,   119,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}

In [55]:
# Show the concrete model class that AutoModelForQuestionAnswering instantiated for this checkpoint.
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForQuestionAnswering

In [41]:
# The tokenizer call returns a BatchEncoding object rather than a plain dict.
type(inputs)

transformers.tokenization_utils_base.BatchEncoding

In [42]:
# One encoded sequence (question + context) of 49 token ids.
inputs.input_ids.shape

torch.Size([1, 49])

In [43]:
# Inference only: torch.no_grad() skips building the autograd graph for the
# forward pass, so the returned logits carry no grad_fn.
with torch.no_grad():
    outputs = model(**inputs)
# Display the raw model output (start_logits and end_logits).
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-2.3153, -4.2984, -5.1707, -4.6476, -5.2568, -5.8371, -6.9937, -3.5902,
         -2.8165, -3.0625,  0.6904, -4.2836, -2.4276, -3.2922, -5.6233, -1.9401,
         -5.6786, -4.5005, -1.1397, -5.6055, -4.5696, -3.4442, -3.5298, -3.3858,
         -4.7425, -0.9832,  1.7179, -2.4405, -3.0431, -0.2407, -0.6212, -3.5745,
         -0.6386, -0.5249, -0.9561, -0.1684,  1.1976, -1.6041, -3.6732,  0.5870,
         -2.3126,  3.2979, 11.6171,  2.3333,  3.9388,  1.3816,  0.9345, -0.9856,
         -3.0625]], grad_fn=<CloneBackward0>), end_logits=tensor([[ 0.1464, -3.7836, -5.0152, -5.3969, -5.4530, -4.7866, -6.5632, -2.8844,
         -2.8747, -3.0770,  0.3805, -3.7702, -4.5461, -2.9247, -5.2478, -4.1494,
         -5.2456, -3.0263, -0.7369, -4.0945, -3.5612, -4.1459, -1.2482, -4.0121,
         -5.0074, -4.6150,  0.1491,  0.2839, -0.3344, -4.1845, -3.8579, -1.7281,
          0.4740, -1.4528, -0.9145, -2.6955, -1.3705, -0.5907, -1.2423,  2.7562

In [44]:
# The question-answering model returns a QuestionAnsweringModelOutput.
type(outputs)

transformers.modeling_outputs.QuestionAnsweringModelOutput

In [45]:
# One start logit per input token: same shape as input_ids.
outputs.start_logits.shape

torch.Size([1, 49])

In [46]:
# One end logit per input token: same shape as input_ids.
outputs.end_logits.shape

torch.Size([1, 49])

In [47]:
import pandas as pd

# Pair every token id with its token string to inspect what the model receives.
# .tolist() hands pandas plain Python ints instead of a torch tensor, so the
# table does not depend on the tensor's device.
df = pd.DataFrame({"id": inputs.input_ids[0].tolist(), "token": inputs.tokens()})
df

Unnamed: 0,id,token
0,101,[CLS]
1,2627,Who
2,1308,released
3,1103,the
4,1148,first
5,1683,version
6,1104,of
7,11735,Linux
8,136,?
9,102,[SEP]


In [52]:
# Choose the index of the largest start logit.
torch.argmax(outputs.start_logits, dim=-1) # reduces the last dimension: one index per sequence in the batch

tensor([42])

In [57]:
torch.argmax(outputs.start_logits) # without dim, the argmax of the flattened tensor is returned as a 0-d tensor

tensor(42)

In [48]:
# Get the predicted answer span
# Taking the argmax of the start and end logits independently can give
# end < start (an empty answer), so score every (start, end) pair and keep
# only the pairs where the start does not come after the end.
start_logits = outputs.start_logits[0]
end_logits = outputs.end_logits[0]
span_scores = start_logits[:, None] + end_logits[None, :]  # [i, j] = start at i, end at j
positions = torch.arange(span_scores.size(0), device=span_scores.device)
is_valid_span = positions[:, None] <= positions[None, :]
span_scores = span_scores.masked_fill(~is_valid_span, float("-inf"))
# argmax without dim indexes the flattened matrix; divmod recovers (row, column).
answer_start, answer_end = divmod(torch.argmax(span_scores).item(), span_scores.size(1))
answer_start, answer_end

(42, 46)

In [49]:
# Turn the predicted span back into text, one step at a time.
# The slice is end-inclusive, hence answer_end + 1.
answer_ids = inputs['input_ids'][0][answer_start:answer_end + 1]
answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
answer = tokenizer.convert_tokens_to_string(answer_tokens)

In [50]:
# Print the question together with the decoded answer
print("Question:", question)
print("Answer:", answer)


Question: Who released the first version of Linux?
Answer: Linus Torvalds


# Model for Sequence classification


In [59]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and the sequence-classification model from the same checkpoint.
# Note: this rebinds model_checkpoint, tokenizer and model from the question-answering section.
model_checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
# Display the concrete model class selected for this checkpoint.
type(model)

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

transformers.models.bert.modeling_bert.BertForSequenceClassification

In [60]:
# Mapping from predicted class index to its human-readable label (star rating).
model.config.id2label

{0: '1 star', 1: '2 stars', 2: '3 stars', 3: '4 stars', 4: '5 stars'}

In [63]:
review = "I love using this product! It's amazing."

# Tokenize the input text
inputs = tokenizer(review, return_tensors="pt")

# Get model predictions; torch.no_grad() skips building the autograd graph,
# which is not needed for inference
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted sentiment label: the index of the largest logit along the
# class dimension (see model.config.id2label for the index -> label mapping)
predicted_sentiment = torch.argmax(outputs.logits, dim=-1).item()

# Print the result
print("Input Text:", review)
print("Predicted Sentiment Label:", predicted_sentiment)


Input Text: I love using this product! It's amazing.
Predicted Sentiment Label: 4


Predict the sentiment of several reviews in one batch.

In [94]:
reviews = [
    "This the best computer available today!!!!",
    "I love it!",
    "High quality.",
    "I like it.",
    "Not bad.",
    "Low quality high price",
    "Extreamly disappointed.",
    "Exploded!",
    "Garbage!",
    "Junk!",
    ]
# padding=True pads the shorter reviews so the batch forms one rectangular tensor.
inputs = tokenizer(reviews, return_tensors='pt', padding=True)
# Inference only: torch.no_grad() skips building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs)
# One predicted class index per review (argmax over the class dimension).
predicted_sentiments = torch.argmax(outputs.logits, dim=-1)
predicted_sentiments

tensor([4, 4, 4, 3, 2, 1, 1, 0, 0, 0])

In [146]:
import pandas as pd

print("Each setiment is between 0 and 4.")
# Show each review next to its predicted class index.
# .tolist() hands pandas plain Python ints instead of a torch tensor, so the
# table does not depend on the tensor's device.
df = pd.DataFrame({'review': reviews, 'predicted_sentiment': predicted_sentiments.tolist()})
df

Each setiment is between 0 and 4.


Unnamed: 0,review,predicted_sentiment
0,This the best computer available today!!!!,4
1,I love it!,4
2,High quality.,4
3,I like it.,3
4,Not bad.,2
5,Low quality high price,1
6,Extreamly disappointed.,1
7,Exploded!,0
8,Garbage!,0
9,Junk!,0


# model for translation

* marian https://huggingface.co/docs/transformers/model_doc/marian

In [98]:
# %pip install sacremoses

In [99]:
from transformers import MarianMTModel, MarianTokenizer

# Choose a translation model and its corresponding tokenizer for English to Spanish.
# Note: this rebinds tokenizer and model from the sequence-classification section.
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

In [100]:
# The model was loaded through MarianMTModel directly (no Auto* class), so this is its type.
type(model)

transformers.models.marian.modeling_marian.MarianMTModel

In [101]:
# Likewise, the tokenizer was loaded through MarianTokenizer directly.
type(tokenizer)

transformers.models.marian.tokenization_marian.MarianTokenizer

Translate an English text into Spanish.

In [129]:
english_text = "Nice to meet you!"
# encode() returns only the token ids (no attention mask);
# return_tensors="pt" yields a 2-D PyTorch tensor holding the single sequence.
input_ids = tokenizer.encode(english_text, return_tensors="pt")
input_ids # tensor of token ids, one row for the single input sentence

tensor([[8676,   13, 1504,   40,   55,    0]])

In [130]:
# Generate the token ids of the translation from the encoded source sentence.
outputs = model.generate(input_ids) 
outputs

tensor([[65000,   107,  1500, 31701,   156,     4, 27620,    55,     0]])

In [120]:
# Decoding without skip_special_tokens keeps special tokens such as <pad> and </s> in the text.
tokenizer.decode(outputs[0])

'<pad> ¡Encantado de conocerte!</s>'

In [121]:
# Decode the generated translation; skip_special_tokens=True drops special
# tokens such as <pad> and </s> from the resulting string.
translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
translated_text

'¡Encantado de conocerte!'

In [122]:
# Print the source sentence next to its translation
print("Input Text (English):", english_text)
print("Translated Text (Spanish):", translated_text)

Input Text (English): Nice to meet you!
Translated Text (Spanish): ¡Encantado de conocerte!


Translate a batch of English texts into Spanish.

In [142]:
english_texts = ["Good morning!", "Hi!", "God bless you.", "Thanks.", "Do you speak Spanish?"]
# padding=True pads the shorter sentences to the length of the longest one;
# the returned attention_mask is 0 at the padded positions.
inputs = tokenizer(english_texts, return_tensors='pt', padding=True)
inputs


{'input_ids': tensor([[ 1922,  2731,    55,     0, 65000, 65000],
        [ 2745,    55,     0, 65000, 65000, 65000],
        [  386, 21319,    40,     3,     0, 65000],
        [ 3566,     3,     0, 65000, 65000, 65000],
        [  670,    40,  3159,  2036,    21,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1]])}

In [143]:
# Unpacking inputs forwards both input_ids and attention_mask to generate().
outputs = model.generate(**inputs)
outputs

tensor([[65000,   107, 30286,   628,    55,     0, 65000],
        [65000,   107,  7728,    55,     0, 65000, 65000],
        [65000,   392,   178, 35586,     3,     0, 65000],
        [65000,  1124,     3,     0, 65000, 65000, 65000],
        [65000,    50, 23416,     9,  4522,    21,     0]])

In [144]:
# batch_decode decodes every generated sequence in one call;
# skip_special_tokens=True drops special tokens such as <pad> and </s>.
translated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
translated_texts

['¡Buenos días!', '¡Hola!', 'Dios te bendiga.', 'Gracias.', '¿Hablas español?']

In [145]:
import pandas as pd

# Side-by-side table of each source sentence and its translation.
df = pd.DataFrame({'source': english_texts, 'targets': translated_texts})
df

Unnamed: 0,source,targets
0,Good morning!,¡Buenos días!
1,Hi!,¡Hola!
2,God bless you.,Dios te bendiga.
3,Thanks.,Gracias.
4,Do you speak Spanish?,¿Hablas español?
