# model example
* model https://huggingface.co/docs/transformers/main_classes/model

# Question Answering Model

The goal is to provide answers to questions based on a given context. 
The context contains the information needed to answer the questions.



In [98]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Question-answering checkpoint pulled from the Hugging Face Hub
model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

question = "Who released the first version of Linux?"
context = (
    "Linux  is a family of open-source Unix-like operating systems based on the Linux kernel,\n"
    "an operating system kernel first released on September 17, 1991, by Linus Torvalds."
)
# Encode question and context together as a single sequence pair (PyTorch tensors)
inputs = tokenizer(text=question, text_pair=context, return_tensors="pt")
inputs

{'input_ids': tensor([[  101,  2627,  1308,  1103,  1148,  1683,  1104, 11735,   136,   102,
         11735,  1110,   170,  1266,  1104,  1501,   118,  2674, 27272,   118,
          1176,  3389,  2344,  1359,  1113,  1103, 11735, 18670,   117,  1126,
          3389,  1449, 18670,  1148,  1308,  1113,  1347,  1542,   117,  1984,
           117,  1118, 12221,  1361, 19928,  7501,  3680,   119,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}

In [99]:
# Show which concrete class AutoModelForQuestionAnswering instantiated for this checkpoint
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForQuestionAnswering

In [100]:
# Show the container type returned by the tokenizer call
type(inputs)

transformers.tokenization_utils_base.BatchEncoding

In [101]:
# (batch size, sequence length) of the encoded question + context
inputs["input_ids"].shape

torch.Size([1, 49])

In [102]:
# Forward pass: unpack the tokenizer output into keyword arguments of the model.
# NOTE(review): this runs with autograd enabled (the displayed logits carry a
# grad_fn); consider torch.no_grad() if only inference is needed.
outputs = model(**inputs)
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-2.3153, -4.2984, -5.1707, -4.6476, -5.2568, -5.8371, -6.9937, -3.5902,
         -2.8165, -3.0625,  0.6904, -4.2836, -2.4276, -3.2922, -5.6233, -1.9401,
         -5.6786, -4.5005, -1.1397, -5.6055, -4.5696, -3.4442, -3.5298, -3.3858,
         -4.7425, -0.9832,  1.7179, -2.4405, -3.0431, -0.2407, -0.6212, -3.5745,
         -0.6386, -0.5249, -0.9561, -0.1684,  1.1976, -1.6041, -3.6732,  0.5870,
         -2.3126,  3.2979, 11.6171,  2.3333,  3.9388,  1.3816,  0.9345, -0.9856,
         -3.0625]], grad_fn=<CloneBackward0>), end_logits=tensor([[ 0.1464, -3.7836, -5.0152, -5.3969, -5.4530, -4.7866, -6.5632, -2.8844,
         -2.8747, -3.0770,  0.3805, -3.7702, -4.5461, -2.9247, -5.2478, -4.1494,
         -5.2456, -3.0263, -0.7369, -4.0945, -3.5612, -4.1459, -1.2482, -4.0121,
         -5.0074, -4.6150,  0.1491,  0.2839, -0.3344, -4.1845, -3.8579, -1.7281,
          0.4740, -1.4528, -0.9145, -2.6955, -1.3705, -0.5907, -1.2423,  2.7562

In [103]:
# Show the type of the object returned by the forward pass
type(outputs)

transformers.modeling_outputs.QuestionAnsweringModelOutput

In [104]:
# One start logit and one end logit per input token
tuple(logits.shape for logits in (outputs.start_logits, outputs.end_logits))

(torch.Size([1, 49]), torch.Size([1, 49]))

In [105]:
import pandas as pd

# Pair every token id of the single encoded sequence with its token string
df = pd.DataFrame(dict(id=inputs["input_ids"][0], token=inputs.tokens()))
df

Unnamed: 0,id,token
0,101,[CLS]
1,2627,Who
2,1308,released
3,1103,the
4,1148,first
5,1683,version
6,1104,of
7,11735,Linux
8,136,?
9,102,[SEP]


In [106]:
# Index of the largest start logit, reducing over the last (token) dimension
outputs.start_logits.argmax(dim=-1)

tensor([42])

In [107]:
# Without `dim`, argmax runs over the flattened tensor and returns a scalar index
outputs.start_logits.argmax()

tensor(42)

In [108]:
# Predicted answer span: most likely start and end positions, picked independently
answer_start = outputs.start_logits.argmax().item()
answer_end = outputs.end_logits.argmax().item()
(answer_start, answer_end)

(42, 46)

In [109]:
# Turn the predicted span back into text; the slice is inclusive of answer_end
span_ids = inputs["input_ids"][0][answer_start : answer_end + 1]
span_tokens = tokenizer.convert_ids_to_tokens(span_ids)
answer = tokenizer.convert_tokens_to_string(span_tokens)

In [110]:
# Show the question next to the extracted answer (the answer is correct here)
for caption, text in (("Question:", question), ("Answer:", answer)):
    print(caption, text)

Question: Who released the first version of Linux?
Answer: Linus Torvalds


### We should not choose the answer from the question


In [111]:
# sequence_ids: plain list with None for special tokens, 0 for question tokens, 1 for context tokens
sequence_ids = inputs.sequence_ids()
# attention mask of the only sequence in the batch, as a 1-D tensor
attension_mask = inputs["attention_mask"][0]
print(sequence_ids, attension_mask, sep="\n")

[None, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, None]
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1])


In [112]:
# The encoded input holds the question as well as the context
tokenizer.decode(inputs["input_ids"][0])

'[CLS] Who released the first version of Linux? [SEP] Linux is a family of open - source Unix - like operating systems based on the Linux kernel, an operating system kernel first released on September 17, 1991, by Linus Torvalds. [SEP]'

In [113]:
# Token ids and attention mask share the same (batch, sequence length) shape
tuple(inputs[key].shape for key in ("input_ids", "attention_mask"))

(torch.Size([1, 49]), torch.Size([1, 49]))

In [114]:
import pandas as pd

# One row per token: which sequence it belongs to and its attention-mask value
df = pd.DataFrame(
    dict(token=inputs.tokens(), sequence_id=sequence_ids, attension_mask=attension_mask)
)
df

Unnamed: 0,token,sequence_id,attension_mask
0,[CLS],,1
1,Who,0.0,1
2,released,0.0,1
3,the,0.0,1
4,first,0.0,1
5,version,0.0,1
6,of,0.0,1
7,Linux,0.0,1
8,?,0.0,1
9,[SEP],,1


In [115]:
# Mask for start_logits and end_logits: True marks a position that must NOT be
# picked as an answer boundary. Context tokens have sequence id 1 and stay
# False; question tokens (0) and special tokens (None) become True.
mask = [i != 1  for i in sequence_ids] # list of bool

# unmask [CLS] token (position 0) so it remains selectable
mask[0] = False

# Add a leading batch dimension so the mask lines up with the (1, seq_len) logits
mask = torch.tensor(mask).unsqueeze(0)
# Read the shape from `outputs` directly: `start_logits` is only bound in the
# following cell, so referencing it here raises NameError on Restart & Run All.
mask, mask.shape, outputs.start_logits.shape


(tensor([[False,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False,  True]]),
 torch.Size([1, 49]),
 torch.Size([1, 49]))

In [116]:
# Work on copies so the raw model outputs stay intact: assigning through
# `outputs.start_logits` directly would overwrite the model's logits in place
# and change what earlier cells compute if they are re-run.
start_logits = outputs.start_logits.clone()
end_logits = outputs.end_logits.clone()

# Push masked positions to a very negative value so softmax gives them ~0 probability
start_logits[mask] = -99999
end_logits[mask] = -99999

start_logits

tensor([[-2.3153e+00, -9.9999e+04, -9.9999e+04, -9.9999e+04, -9.9999e+04,
         -9.9999e+04, -9.9999e+04, -9.9999e+04, -9.9999e+04, -9.9999e+04,
          6.9035e-01, -4.2836e+00, -2.4276e+00, -3.2922e+00, -5.6233e+00,
         -1.9401e+00, -5.6786e+00, -4.5005e+00, -1.1397e+00, -5.6055e+00,
         -4.5696e+00, -3.4442e+00, -3.5298e+00, -3.3858e+00, -4.7425e+00,
         -9.8318e-01,  1.7179e+00, -2.4405e+00, -3.0431e+00, -2.4069e-01,
         -6.2124e-01, -3.5745e+00, -6.3859e-01, -5.2492e-01, -9.5606e-01,
         -1.6843e-01,  1.1976e+00, -1.6041e+00, -3.6732e+00,  5.8699e-01,
         -2.3126e+00,  3.2979e+00,  1.1617e+01,  2.3333e+00,  3.9388e+00,
          1.3816e+00,  9.3445e-01, -9.8556e-01, -9.9999e+04]],
       grad_fn=<IndexPutBackward0>)

In [117]:
# Normalise the masked logits into probabilities over token positions
start_probabilities = start_logits.softmax(dim=-1)
end_probabilities = end_logits.softmax(dim=-1)
(start_probabilities, end_probabilities)

(tensor([[8.8873e-07, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.7952e-05, 1.2416e-07,
          7.9437e-07, 3.3458e-07, 3.2519e-08, 1.2934e-06, 3.0768e-08, 9.9944e-08,
          2.8796e-06, 3.3102e-08, 9.3271e-08, 2.8742e-07, 2.6383e-07, 3.0469e-07,
          7.8463e-08, 3.3675e-06, 5.0159e-05, 7.8418e-07, 4.2924e-07, 7.0756e-06,
          4.8361e-06, 2.5229e-07, 4.7529e-06, 5.3251e-06, 3.4601e-06, 7.6059e-06,
          2.9813e-05, 1.8098e-06, 2.2858e-07, 1.6189e-05, 8.9114e-07, 2.4352e-04,
          9.9898e-01, 9.2821e-05, 4.6227e-04, 3.5836e-05, 2.2915e-05, 3.3595e-06,
          0.0000e+00]], grad_fn=<SoftmaxBackward0>),
 tensor([[9.2858e-06, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.1735e-05, 1.8487e-07,
          8.5097e-08, 4.3060e-07, 4.2184e-08, 1.2653e-07, 4.2279e-08, 3.8900e-07,
          3.8390e-06, 1.3367e-07, 2.2783e-07,

In [118]:
# First step of the start-end probability table: drop the batch dimension and
# add a trailing axis, turning the start probabilities into a column vector
start_probabilities.squeeze()[:, None]

tensor([[8.8873e-07],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [0.0000e+00],
        [1.7952e-05],
        [1.2416e-07],
        [7.9437e-07],
        [3.3458e-07],
        [3.2519e-08],
        [1.2934e-06],
        [3.0768e-08],
        [9.9944e-08],
        [2.8796e-06],
        [3.3102e-08],
        [9.3271e-08],
        [2.8742e-07],
        [2.6383e-07],
        [3.0469e-07],
        [7.8463e-08],
        [3.3675e-06],
        [5.0159e-05],
        [7.8418e-07],
        [4.2924e-07],
        [7.0756e-06],
        [4.8361e-06],
        [2.5229e-07],
        [4.7529e-06],
        [5.3251e-06],
        [3.4601e-06],
        [7.6059e-06],
        [2.9813e-05],
        [1.8098e-06],
        [2.2858e-07],
        [1.6189e-05],
        [8.9114e-07],
        [2.4352e-04],
        [9.9898e-01],
        [9.2821e-05],
        [4.6227e-04],
        [3

In [119]:
# start-end probability table: broadcasting the column of start probabilities
# against the (1, seq_len) row of end probabilities yields a square table with
# scores[i, j] = P(start = i) * P(end = j)
scores = start_probabilities.squeeze()[:, None] * end_probabilities
scores

tensor([[8.2525e-12, 0.0000e+00, 0.0000e+00,  ..., 8.0982e-07, 7.8144e-08,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [2.1278e-10, 0.0000e+00, 0.0000e+00,  ..., 2.0881e-05, 2.0149e-06,
         0.0000e+00],
        [3.1195e-11, 0.0000e+00, 0.0000e+00,  ..., 3.0612e-06, 2.9539e-07,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]], grad_fn=<MulBackward0>)

In [120]:
# Keep only the upper triangle: the probability should be 0 wherever start > end
scores = scores.triu()
scores

tensor([[8.2525e-12, 0.0000e+00, 0.0000e+00,  ..., 8.0982e-07, 7.8144e-08,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 2.0881e-05, 2.0149e-06,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 2.9539e-07,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]], grad_fn=<TriuBackward0>)

In [121]:
# Table entry for start token 42 and end token 46, the span the plain argmax found earlier
scores[42, 46] # high probability

tensor(0.9103, grad_fn=<SelectBackward0>)

In [122]:
# argmax runs over the flattened table; the row index of the flat position is the start token
max_index = int(scores.argmax())
answer_start = max_index // scores.size(1)
answer_start

42

In [123]:
# The column index of the flat position is the end token
answer_end = max_index % scores.size(1)
answer_end

46

In [124]:
# Rebuild the answer text from the token strings of the best-scoring span (end inclusive)
answer_tokens = inputs.tokens()[answer_start : answer_end + 1]
answer = tokenizer.convert_tokens_to_string(answer_tokens)
answer

'Linus Torvalds'

In [125]:
# Tokenize again, this time also asking for each token's (start, end) character offsets
inputs_with_offsets = tokenizer(
    text=question,
    text_pair=context,
    return_offsets_mapping=True,
)
inputs_with_offsets

{'input_ids': [101, 2627, 1308, 1103, 1148, 1683, 1104, 11735, 136, 102, 11735, 1110, 170, 1266, 1104, 1501, 118, 2674, 27272, 118, 1176, 3389, 2344, 1359, 1113, 1103, 11735, 18670, 117, 1126, 3389, 1449, 18670, 1148, 1308, 1113, 1347, 1542, 117, 1984, 117, 1118, 12221, 1361, 19928, 7501, 3680, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 3), (4, 12), (13, 16), (17, 22), (23, 30), (31, 33), (34, 39), (39, 40), (0, 0), (0, 5), (7, 9), (10, 11), (12, 18), (19, 21), (22, 26), (26, 27), (27, 33), (34, 38), (38, 39), (39, 43), (44, 53), (54, 61), (62, 67), (68, 70), (71, 74), (75, 80), (81, 87), (87, 88), (89, 91), (92, 101), (102, 108), (109, 115), (116, 121), (122, 130), (131, 133), (134, 143), (144, 146), (146, 147), (148, 152), (152, 153), (154, 156), (157, 160), (160, 162), (163, 166), (166, 169), (169, 171), (171, 172), (0,

In [126]:
import pandas as pd

# Split the (start, end) offset pairs into two parallel columns
starts = [span[0] for span in inputs_with_offsets['offset_mapping']]
ends = [span[1] for span in inputs_with_offsets['offset_mapping']]
tokens = inputs.tokens()

# One row per token with its character span
df = pd.DataFrame(dict(token=tokens, start=starts, end=ends))
df

Unnamed: 0,token,start,end
0,[CLS],0,0
1,Who,0,3
2,released,4,12
3,the,13,16
4,first,17,22
5,version,23,30
6,of,31,33
7,Linux,34,39
8,?,39,40
9,[SEP],0,0


In [127]:
# Sanity check: the (154, 156) pair from offset_mapping, applied to `context`, gives one token's text
context[154: 156]

'by'

In [128]:
offsets = inputs_with_offsets["offset_mapping"]

# Index the (start, end) pairs directly instead of unpacking into `_`:
# in IPython `_` holds the previous cell output, and rebinding it at the top
# level of a cell stops IPython from updating it afterwards.
start_char = offsets[answer_start][0]  # first character of the first answer token
end_char = offsets[answer_end][1]      # one past the last character of the last answer token
# Slice the original context so spacing and casing are exactly as written
answer = context[start_char:end_char]
answer

'Linus Torvalds'

In [129]:
# Collect the answer text, its character span in the context and its score
result = dict(
    answer=answer,
    start=start_char,
    end=end_char,
    score=scores[answer_start, answer_end],
)
print(result)

{'answer': 'Linus Torvalds', 'start': 157, 'end': 171, 'score': tensor(0.9103, grad_fn=<SelectBackward0>)}


# Model for Sequence classification


In [130]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# NOTE: `model_checkpoint`, `tokenizer` and `model` are rebound here, replacing
# the question-answering objects loaded earlier in the notebook.
model_checkpoint = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
# Show the concrete class the Auto* factory picked
type(model)

transformers.models.bert.modeling_bert.BertForSequenceClassification

In [131]:
# Mapping from class index to its human-readable label
model.config.id2label

{0: '1 star', 1: '2 stars', 2: '3 stars', 3: '4 stars', 4: '5 stars'}

In [132]:
review = "I love using this product! It's amazing."

# Encode the review as a batch of one, returned as PyTorch tensors
inputs = tokenizer(text=review, return_tensors="pt")
inputs


{'input_ids': tensor([[  101,   151, 11157, 13136, 10372, 20058,   106, 10197,   112,   161,
         39854,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [133]:
# Get model predictions: the output carries raw logits, not probabilities
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-2.5989, -2.9319, -1.2934,  1.5799,  4.2139]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [134]:
# Class index with the highest logit (an index into model.config.id2label)
predicted_sentiment = outputs.logits.argmax().item()

# Report the input next to its predicted label index
for caption, value in (
    ("Input Text:", review),
    ("Predicted Sentiment Label:", predicted_sentiment),
):
    print(caption, value)

Input Text: I love using this product! It's amazing.
Predicted Sentiment Label: 4


Predict the sentiment of several reviews in one batch.

In [135]:
# Sample reviews, ordered roughly from most positive to most negative
reviews = [
    "This is the best computer available today!!!!", 
    "I love it!", 
    "High quality.", 
    "I like it.", 
    "Not bad.", 
    "Low quality high price",
    "Extreamly disappointed.",
    "Exploded!",
    "Garbage!",
    "Junk!",
    ]
# padding=True pads the shorter reviews so the whole batch forms one rectangular tensor
inputs = tokenizer(reviews, return_tensors='pt', padding=True)
inputs

{'input_ids': tensor([[  101, 10372, 10127, 10103, 11146, 14831, 14685, 13980,   106,   106,
           106,   106,   102],
        [  101,   151, 11157, 10197,   106,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 11053, 19468,   119,   102,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,   151, 11531, 10197,   119,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 10497, 12428,   119,   102,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 14298, 19468, 11053, 16993,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 11460, 53148, 10563, 31021, 54894, 83912,   119,   102,     0,
             0,     0,     0],
        [  101, 11460, 37904, 20298,   106,   102,     0,     0,     0,     0,
             0,     0,     0],
        [  101, 15406, 66395,   106,   102,     0,     0,     0,     0,     0,
             0,     0,     

In [136]:
# (number of reviews, padded sequence length)
inputs["input_ids"].shape

torch.Size([10, 13])

In [137]:
# Number of reviews; it should equal the batch dimension of input_ids
len(reviews)

10

In [138]:
# Forward the whole batch, then pick the best class per review (last dim = classes)
outputs = model(**inputs)
predicted_sentiments = outputs.logits.argmax(dim=-1)
predicted_sentiments

tensor([4, 4, 4, 3, 2, 1, 1, 0, 0, 0])

In [139]:
import pandas as pd

print("Each setiment is between 0 and 4.")
# Show each review next to its predicted class index
df = pd.DataFrame(dict(review=reviews, predicted_sentiment=predicted_sentiments))
df

Each setiment is between 0 and 4.


Unnamed: 0,review,predicted_sentiment
0,This is the best computer available today!!!!,4
1,I love it!,4
2,High quality.,4
3,I like it.,3
4,Not bad.,2
5,Low quality high price,1
6,Extreamly disappointed.,1
7,Exploded!,0
8,Garbage!,0
9,Junk!,0


# model for translation

* marian https://huggingface.co/docs/transformers/model_doc/marian

In [140]:
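# %pip install sacremoses  # uncomment and run once if sacremoses is missing (presumably wanted by the Marian tokenizer — confirm)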

In [141]:
from transformers import MarianMTModel, MarianTokenizer

# Choose a translation model and its corresponding tokenizer for English to Spanish.
# NOTE: `tokenizer` and `model` are rebound here, replacing the sentiment-classification objects.
model_name = "Helsinki-NLP/opus-mt-en-es"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

In [142]:
# Confirm the class of the loaded translation model
type(model)

transformers.models.marian.modeling_marian.MarianMTModel

In [143]:
# Confirm the class of the loaded tokenizer
type(tokenizer)

transformers.models.marian.tokenization_marian.MarianTokenizer

Translate an English text into Spanish.

In [144]:
english_text = "Nice to meet you!"
# encode() returns only the token ids, here as a tensor with a leading batch dimension
input_ids = tokenizer.encode(text=english_text, return_tensors="pt")
input_ids

tensor([[8676,   13, 1504,   40,   55,    0]])

In [145]:
# Generate the translation token ids; no generation options are passed, so the model's defaults apply
outputs = model.generate(input_ids) 
outputs

tensor([[65000,   107,  1500, 31701,   156,     4, 27620,    55,     0]])

In [146]:
# Decode without skipping special tokens to show the raw generated sequence
tokenizer.decode(outputs[0])

'<pad> ¡Encantado de conocerte!</s>'

In [147]:
# Decode the generated ids, dropping special tokens, and keep the single result
translated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
translated_text

'¡Encantado de conocerte!'

In [148]:
# Show the source sentence next to its translation
for caption, text in (
    ("Input Text (English):", english_text),
    ("Translated Text (Spanish):", translated_text),
):
    print(caption, text)

Input Text (English): Nice to meet you!
Translated Text (Spanish): ¡Encantado de conocerte!


Translate several English texts into Spanish in one batch.

In [149]:
# Sentences to translate as one batch
english_texts = ["Good morning!", "Hi!", "God bless you.", "Thanks.", "Do you speak Spanish?"]
# padding=True pads the shorter sentences so the batch forms one rectangular tensor
inputs = tokenizer(english_texts, return_tensors='pt', padding=True)
inputs


{'input_ids': tensor([[ 1922,  2731,    55,     0, 65000, 65000],
        [ 2745,    55,     0, 65000, 65000, 65000],
        [  386, 21319,    40,     3,     0, 65000],
        [ 3566,     3,     0, 65000, 65000, 65000],
        [  670,    40,  3159,  2036,    21,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 0],
        [1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1]])}

In [150]:
# Unpacking `inputs` passes the attention mask along with the ids, so padded positions are identified to the model
outputs = model.generate(**inputs)
outputs

tensor([[65000,   107, 30286,   628,    55,     0, 65000],
        [65000,   107,  7728,    55,     0, 65000, 65000],
        [65000,   392,   178, 35586,     3,     0, 65000],
        [65000,  1124,     3,     0, 65000, 65000, 65000],
        [65000,    50, 23416,     9,  4522,    21,     0]])

In [151]:
# Decode every generated sequence in one call, dropping padding and other special tokens
translated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
translated_texts

['¡Buenos días!', '¡Hola!', 'Dios te bendiga.', 'Gracias.', '¿Hablas español?']

In [152]:
import pandas as pd

# Show each English sentence next to its Spanish translation
df = pd.DataFrame(dict(source=english_texts, target=translated_texts))
df

Unnamed: 0,source,target
0,Good morning!,¡Buenos días!
1,Hi!,¡Hola!
2,God bless you.,Dios te bendiga.
3,Thanks.,Gracias.
4,Do you speak Spanish?,¿Hablas español?
