In [12]:
from transformers import AutoConfig, AutoTokenizer, TFAutoModel, TFBertForTokenClassification
import tensorflow as tf
import numpy as np

config = AutoConfig.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
model = TFAutoModel.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")

Some layers from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [13]:
text = "ما در هوشواره معتقدیم با انتقال صحیح دانش و آگاهی، همه افراد می‌توانند از ابزارهای هوشمند استفاده کنند. شعار ما هوش مصنوعی برای همه است."
# tokenizer.tokenize(text)

In [14]:
# model.config

In [15]:
encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="tf")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [16]:
input_ids = tf.convert_to_tensor(encoded_input["input_ids"].numpy())

In [17]:
attention_mask = tf.convert_to_tensor(encoded_input["attention_mask"].numpy())

In [18]:
token_type_ids = tf.convert_to_tensor(encoded_input["token_type_ids"].numpy())

In [19]:
output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

In [20]:
# output = model(encoded_input)

In [1]:
# token_embeddings = output.last_hidden_state
# for i in range(token_embeddings.shape[1]):
#     token_id = encoded_input["input_ids"][0][i].numpy()
#     token = tokenizer.convert_ids_to_tokens(np.array([token_id]))
#     embedding = token_embeddings[0][i].numpy()
#     print(f"Token: {token}")
#     print(f"Embedding: {embedding}")

In [11]:
# Prepare the training data
train_sentences = ["این یک مثال است", "مثال دیگری است"]
train_labels = [
    ["DET", "NUM", "NOUN", "VERB", "PUNCT"],
    ["NOUN", "ADJ", "VERB", "DET", "ADJ", "NOUN", "PUNCT"],
]

train_tokenized = tokenizer(train_sentences, padding=True, truncation=True, return_tensors="tf")
train_input_ids = train_tokenized["input_ids"].numpy()
train_attention_mask = train_tokenized["attention_mask"]
train_labels_encoded = np.array([[tokenizer.convert_tokens_to_ids(label) for label in labels] for labels in train_labels])

# Define the model architecture
num_labels = len(tokenizer.get_vocab())
token_class_model = TFBertForTokenClassification.from_pretrained("HooshvareLab/bert-base-parsbert-uncased", num_labels=num_labels)

# Trainthe model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
token_class_model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])
token_class_model.fit(
    x=(train_input_ids, train_attention_mask),
    y=train_labels_encoded,
    epochs=3
)

# Test the model
test_sentence = "این یک مثال دیگر است"
test_tokenized = tokenizer(test_sentence, padding=True, truncation=True, return_tensors="tf")
test_input_ids = test_tokenized["input_ids"]
test_attention_mask = test_tokenized["attention_mask"]
test_logits = token_class_model.predict((test_input_ids, test_attention_mask))[0]
test_labels = tf.argmax(test_logits, axis=-1).numpy()
test_tokens = tokenizer.convert_ids_to_tokens(test_input_ids[0])
test_reordered_tokens = [test_tokens[i] for i in test_labels[1:-1]]
test_reordered_sentence = tokenizer.convert_tokens_to_string(test_reordered_tokens)
print(test_reordered_sentence)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  train_labels_encoded = np.array([[tokenizer.convert_tokens_to_ids(label) for label in labels] for labels in train_labels])


NameError: name 'TFBertForTokenClassification' is not defined

In [4]:
from transformers import TFAutoModelForMaskedLM, AutoTokenizer
import random

# Load the pre-trained ParsBERT model and tokenizer
model_name = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForMaskedLM.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [19]:
# Define the input sentence
text = "ای دوست بیا تا"

# Tokenize the input sentence and replace a random token with the [UNK] token
tokenized_text = tokenizer.encode(text, add_special_tokens=True)
unk_token_index = random.randint(1, len(tokenized_text) - 2)  # Choose a random token to replace
tokenized_text[unk_token_index] = tokenizer.unk_token_id

In [20]:
import tensorflow as tf
# Convert the tokenized sentence to a tensor and run the model to obtain the probability distribution
input_ids = tf.constant(tokenized_text)[None, :]  # Batch size 1
outputs = model(input_ids)

In [21]:
predictions = tf.nn.softmax(outputs.logits[0, unk_token_index], axis=-1)

In [22]:
# Get the top 5 most likely filled-in tokens and their probabilities
k = 10
top_k = tf.math.top_k(predictions, k=k)
for i, token_index in enumerate(top_k.indices.numpy()):
    token = tokenizer.decode([token_index])
    score = top_k.values.numpy()[i]
    print(f"{i+1}. '{token}' ({score:.2f})")

1. 'ای' (0.24)
2. 'با' (0.03)
3. 'سلام' (0.03)
4. 'بیا' (0.03)
5. 'اقای' (0.02)
6. 'به' (0.02)
7. 'یه' (0.02)
8. 'یک' (0.02)
9. 'خانهی' (0.02)
10. 'خانم' (0.02)


In [24]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Load the pre-trained ParsBERT model and tokenizer for sequence classification
model_name = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
sentences = [
    'من چقدر خوشحالم',
    'لعنت به این شانس',
]

# Tokenize the input sentences and convert them to a tensor
max_length = 128  # Maximum sequence length for the model
encoded_inputs = tokenizer(sentences, padding=True, truncation=True, max_length=max_length, return_tensors="tf")
input_ids = encoded_inputs["input_ids"]

In [43]:
# Run the model to obtain the probability distribution over the classes
outputs = model(input_ids)
probabilities = tf.nn.softmax(outputs.logits, axis=-1)

In [44]:
# Print the probabilities for each input sentence
for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: '{sentence}'")
    print(f"Probability: {probabilities[i, 1]:.4f}\n")

Sentence 1: 'من چقدر خوشحالم'
Probability: 0.4683

Sentence 2: 'لعنت به این شانس'
Probability: 0.4526



In [45]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForMaskedLM

# Load the pre-trained ParsBERT model and tokenizer
model_name = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForMaskedLM.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [73]:
sentences = [
    "من به مدرسه می‌روم.",
    "می‌روم به مدرسه من.",
    'من می‌روم به مدرسه.',
    'به مدرسه من می‌روم.',
    'به مدرسه می‌روم من.',
    'می‌روم من به مدرسه.',
]

# Tokenize the input sentences and convert them to a tensor
max_length = 128
encoded_inputs = tokenizer(sentences, padding=True, truncation=True, max_length=max_length, return_tensors="tf")
input_ids = encoded_inputs["input_ids"]

In [74]:
import numpy as np
# Compute the log-likelihood score of each sentence
log_likelihoods = []
for i in range(len(sentences)):
    outputs = model(input_ids[i][None, :])
    log_probabilities = tf.nn.log_softmax(outputs.logits[0], axis=-1)
    is_special_token = tf.math.logical_or(tf.math.equal(input_ids[i], tokenizer.cls_token_id),
                                          tf.math.logical_or(tf.math.equal(input_ids[i], tokenizer.sep_token_id),
                                                             tf.math.equal(input_ids[i], tokenizer.pad_token_id)))
    token_log_probabilities = tf.boolean_mask(log_probabilities, tf.math.logical_not(is_special_token))
    sentence_log_likelihood = tf.reduce_sum(token_log_probabilities).numpy()
    log_likelihoods.append(sentence_log_likelihood)

# Convert log-likelihood scores to probabilities using the softmax function
probabilities = tf.nn.softmax(log_likelihoods).numpy()

In [71]:
for i in range(len(sentences)):
    print(f"Sentence {i+1}: '{sentences[i]}'")
    print(f"Probability: {probabilities[i]:.4f}\n")

Sentence 1: 'من به مدرسه می‌روم.'
Probability: 0.0000

Sentence 2: 'می‌روم به مدرسه من.'
Probability: 0.0000

Sentence 3: 'من می‌روم به مدرسه.'
Probability: 0.0000

Sentence 4: 'به مدرسه من می‌روم.'
Probability: 0.0000

Sentence 5: 'به مدرسه می‌روم من.'
Probability: 1.0000

Sentence 6: 'می‌روم من به مدرسه.'
Probability: 0.0000



In [75]:
# Print the top-k predicted tokens for each token in the input sentence
k = 5
for i in range(len(sentences)):
    print(f"Sentence {i+1}: '{sentences[i]}'")
    inputs = tf.constant(input_ids[i][None, :])
    outputs = model(inputs)
    logits = outputs.logits[0]
    top_k = tf.math.top_k(logits, k=k)
    for j in range(len(encoded_inputs['input_ids'][i])):
        print(f"Token: {tokenizer.decode([encoded_inputs['input_ids'][i][j]])}")
        print(f"Top-{k} predictions: {tokenizer.decode(top_k.indices[j])}\n")

Sentence 1: 'من به مدرسه می‌روم.'
Token: [CLS]
Top-5 predictions: ، و. من دارم

Token: من
Top-5 predictions: من ما دارم بنده الان

Token: به
Top-5 predictions: به گاهی هم من همیشه

Token: مدرسه
Top-5 predictions: مدرسه خانه مدارس دبستان باشگاه

Token: میروم
Top-5 predictions: میروم میرود میرویم میروند نمیروم

Token: .
Top-5 predictions: . : ؟! -

Token: [SEP]
Top-5 predictions: . : ؟ کد!

Sentence 2: 'می‌روم به مدرسه من.'
Token: [CLS]
Top-5 predictions: . میروم ، مدرسه و

Token: میروم
Top-5 predictions: میروم میرود میرویم بروید بروم

Token: به
Top-5 predictions: به در سمت پیش یه

Token: مدرسه
Top-5 predictions: مدرسه خانه کلاس مدرسهی دبیرستان

Token: من
Top-5 predictions: من ما شما او تو

Token: .
Top-5 predictions: .! ؟ : -

Token: [SEP]
Top-5 predictions: . : ؟! یک

Sentence 3: 'من می‌روم به مدرسه.'
Token: [CLS]
Top-5 predictions: ،. میروم و مدرسه

Token: من
Top-5 predictions: من ما بعد دارم الان

Token: میروم
Top-5 predictions: میروم میرم میرود میرویم بروم

Token: به
Top-5 predictio

In [76]:
sentences = [
    "من به مدرسه میروم.",
    "من میروم به مدرسه.",
    "به مدرسه من میروم.",
    "میروم من به مدرسه."
]

# Tokenize the input sentences and convert them to a tensor
max_length = 128
encoded_inputs = tokenizer(sentences, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
input_ids = encoded_inputs["input_ids"]

# Compute the log-likelihood score of each sentence
log_likelihoods = []
for i in range(len(sentences)):
    outputs = model(input_ids[i][None, :])
    log_probabilities = torch.nn.functional.log_softmax(outputs.logits[0], dim=-1)
    is_special_token = torch.logical_or(torch.eq(input_ids[i], tokenizer.cls_token_id),
                                         torch.logical_or(torch.eq(input_ids[i], tokenizer.sep_token_id),
                                                          torch.eq(input_ids[i], tokenizer.pad_token_id)))
    token_log_probabilities = torch.masked_select(log_probabilities, torch.logical_not(is_special_token))
    sentence_log_likelihood = torch.sum(token_log_probabilities).item()
    log_likelihoods.append(sentence_log_likelihood)

# Sort the sentences based on their log-likelihood scores
sorted_sentences = [sentences[i] for i in torch.argsort(torch.tensor(log_likelihoods), descending=True).tolist()]

# Print the sorted sentences
print("Sorted sentences:")
for sentence in sorted_sentences:
    print(sentence)

ValueError: Exception encountered when calling layer 'tf_bert_for_masked_lm_3' (type TFBertForMaskedLM).

Data of type <class 'torch.Tensor'> is not allowed only (<class 'tensorflow.python.framework.ops.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>, <class 'keras.engine.keras_tensor.KerasTensor'>) is accepted for input_ids.

Call arguments received by layer 'tf_bert_for_masked_lm_3' (type TFBertForMaskedLM):
  • input_ids=tensor([[    2,  2078,  2031,  5000, 14182,    15,     4]])
  • attention_mask=None
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • labels=None
  • training=False

In [79]:
import itertools
import tensorflow as tf
from transformers import TFAutoModelForMaskedLM, AutoTokenizer

# Load the ParsBERT model
model_name = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForMaskedLM.from_pretrained(model_name)

# Define the input sentence
input_sentence = "این یک جمله تست است"

# Tokenize the input sentence
tokens = tokenizer(input_sentence, return_tensors="tf")["input_ids"]

# Generate all possible permutations of tokens
permutations = list(itertools.permutations(tokens.numpy()[0]))

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [80]:
# Calculate the probability of each permutation
scores = []
for permutation in permutations:
    sequence = tf.constant(permutation, shape=(1, len(permutation)))
    masked_sequence = tf.concat([tf.ones_like(tokens) * tokenizer.mask_token_id, sequence], axis=1)
    probabilities = model(masked_sequence)[0][:, 0, :]
    score = tf.math.reduce_prod(tf.gather(probabilities, tokens[0]), axis=0).numpy()[0]
    scores.append(score)

# Sort the permutations by their scores
results = sorted(zip(scores, permutations), reverse=True)

# Print the results
for score, permutation in results:
    sentence = tokenizer.decode(permutation)
    print(f"Score: {score:.5f} Sentence: {sentence}")

InvalidArgumentError: {{function_node __wrapped__GatherV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[1] = 2042 is not in [0, 1) [Op:GatherV2]

In [84]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

# Load the pre-trained ParsBERT model for sequence classification
model_name = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertForSequenceClassification.from_pretrained(model_name)



All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Ordering 1: من
Score: [0.55159485 0.44840518]
Ordering 2: پردازش
Score: [0.5159384  0.48406163]


In [None]:
# The input sentences with different orderings
orderings = [
    "من",
    "پردازش",
]

# Tokenize the sentences
input_ids = tokenizer(orderings, padding=True, truncation=True, return_tensors="tf").input_ids

# Get scores from the model
logits = model(input_ids).logits

# Compute the probabilities using softmax
probs = tf.nn.softmax(logits, axis=1)

# Print the scores for each sentence
for i, ordering in enumerate(orderings):
    print(f"Ordering {i + 1}: {ordering}")
    print(f"Score: {probs[i].numpy()}")

# Find the most correct ordering
# most_correct_idx = tf.argmax(probs, axis=0).numpy()
# print(f"\nMost correct ordering: {orderings[most_correct_idx]}")



In [85]:
logits

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.39822698, 0.1911104 ],
       [0.31495565, 0.2511805 ]], dtype=float32)>

In [87]:
import tensorflow as tf
from transformers import TFAutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
model = TFAutoModelForCausalLM.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")



If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`
All model checkpoint layers were used when initializing TFBertLMHeadModel.

All the layers of TFBertLMHeadModel were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertLMHeadModel for predictions without further training.


TypeError: bad operand type for unary -: 'NoneType'

In [89]:
# Define the sentence permutations
sentences = ["من به مدرسه می‌روم", "به مدرسه می‌روم من"]

# Encode the sentences into tokens
encoded_sentences = [tokenizer.encode(s) for s in sentences]

# Calculate the probability scores for each sentence permutation
for i, encoded_sentence in enumerate(encoded_sentences):
    input_ids = tf.constant([encoded_sentence])
    outputs = model(input_ids)
    loss = outputs.loss
    print(encoded_sentence, outputs.loss)
    probability_score = tf.exp(-loss)
    print(f"Sentence {i + 1}: {probability_score.numpy()[0][0]}")

AttributeError: 'TFBertLMHeadModel' object has no attribute 'train'

In [91]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

# Define your model architecture
input_seq = Input(shape=(128,))
x = LSTM(128)(input_seq)
x = Dense(vocab_size, activation='softmax')(x)
model = Model(input_seq, x)

# Compile your model
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Train your model
model.fit(x_train, y_train, validation_data=(x_val, y_val), batch_size=32, epochs=10)

# Evaluate your model
score = model.evaluate(x_test, y_test, batch_size=32)
print('Test score:', score)

# Generate probabilities for sentence permutations
permutations = ['من به مدرسه میروم', 'به مدرسه میروم من']
for perm in permutations:
    words = perm.split()
    probs = []
    for i in range(len(words)):
        seq = tokenizer.texts_to_sequences([words[:i+1]])
        seq = pad_sequences(seq, maxlen=max_len, padding='pre')
        prob = model.predict(seq)[0][-1]
        probs.append(prob)
    sentence_prob = np.prod(probs)
    print(perm, sentence_prob)

ValueError: Input 0 of layer "lstm" is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: (None, 128)

In [92]:
ref = ["پسر خردسال در حیاط با گربه بازی می‌کند"]
cand = ["پسر بچه در حیاط با یک گربه دارد بازی می‌کند"]

In [95]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")
model = AutoModelForSequenceClassification.from_pretrained("HooshvareLab/bert-base-parsbert-uncased")

ref_encodings = tokenizer(ref, padding=True, truncation=True, max_length=128)
cand_encodings = tokenizer(cand, padding=True, truncation=True, max_length=128)

ref_tensors = {key: torch.tensor(val) for key, val in ref_encodings.items()}
cand_tensors = {key: torch.tensor(val) for key, val in cand_encodings.items()}

ref_outputs = model(**ref_tensors)
cand_outputs = model(**cand_tensors)

ref_embeddings = ref_outputs.last_hidden_state[:, 0, :]
cand_embeddings = cand_outputs.last_hidden_state[:, 0, :]



KeyboardInterrupt: 

In [None]:
import sacrebleu

ref_sentences = [[sent] for sent in ref]
cand_sentence = [sent for sent in cand]

bleu = sacrebleu.corpus_bleu(cand_sentence, ref_sentences)
print("BLEU score:", bleu.score)

In [96]:
import tensorflow as tf
from transformers import TFBertForMaskedLM, BertTokenizer

# Load the pre-trained BERT model and tokenizer
model = TFBertForMaskedLM.from_pretrained('HooshvareLab/bert-base-parsbert-uncased')
tokenizer = BertTokenizer.from_pretrained('HooshvareLab/bert-base-parsbert-uncased')

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Token: [CLS]
Probabilities: [4.4809287e-05 1.6985367e-05 5.1590276e-05 2.0360555e-04 3.2640313e-04
 2.5878873e-01 3.2047261e-03 1.0163234e-01 1.2900239e-02 3.8944837e-01
 1.7244516e-01 6.0937028e-02]
Token: [MASK]
Probabilities: [7.7144883e-05 6.0363291e-05 8.5927575e-05 1.6650952e-04 5.0698168e-04
 2.2023416e-01 2.9697830e-02 1.2470782e-01 1.7611835e-02 3.2659408e-01
 6.5764397e-02 2.1449293e-01]
Token: [MASK]
Probabilities: [7.04608465e-05 6.42156228e-05 7.42365446e-05 1.63019446e-04
 6.09883748e-04 2.82955229e-01 2.65347492e-02 1.14709921e-01
 1.29546113e-02 3.36161941e-01 1.06002465e-01 1.19699247e-01]
Token: [MASK]
Probabilities: [5.7504280e-05 4.4545246e-05 8.8364184e-05 1.6260204e-04 5.5753108e-04
 3.5771888e-01 2.3241306e-02 6.7435980e-02 1.2004697e-02 2.8813416e-01
 1.4786080e-01 1.0269360e-01]
Token: [MASK]
Probabilities: [4.6513425e-05 3.6335132e-05 8.1242637e-05 1.3376902e-04 4.3901056e-04
 3.6924934e-01 1.7306471e-02 4.8218898e-02 1.1280631e-02 2.2637163e-01
 2.0911914e-01

In [98]:
# Define the input sentence
sentence = "سلام من به مدرسه رفتم."

# Tokenize the sentence
tokenized_sentence = tokenizer.tokenize(sentence)

# Replace each token with [MASK] and tokenize again
masked_token_indices = []
for i, token in enumerate(tokenized_sentence):
    if token not in ['[CLS]', '[SEP]']:
        masked_token_indices.append(i)
        tokenized_sentence[i] = '[MASK]'
        
input_ids = tokenizer.encode(tokenized_sentence, return_tensors='tf')

# Use the model to predict the masked tokens
outputs = model(input_ids)
predictions = outputs.logits
masked_predictions = tf.gather(predictions[0], masked_token_indices, axis=1)

# Calculate the probability distribution for each masked token
softmax_fn = tf.keras.layers.Softmax()
probabilities = softmax_fn(masked_predictions)

# Print out the results
for i, token_index in enumerate(masked_token_indices):
    token = tokenizer.convert_ids_to_tokens([input_ids[0][token_index].numpy()])[0]
    print(f"Token: {token}")
    print(f"Probabilities: {probabilities[i].numpy()}")

Token: [CLS]
Probabilities: [3.5259771e-05 1.9251916e-05 2.1564220e-04 9.8753713e-05 2.0632017e-04
 9.9942482e-01]
Token: [MASK]
Probabilities: [3.0996930e-04 2.6423519e-04 1.2717439e-03 5.6489184e-04 1.5339991e-03
 9.9605513e-01]
Token: [MASK]
Probabilities: [1.8119441e-04 1.8839908e-04 9.4285549e-04 4.2124020e-04 1.5720628e-03
 9.9669433e-01]
Token: [MASK]
Probabilities: [1.0453192e-04 8.9604750e-05 6.4787071e-04 2.3261992e-04 1.0475052e-03
 9.9787784e-01]
Token: [MASK]
Probabilities: [7.0846130e-05 3.9859427e-05 1.8473469e-04 9.9069090e-05 5.3867843e-04
 9.9906689e-01]
Token: [MASK]
Probabilities: [1.5545944e-03 8.8933198e-04 3.8920178e-05 2.2776834e-04 5.3936901e-04
 9.9675012e-01]


In [99]:
masked_predictions

<tf.Tensor: shape=(8, 6), dtype=float32, numpy=
array([[-6.8146734 , -7.4198055 , -5.0037956 , -5.784787  , -5.0479865 ,
         3.437519  ],
       [-7.1932235 , -7.352857  , -5.781553  , -6.593063  , -5.5940638 ,
         0.88186073],
       [-8.633314  , -8.594322  , -6.9839716 , -7.7896814 , -6.4727407 ,
        -0.02068512],
       [-8.856813  , -9.010898  , -7.032614  , -8.0569    , -6.552139  ,
         0.30708072],
       [-8.530693  , -9.1058445 , -7.572283  , -8.195386  , -6.5020847 ,
         1.0233738 ],
       [-1.9143311 , -2.47283   , -5.601788  , -3.8349717 , -2.9729009 ,
         4.5489545 ],
       [-5.8808074 , -5.4052696 , -6.227822  , -5.6999063 , -3.674399  ,
        12.559     ],
       [-3.1178765 , -4.7900357 , -6.8098793 , -1.4210021 , -1.4866896 ,
         4.5351663 ]], dtype=float32)>

In [100]:
from transformers import TFAutoModelForMaskedLM, AutoTokenizer

model_name = "HooshvareLab/bert-base-parsbert-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForMaskedLM.from_pretrained(model_name)

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


['انجا']


In [161]:
def calculate_sentence_score(sentence_tokens):
    sentence_score = 1
    sentence = ' '.join(sentence_tokens)
    for i,token in enumerate(sentence_tokens):
        found = False
        masked_text = ' '.join(tokens[:i])+' [MASK] '+' '.join(tokens[i+1:])

        inputs = tokenizer(masked_text, return_tensors="tf")
        outputs = model(inputs)
        logits = outputs.logits

        mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
        # mask_token_index = i
        mask_token_logits = logits[0, mask_token_index, :]

        k = 1000

        top_k_values, top_k_indices = tf.math.top_k(mask_token_logits, k=k)
        predicted_token_indices = top_k_indices.numpy()
        probabilities = tf.nn.softmax(top_k_values).numpy()
        predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_indices)

        for j,predicted_token in enumerate(predicted_tokens):
            if predicted_token == token:
                sentence_score *= probabilities[j]
                # print('found',sentence_score)
                found = True
                break

        epsilon = 0.00001

        if not found:
            sentence_score *= epsilon
            # print('not found',sentence_score)

        # print(token)
        # for i in range(k):
        #     print(predicted_tokens[i],probabilities[i])
        # print('----')

    soft_parameter = 1000000000000

    return sentence_score*soft_parameter


import itertools
text = "سلام خوبی؟"
tokens = tokenizer.tokenize(text)
permutations = list(itertools.permutations(tokens))
scores = []
for i,permutation in enumerate(permutations):
    print(' '.join(permutation))
    scores.append((' '.join(permutation),calculate_sentence_score(list(permutation))))
    print(i+1,'/',len(permutations))
scores.sort(key=lambda x: x[1], reverse=True)
for score in scores:
    print(score)


سلام خوبی ؟
1 / 6
سلام ؟ خوبی
2 / 6
خوبی سلام ؟
3 / 6
خوبی ؟ سلام
4 / 6
؟ سلام خوبی
5 / 6
؟ خوبی سلام
6 / 6
('سلام خوبی ؟', 10790.347008575605)
('خوبی سلام ؟', 4307.226293668622)
('؟ خوبی سلام', 25.940788701154002)
('خوبی ؟ سلام', 6.993595142937502)
('؟ سلام خوبی', 4.644772194668399)
('سلام ؟ خوبی', 3.1370356348312494)


In [147]:
from transformers import pipeline
mask_filler = pipeline("fill-mask", model=model,tokenizer=tokenizer)

In [160]:
# def calculate_sentence_score(sentence_tokens):

#     sentence_score = 1
#     for i,token in enumerate(sentence_tokens):
#         found = False
#         masked_text = ' '.join(tokens[:i])+' [MASK] '+' '.join(tokens[i+1:])
#         # print(masked_text)

#         # inputs = tokenizer(masked_text, return_tensors="tf")
#         # outputs = model(inputs)
#         # logits = outputs.logits

#         # mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
#         # mask_token_logits = logits[0, mask_token_index, :]

#         k = 1000
#         result = mask_filler(masked_text, top_k=k)
        
#         # for j in range(len(result)):
#         #     print(result[j])
#         # print('+++')

#         # top_k_values, top_k_indices = tf.math.top_k(mask_token_logits, k=k)
#         # predicted_token_indices = top_k_indices.numpy()
#         # probabilities = tf.nn.softmax(top_k_values).numpy()
#         # predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_indices)

#         # for j,predicted_token in enumerate(predicted_tokens):
#         #     if predicted_token == token:
#         #         sentence_score *= probabilities[j]
#         #         # print('found',sentence_score)
#         #         found = True
#         #         break

#         for res in result:
#             if res['token_str'] == token:
#                 sentence_score *= res['score']
#                 # print('found',sentence_score)
#                 found = True
#                 break

#         epsilon = 0.00001

#         if not found:
#             sentence_score *= epsilon
#             # print('not found',sentence_score)


#     soft_parameter = 1000000000000

#     return sentence_score*soft_parameter


# import itertools
# text = "مدرسه به میروم من"
# tokens = tokenizer.tokenize(text)
# permutations = list(itertools.permutations(tokens))
# scores = []
# for i,permutation in enumerate(permutations):
#     sentence = ' '.join(permutation)
#     score = calculate_sentence_score(list(permutation))
#     print(sentence)
#     print(score)
#     scores.append((sentence,score))
#     print(i+1,'/',len(permutations))
# scores.sort(key=lambda x: x[1], reverse=True)
# for score in scores:
#     print(score)

مدرسه به میروم من
1.0030545629170588e-07
1 / 24
مدرسه به من میروم
4.661605925357525e-06
2 / 24
مدرسه میروم به من
4.108130995346479e-05
3 / 24
مدرسه میروم من به
0.00018507368135433228
4 / 24
مدرسه من به میروم
4.7450013952038365e-05
5 / 24
مدرسه من میروم به
4.599659947623106e-06
6 / 24
به مدرسه میروم من
0.00016036466352045958
7 / 24
به مدرسه من میروم
0.0074528035993467155
8 / 24
به میروم مدرسه من
0.04333817848017966
9 / 24
به میروم من مدرسه
0.0013425347248704118
10 / 24
به من مدرسه میروم
0.05005675758318943
11 / 24
به من میروم مدرسه
3.336618776419605e-05
12 / 24
میروم مدرسه به من
0.02359523562444523
13 / 24
میروم مدرسه من به
0.10629790345988384
14 / 24
میروم به مدرسه من
0.015569209748703522
15 / 24
میروم به من مدرسه
0.000482304643606211
16 / 24
میروم من مدرسه به
0.7139499000831163
17 / 24
میروم من به مدرسه
0.004909330053782355
18 / 24
من مدرسه به میروم
0.22939533224943526
19 / 24
من مدرسه میروم به
0.022236885388188497
20 / 24
من به مدرسه میروم
0.1513654747937684
21 / 24
من به میروم مدرسه

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


InvalidArgumentError: {{function_node __wrapped__StridedSlice_device_/job:localhost/replica:0/task:0/device:CPU:0}} slice index 0 of dimension 0 out of bounds. [Op:StridedSlice] name: strided_slice/