In [17]:
from transformers import pipeline

In [188]:
unmasker = pipeline('fill-mask', model="bert-base-german-cased")
# unmasker = pipeline('fill-mask', model="distilbert-base-german-cased")

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [189]:
unmasker("Ich habe [MASK] Hund")

[{'sequence': '[CLS] Ich habe einen Hund [SEP]',
  'score': 0.554667592048645,
  'token': 303,
  'token_str': 'einen'},
 {'sequence': '[CLS] Ich habe keinen Hund [SEP]',
  'score': 0.32294565439224243,
  'token': 1875,
  'token_str': 'keinen'},
 {'sequence': '[CLS] Ich habe meinen Hund [SEP]',
  'score': 0.04057816043496132,
  'token': 9685,
  'token_str': 'meinen'},
 {'sequence': '[CLS] Ich habe den Hund [SEP]',
  'score': 0.019943149760365486,
  'token': 86,
  'token_str': 'den'},
 {'sequence': '[CLS] Ich habe diesen Hund [SEP]',
  'score': 0.008157292380928993,
  'token': 1377,
  'token_str': 'diesen'}]

In [190]:
def sentence_correction(sentence: str):
    print(f'Original Sentence: {sentence} \n')
    delimeter = " "
    sentence = sentence.split(delimeter)

    for idx, original_word in enumerate(sentence):
        masked_sentence_list = sentence.copy()
        masked_sentence_list[idx] = "[MASK]"
        masked_sentence = delimeter.join(masked_sentence_list)
        print(f'  Masked Sentence: {masked_sentence}')
        _get_masked_sentence_results(masked_sentence, original_word)
        
        
        print('---------')

In [191]:
def _get_masked_sentence_results(masked_sentence, original_word, max_difference=0.3):
    
    unmasked_list = unmasker(masked_sentence) 
    
    true_score = 0
    better_word_replacements = []
    better_word_replacement_scores = []
    for potential_replacement in unmasked_list:
        # unmasked_sentence is in sorted order from highest score to lowest score.
        if potential_replacement['token_str'] == original_word:
            true_score = float(potential_replacement['score'])
            break
    
    sentence_and_score_list = [(word['token_str'], word['score'], word['sequence']) for word in unmasked_list if word['score'] - true_score > max_difference]
    print(f"    Score for original word {original_word.upper()} is {true_score}")
    for idx in range(len(sentence_and_score_list)):
        print(f"      Recommended word/score/sentence: {sentence_and_score_list[idx]}")
    
    
    
    

In [199]:
sentence_correction("Ich denke, dass Nick perfekt Deutsch sprechen kann")

Original Sentence: Ich denke, dass Nick perfekt Deutsch sprechen kann 

  Masked Sentence: [MASK] denke, dass Nick perfekt Deutsch sprechen kann
    Score for original word ICH is 0.998205304145813
---------
  Masked Sentence: Ich [MASK] dass Nick perfekt Deutsch sprechen kann
    Score for original word DENKE, is 0
      Recommended word/score/sentence: ('hoffe', 0.670002281665802, '[CLS] Ich hoffe dass Nick perfekt Deutsch sprechen kann [SEP]')
---------
  Masked Sentence: Ich denke, [MASK] Nick perfekt Deutsch sprechen kann
    Score for original word DASS is 0.9511033892631531
---------
  Masked Sentence: Ich denke, dass [MASK] perfekt Deutsch sprechen kann
    Score for original word NICK is 0
      Recommended word/score/sentence: ('ich', 0.6851903796195984, '[CLS] Ich denke, dass ich perfekt Deutsch sprechen kann [SEP]')
---------
  Masked Sentence: Ich denke, dass Nick [MASK] Deutsch sprechen kann
    Score for original word PERFEKT is 0.039176106452941895
      Recommended wor