# Installation

In [1]:
!pip install git+https://github.com/miidas/mlm-scoring

Collecting git+https://github.com/miidas/mlm-scoring
  Cloning https://github.com/miidas/mlm-scoring to /tmp/pip-req-build-eks2v35m
  Running command git clone -q https://github.com/miidas/mlm-scoring /tmp/pip-req-build-eks2v35m
Collecting gluonnlp~=0.8.3
  Downloading gluonnlp-0.8.3.tar.gz (236 kB)
[K     |████████████████████████████████| 236 kB 5.3 MB/s 
Collecting sacrebleu
  Downloading sacrebleu-2.0.0-py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 8.6 MB/s 
[?25hCollecting mosestokenizer
  Downloading mosestokenizer-1.2.1.tar.gz (37 kB)
Collecting transformers~=3.3.1
  Downloading transformers-3.3.1-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 24.2 MB/s 
Collecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 45.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895

In [2]:
!pip install mxnet fugashi ipadic

Collecting mxnet
  Downloading mxnet-1.9.0-py3-none-manylinux2014_x86_64.whl (47.3 MB)
[K     |████████████████████████████████| 47.3 MB 1.4 MB/s 
[?25hCollecting fugashi
  Downloading fugashi-1.1.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (490 kB)
[K     |████████████████████████████████| 490 kB 43.9 MB/s 
[?25hCollecting ipadic
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 31.2 MB/s 
[?25hCollecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Building wheels for collected packages: ipadic
  Building wheel for ipadic (setup.py) ... [?25l[?25hdone
  Created wheel for ipadic: filename=ipadic-1.0.0-py3-none-any.whl size=13556723 sha256=1b03d197c36256158872d9b199fccb793c61d023740baeb6aefed1c276ae583e
  Stored in directory: /root/.cache/pip/wheels/33/8b/99/cf0d27191876637cd3639a560f93aa982d7855ce826c94348b
Successfully built ipadic
Installing collected packages: graphviz, mxnet, ipadic

# Example

In [3]:
from mlm.scorers import MLMScorer, MLMScorerPT, LMScorer
from mlm.models import get_pretrained
import mxnet as mx

# Device contexts handed to both model loading and scoring.
ctxs = [mx.cpu()]
# ctxs = [mx.gpu(0)]  # alternative: run on the first GPU instead of the CPU

# NOTE: the `# >>` lines below are reference outputs from an earlier run; the
# values printed on a fresh run may differ in the last few decimal places.

# MXNet MLMs (use names from mlm.models.SUPPORTED_MLMS)
model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-cased')
scorer = MLMScorer(model, vocab, tokenizer, ctxs)
# Sentence-level score: one number per input sentence.
print(scorer.score_sentences(["Hello world!"]))
# >> [-12.410664200782776]
# Per-token scores: one list per sentence; the first and last entries are None.
print(scorer.score_sentences(["Hello world!"], per_token=True))
# >> [[None, -6.126736640930176, -5.501412391662598, -0.7825151681900024, None]]

# EXPERIMENTAL: PyTorch MLMs (use names from https://huggingface.co/transformers/pretrained_models.html)
model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-cased')
scorer = MLMScorerPT(model, vocab, tokenizer, ctxs)
print(scorer.score_sentences(["Hello world!"]))
# >> [-12.411025047302246]
print(scorer.score_sentences(["Hello world!"], per_token=True))
# >> [[None, -6.126738548278809, -5.501765727996826, -0.782496988773346, None]]

# MXNet LMs (use names from mlm.models.SUPPORTED_LMS)
model, vocab, tokenizer = get_pretrained(ctxs, 'gpt2-117m-en-cased')
scorer = LMScorer(model, vocab, tokenizer, ctxs)
print(scorer.score_sentences(["Hello world!"]))
# >> [-15.995375633239746]
print(scorer.score_sentences(["Hello world!"], per_token=True))
# >> [[-8.293947219848633, -6.387561798095703, -1.3138668537139893]]

# Japanese (EXPERIMENTAL: PyTorch MLMs -- these checkpoints are scored with
# MLMScorerPT, not the MXNet MLMScorer).
# The same sentence is scored with each checkpoint, in the order listed.
JA_SENTENCE = "こんにちは、世界!"
JA_MODEL_NAMES = [
    'cl-tohoku/bert-base-japanese',
    'cl-tohoku/bert-base-japanese-whole-word-masking',
    'cl-tohoku/bert-base-japanese-char',
    'cl-tohoku/bert-base-japanese-char-whole-word-masking',
]

for ja_model_name in JA_MODEL_NAMES:
    model, vocab, tokenizer = get_pretrained(ctxs, ja_model_name)
    scorer = MLMScorerPT(model, vocab, tokenizer, ctxs)
    print(scorer.score_sentences([JA_SENTENCE]))
    print(scorer.score_sentences([JA_SENTENCE], per_token=True))

# Reference outputs, per checkpoint:
#
# cl-tohoku/bert-base-japanese
# >> [-11.893018969480181]
# >> [[None, -0.00012838016846217215, -0.2347521334886551, -0.00629359669983387, -1.2839869260787964, -5.64776611328125, -4.720091819763184, None]]
#
# cl-tohoku/bert-base-japanese-whole-word-masking
# >> [-21.076359182945453]
# >> [[None, -0.0014072287594899535, -0.07990188151597977, -2.3677682876586914, -1.0058914422988892, -10.913361549377441, -6.708028793334961, None]]
#
# cl-tohoku/bert-base-japanese-char
# >> [-8.106433772969467]
# >> [[None, -0.00017212340026162565, -7.128461584215984e-05, -0.0003200257197022438, -0.000310730334604159, -0.04472477734088898, -2.0046865940093994, -0.06974268704652786, -0.013195215724408627, -5.973210334777832, None]]
#
# cl-tohoku/bert-base-japanese-char-whole-word-masking
# >> [-20.892313688993454]
# >> [[None, -4.659216403961182, -0.42353877425193787, -0.2219201922416687, -1.5305637121200562, -1.9007164239883423, -3.515786647796631, -0.7679498791694641, -1.087415337562561, -6.785206317901611, None]]

  Optimizer.opt_registry[name].__name__))


Vocab file is not found. Downloading.
Downloading /root/.mxnet/models/1641289963.9859726book_corpus_wiki_en_cased-2d62af22.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/book_corpus_wiki_en_cased-2d62af22.zip...
Downloading /root/.mxnet/models/bert_12_768_12_book_corpus_wiki_en_cased-5656dac6.zipc2419d3f-34e1-4cfd-9342-58c05419a35a from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/bert_12_768_12_book_corpus_wiki_en_cased-5656dac6.zip...




[-12.410386085510254]




[[None, -6.1266608238220215, -5.501406669616699, -0.7823185920715332, None]]


Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLMOptimized: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLMOptimized from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLMOptimized from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]



[-12.411005198955536]
[[None, -6.126739978790283, -5.501766204833984, -0.7824990153312683, None]]
Vocab file is not found. Downloading.
Downloading /root/.mxnet/models/1641290003.2195394openai_webtext-f917dc78.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/openai_webtext-f917dc78.zip...
Downloading /root/.mxnet/models/gpt2_117m_openai_webtext-26416f2e.zipb39c7bb9-e753-400f-b949-c4065b2902aa from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/gpt2_117m_openai_webtext-26416f2e.zip...
BPE rank file is not found. Downloading.
Downloading /root/.mxnet/models/1641290027.4706082openai_webtext_bpe_ranks-396d4d8e.json from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/openai_webtext_bpe_ranks-396d4d8e.zip...




[-15.995341300964355]




[[-8.293933868408203, -6.387538909912109, -1.3138686418533325]]


Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertForMaskedLMOptimized: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLMOptimized from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLMOptimized from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]



[-11.893025764409686]




[[None, -0.00012838016846217215, -0.2347521334886551, -0.00629359669983387, -1.283991813659668, -5.647768974304199, -4.720090866088867, None]]


Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForMaskedLMOptimized: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLMOptimized from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLMOptimized from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/258k [00:00<?, ?B/s]



[-21.07635427301284]




[[None, -0.0014072287594899535, -0.07990221679210663, -2.367760419845581, -1.0058892965316772, -10.913368225097656, -6.708026885986328, None]]


Downloading:   0%|          | 0.00/478 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/359M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-char were not used when initializing BertForMaskedLMOptimized: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLMOptimized from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLMOptimized from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]



[-8.106435332003457]




[[None, -0.00017212340026162565, -7.128461584215984e-05, -0.0003200257197022438, -0.000310730334604159, -0.04472455009818077, -2.004688262939453, -0.06974268704652786, -0.013195333071053028, -5.973210334777832, None]]


Downloading:   0%|          | 0.00/478 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/359M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-char-whole-word-masking were not used when initializing BertForMaskedLMOptimized: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLMOptimized from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLMOptimized from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]



[-20.89231112599373]
[[None, -4.659210205078125, -0.42354074120521545, -0.2219197154045105, -1.5305646657943726, -1.9007188081741333, -3.5157885551452637, -0.7679501175880432, -1.0874181985855103, -6.785200119018555, None]]
