In [56]:
import transformers
from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DataCollatorForPermutationLanguageModeling,
    DataCollatorForWholeWordMask,
    HfArgumentParser,
    LineByLineTextDataset,
    LineByLineWithRefDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
    pipeline
)

In [57]:
# Model identifiers used throughout the notebook:
#   MODEL_NAME     - hub id of the stock checkpoint used as the baseline
#   MODIFIED_MODEL - local directory of the modified checkpoint under test
MODEL_NAME = 'google/electra-small-generator'
MODIFIED_MODEL = './electra_plus_contrast_mlm/'

# Local directory passed as `cache_dir` to every `from_pretrained` call.
CACHE = './hfcache'

# Configuration is read from the modified checkpoint's directory.
config = AutoConfig.from_pretrained(
    MODIFIED_MODEL,
    cache_dir=CACHE,
)

In [58]:
# The tokenizer is loaded from the stock checkpoint id (MODEL_NAME), not from
# the MODIFIED_MODEL directory; the same tokenizer object is shared by both
# pipelines built later in the notebook.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
    cache_dir=CACHE,
)

In [59]:
# Load the masked-LM weights from the modified checkpoint directory, using the
# configuration read from that same directory.
model = AutoModelForMaskedLM.from_pretrained(
    MODIFIED_MODEL,
    config=config,
    cache_dir=CACHE,
    from_tf=False,
)

# Size the embedding matrix to the tokenizer's vocabulary. Kept as the last
# expression so the resulting embedding module is displayed as the cell output.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

Embedding(30522, 128, padding_idx=0)

In [60]:
# Fill-mask pipeline around the currently loaded `model`; the prompt cells
# below call it through the short name `p`.
p = pipeline(
    'fill-mask',
    tokenizer=tokenizer,
    model=model,
)

In [61]:
# Generic prompt with one masked noun slot. `p` is whichever pipeline was built
# most recently (the MODIFIED_MODEL one when cells are run in order).
masked_sentence = f"HuggingFace is creating a {tokenizer.mask_token} that the community uses to solve NLP tasks."
p(masked_sentence)

[{'sequence': 'huggingface is creating a thing that the community uses to solve nlp tasks.',
  'score': 0.1659284383058548,
  'token': 2518,
  'token_str': 'thing'},
 {'sequence': 'huggingface is creating a one that the community uses to solve nlp tasks.',
  'score': 0.08882203698158264,
  'token': 2028,
  'token_str': 'one'},
 {'sequence': 'huggingface is creating a lake that the community uses to solve nlp tasks.',
  'score': 0.07363732159137726,
  'token': 2697,
  'token_str': 'lake'},
 {'sequence': 'huggingface is creating a quilt that the community uses to solve nlp tasks.',
  'score': 0.06791206449270248,
  'token': 27565,
  'token_str': 'quilt'},
 {'sequence': 'huggingface is creating a umbrella that the community uses to solve nlp tasks.',
  'score': 0.05017251521348953,
  'token': 12977,
  'token_str': 'umbrella'}]

In [62]:
# Negation prompt: the first sentence asserts "tall", the second negates the
# masked adjective. Run against the current pipeline `p`.
masked_sentence = f"Dan is very tall.  Dan is not {tokenizer.mask_token}."
p(masked_sentence)

[{'sequence': 'dan is very tall. dan is not tall.',
  'score': 0.9813898205757141,
  'token': 4206,
  'token_str': 'tall'},
 {'sequence': 'dan is very tall. dan is not short.',
  'score': 0.005566936451941729,
  'token': 2460,
  'token_str': 'short'},
 {'sequence': 'dan is very tall. dan is not small.',
  'score': 0.0019572521559894085,
  'token': 2235,
  'token_str': 'small'},
 {'sequence': 'dan is very tall. dan is not fat.',
  'score': 0.0010749083012342453,
  'token': 6638,
  'token_str': 'fat'},
 {'sequence': 'dan is very tall. dan is not big.',
  'score': 0.0009656721958890557,
  'token': 2502,
  'token_str': 'big'}]

In [63]:
# Contrast prompt: "but instead" precedes the masked adjective, after "smart"
# appears earlier in the sentence. Run against the current pipeline `p`.
masked_sentence = f"I thought that Mark was smart, but instead he was very {tokenizer.mask_token}."
p(masked_sentence)

[{'sequence': 'i thought that mark was smart, but instead he was very smart.',
  'score': 0.7903743386268616,
  'token': 6047,
  'token_str': 'smart'},
 {'sequence': 'i thought that mark was smart, but instead he was very stupid.',
  'score': 0.17495791614055634,
  'token': 5236,
  'token_str': 'stupid'},
 {'sequence': 'i thought that mark was smart, but instead he was very intelligent.',
  'score': 0.005439497996121645,
  'token': 9414,
  'token_str': 'intelligent'},
 {'sequence': 'i thought that mark was smart, but instead he was very wise.',
  'score': 0.0017061398830264807,
  'token': 7968,
  'token_str': 'wise'},
 {'sequence': 'i thought that mark was smart, but instead he was very brave.',
  'score': 0.0014950091717764735,
  'token': 9191,
  'token_str': 'brave'}]

In [64]:
# Contrast prompt: the masked adjective describes what was received "instead"
# of the "fun" toy that was wanted. Run against the current pipeline `p`.
masked_sentence = f"I wanted a fun toy, but I got a {tokenizer.mask_token} one instead."
p(masked_sentence)

[{'sequence': 'i wanted a fun toy, but i got a good one instead.',
  'score': 0.029720524325966835,
  'token': 2204,
  'token_str': 'good'},
 {'sequence': 'i wanted a fun toy, but i got a stupid one instead.',
  'score': 0.02953438088297844,
  'token': 5236,
  'token_str': 'stupid'},
 {'sequence': 'i wanted a fun toy, but i got a easy one instead.',
  'score': 0.028891297057271004,
  'token': 3733,
  'token_str': 'easy'},
 {'sequence': 'i wanted a fun toy, but i got a polite one instead.',
  'score': 0.024854343384504318,
  'token': 13205,
  'token_str': 'polite'},
 {'sequence': 'i wanted a fun toy, but i got a smart one instead.',
  'score': 0.01905311644077301,
  'token': 6047,
  'token_str': 'smart'}]

In [65]:
# Contrast prompt: the masked adjective describes the day that was hoped for
# "instead" of the "excessively beautiful" one. Run against the current `p`.
masked_sentence = f"The day was excessively beautiful, but she had been hoping for a {tokenizer.mask_token} one instead."
p(masked_sentence)

[{'sequence': 'the day was excessively beautiful, but she had been hoping for a beautiful one instead.',
  'score': 0.6622408032417297,
  'token': 3376,
  'token_str': 'beautiful'},
 {'sequence': 'the day was excessively beautiful, but she had been hoping for a ugly one instead.',
  'score': 0.11599694192409515,
  'token': 9200,
  'token_str': 'ugly'},
 {'sequence': 'the day was excessively beautiful, but she had been hoping for a stupid one instead.',
  'score': 0.013757024891674519,
  'token': 5236,
  'token_str': 'stupid'},
 {'sequence': 'the day was excessively beautiful, but she had been hoping for a salty one instead.',
  'score': 0.011508408933877945,
  'token': 23592,
  'token_str': 'salty'},
 {'sequence': 'the day was excessively beautiful, but she had been hoping for a nasty one instead.',
  'score': 0.009772603400051594,
  'token': 11808,
  'token_str': 'nasty'}]

In [66]:
# Baseline for comparison: the stock MODEL_NAME checkpoint.
#
# The baseline is built from its OWN configuration. Reusing `config` here
# would apply the configuration read from the MODIFIED_MODEL directory to the
# stock weights, so any setting that differs between the two checkpoints would
# silently skew the comparison. A separate name is used so that `config`
# still refers to the modified checkpoint's configuration if earlier cells
# are re-run.
baseline_config = AutoConfig.from_pretrained(MODEL_NAME, cache_dir=CACHE)

# NOTE: `model` and `p` are deliberately rebound, because the prompt cells
# that follow call `p`. After this cell runs, `p` refers to the baseline
# pipeline; re-run the earlier model/pipeline cells to get the modified one back.
model = AutoModelForMaskedLM.from_pretrained(
    MODEL_NAME,
    from_tf=False,
    config=baseline_config,
    cache_dir=CACHE,
)

# Same embedding sizing as for the modified model, so both models are
# evaluated with the same tokenizer vocabulary.
model.resize_token_embeddings(len(tokenizer))
p = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [67]:
# Same generic prompt as used earlier, now against the pipeline most recently
# bound to `p` (the MODEL_NAME baseline when cells are run in order).
masked_sentence = f"HuggingFace is creating a {tokenizer.mask_token} that the community uses to solve NLP tasks."
p(masked_sentence)

[{'sequence': 'huggingface is creating a system that the community uses to solve nlp tasks.',
  'score': 0.15001358091831207,
  'token': 2291,
  'token_str': 'system'},
 {'sequence': 'huggingface is creating a tool that the community uses to solve nlp tasks.',
  'score': 0.120941162109375,
  'token': 6994,
  'token_str': 'tool'},
 {'sequence': 'huggingface is creating a solution that the community uses to solve nlp tasks.',
  'score': 0.06042460724711418,
  'token': 5576,
  'token_str': 'solution'},
 {'sequence': 'huggingface is creating a database that the community uses to solve nlp tasks.',
  'score': 0.053126465529203415,
  'token': 7809,
  'token_str': 'database'},
 {'sequence': 'huggingface is creating a computer that the community uses to solve nlp tasks.',
  'score': 0.03361189365386963,
  'token': 3274,
  'token_str': 'computer'}]

In [68]:
# Negation prompt repeated against the current pipeline `p` (the MODEL_NAME
# baseline when cells are run in order).
masked_sentence = f"Dan is very tall.  Dan is not {tokenizer.mask_token}."
p(masked_sentence)

[{'sequence': 'dan is very tall. dan is not tall.',
  'score': 0.3833930492401123,
  'token': 4206,
  'token_str': 'tall'},
 {'sequence': 'dan is very tall. dan is not strong.',
  'score': 0.0479818657040596,
  'token': 2844,
  'token_str': 'strong'},
 {'sequence': 'dan is very tall. dan is not big.',
  'score': 0.03165790066123009,
  'token': 2502,
  'token_str': 'big'},
 {'sequence': 'dan is very tall. dan is not taller.',
  'score': 0.028651991859078407,
  'token': 12283,
  'token_str': 'taller'},
 {'sequence': 'dan is very tall. dan is not long.',
  'score': 0.024816956371068954,
  'token': 2146,
  'token_str': 'long'}]

In [69]:
# "smart ... but instead" contrast prompt repeated against the current
# pipeline `p` (the MODEL_NAME baseline when cells are run in order).
masked_sentence = f"I thought that Mark was smart, but instead he was very {tokenizer.mask_token}."
p(masked_sentence)

[{'sequence': 'i thought that mark was smart, but instead he was very smart.',
  'score': 0.6189573407173157,
  'token': 6047,
  'token_str': 'smart'},
 {'sequence': 'i thought that mark was smart, but instead he was very intelligent.',
  'score': 0.039767418056726456,
  'token': 9414,
  'token_str': 'intelligent'},
 {'sequence': 'i thought that mark was smart, but instead he was very good.',
  'score': 0.026786595582962036,
  'token': 2204,
  'token_str': 'good'},
 {'sequence': 'i thought that mark was smart, but instead he was very clever.',
  'score': 0.017436936497688293,
  'token': 12266,
  'token_str': 'clever'},
 {'sequence': 'i thought that mark was smart, but instead he was very young.',
  'score': 0.01190208736807108,
  'token': 2402,
  'token_str': 'young'}]

In [70]:
# "fun toy ... instead" contrast prompt repeated against the current
# pipeline `p` (the MODEL_NAME baseline when cells are run in order).
masked_sentence = f"I wanted a fun toy, but I got a {tokenizer.mask_token} one instead."
p(masked_sentence)

[{'sequence': 'i wanted a fun toy, but i got a new one instead.',
  'score': 0.24864517152309418,
  'token': 2047,
  'token_str': 'new'},
 {'sequence': 'i wanted a fun toy, but i got a fun one instead.',
  'score': 0.08383011817932129,
  'token': 4569,
  'token_str': 'fun'},
 {'sequence': 'i wanted a fun toy, but i got a little one instead.',
  'score': 0.05334790423512459,
  'token': 2210,
  'token_str': 'little'},
 {'sequence': 'i wanted a fun toy, but i got a good one instead.',
  'score': 0.048895809799432755,
  'token': 2204,
  'token_str': 'good'},
 {'sequence': 'i wanted a fun toy, but i got a cute one instead.',
  'score': 0.029207438230514526,
  'token': 10140,
  'token_str': 'cute'}]

In [71]:
# "excessively beautiful ... instead" contrast prompt repeated against the
# current pipeline `p` (the MODEL_NAME baseline when cells are run in order).
masked_sentence = f"The day was excessively beautiful, but she had been hoping for a {tokenizer.mask_token} one instead."
p(masked_sentence)

[{'sequence': 'the day was excessively beautiful, but she had been hoping for a better one instead.',
  'score': 0.17477267980575562,
  'token': 2488,
  'token_str': 'better'},
 {'sequence': 'the day was excessively beautiful, but she had been hoping for a new one instead.',
  'score': 0.1298745721578598,
  'token': 2047,
  'token_str': 'new'},
 {'sequence': 'the day was excessively beautiful, but she had been hoping for a good one instead.',
  'score': 0.06588517874479294,
  'token': 2204,
  'token_str': 'good'},
 {'sequence': 'the day was excessively beautiful, but she had been hoping for a bigger one instead.',
  'score': 0.04379922151565552,
  'token': 7046,
  'token_str': 'bigger'},
 {'sequence': 'the day was excessively beautiful, but she had been hoping for a different one instead.',
  'score': 0.04136064276099205,
  'token': 2367,
  'token_str': 'different'}]