In [11]:
import pandas as pd

from transformers import pipeline, BertTokenizerFast

In [2]:
TEST_PATH = './test-mlm'

## Test Mask Prediction

In [3]:
fill_mask = pipeline(
    "fill-mask",
    model=TEST_PATH,
    tokenizer=TEST_PATH
)

Some weights of BertModel were not initialized from the model checkpoint at ./test-mlm and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
result = fill_mask("The [MASK] prescribed me some medicine")
result

[{'sequence': 'the doctor prescribed me some medicine',
  'score': 0.226003497838974,
  'token': 3460,
  'token_str': 'doctor'},
 {'sequence': 'the nurse prescribed me some medicine',
  'score': 0.030406033620238304,
  'token': 6821,
  'token_str': 'nurse'},
 {'sequence': 'the doctors prescribed me some medicine',
  'score': 0.028362751007080078,
  'token': 7435,
  'token_str': 'doctors'},
 {'sequence': 'the man prescribed me some medicine',
  'score': 0.027514001354575157,
  'token': 2158,
  'token_str': 'man'},
 {'sequence': 'the lady prescribed me some medicine',
  'score': 0.01853339932858944,
  'token': 3203,
  'token_str': 'lady'}]

## Test Tokenization

In [6]:
# load tokenizer
tokenizer = BertTokenizerFast.from_pretrained(TEST_PATH)

In [40]:
TEST_SENTENCE = '[USER], yet you keep replying long after I stopped. [EMOJI] [URL]'

# test tokenizer
print(f'tokenized sequence: {tokenizer.tokenize(TEST_SENTENCE)} \n')
print(f'tokens in sequence: {len(tokenizer.tokenize(TEST_SENTENCE))}')
print(f'characters in sequence: {len((TEST_SENTENCE))} \n')

# print special tokens
tokenizer.get_special_tokens_mask

tokenized sequence: ['[USER]', ',', 'yet', 'you', 'keep', 'reply', '##ing', 'long', 'after', 'i', 'stopped', '.', '[EMOJI]', '[URL]'] 

tokens in sequence: 14
characters in sequence: 65 



<bound method PreTrainedTokenizerBase.get_special_tokens_mask of PreTrainedTokenizerFast(name_or_path='./test-mlm', vocab_size=30522, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]', 'additional_special_tokens': ['[USER]', '[EMOJI]', '[URL]']})>

## Check distribution of lengths of tokenized vectors to set max_sequence_length

In [34]:
seq_length_df = pd.read_csv("../0_data/clean/labelled_ghc/train_random.csv")

seq_length_df['tokenized_length'] = seq_length_df.clean_text.apply(lambda x: len(tokenizer.tokenize(x)))

print(seq_length_df.tokenized_length.describe(),'\n')

n_total = seq_length_df.shape[0]
for cutoff in [64, 128, 256, 512]:
    n_affected = seq_length_df[seq_length_df.tokenized_length>cutoff].shape[0]
    print('cutoff {}: affects {:.1%} of data'.format(cutoff, n_affected/n_total))

count    20680.000000
mean        33.637331
std         33.723105
min          1.000000
25%         15.000000
50%         25.000000
75%         45.000000
max        826.000000
Name: tokenized_length, dtype: float64 

cutoff 64: affects 11.7% of data
cutoff 128: affects 0.7% of data
cutoff 256: affects 0.3% of data
cutoff 512: affects 0.1% of data
