In [1]:
# data from https://github.com/cbaziotis/ekphrasis/blob/master/ekphrasis/utils/helpers.py
# reuploaded to husein's S3
# !wget https://malaya-dataset.s3-ap-southeast-1.amazonaws.com/counts_1grams.txt

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [3]:
with open('counts_1grams.txt') as fopen:
    f = fopen.read().split('\n')[:-1]
    
words = {}
for l in f:
    w, c = l.split('\t')
    c = int(c)
    words[w] = c + words.get(w, 0)

In [4]:
# original from https://github.com/cbaziotis/ekphrasis/blob/master/ekphrasis/classes/spellcorrect.py
# improved it

import re
from collections import Counter

class SpellCorrector:
    """
    The SpellCorrector extends the functionality of the Peter Norvig's
    spell-corrector in http://norvig.com/spell-correct.html
    """

    def __init__(self):
        """
        :param corpus: the statistics from which corpus to use for the spell correction.
        """
        super().__init__()
        self.WORDS = words
        self.N = sum(self.WORDS.values())
        
    @staticmethod
    def tokens(text):
        return REGEX_TOKEN.findall(text.lower())

    def P(self, word):
        """
        Probability of `word`.
        """
        return self.WORDS[word] / self.N

    def most_probable(self, words):
        _known = self.known(words)
        if _known:
            return max(_known, key=self.P)
        else:
            return []

    @staticmethod
    def edit_step(word):
        """
        All edits that are one edit away from `word`.
        """
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        """
        All edits that are two edits away from `word`.
        """
        return (e2 for e1 in self.edit_step(word)
                for e2 in self.edit_step(e1))

    def known(self, words):
        """
        The subset of `words` that appear in the dictionary of WORDS.
        """
        return set(w for w in words if w in self.WORDS)

    def edit_candidates(self, word, assume_wrong=False, fast=True):
        """
        Generate possible spelling corrections for word.
        """

        if fast:
            ttt = self.known(self.edit_step(word)) or {word}
        else:
            ttt = self.known(self.edit_step(word)) or self.known(self.edits2(word)) or {word}
        
        ttt = self.known([word]) | ttt
        return list(ttt)

In [5]:
corrector = SpellCorrector()

In [6]:
possible_states = corrector.edit_candidates('eting')
possible_states

['edting',
 'reting',
 'etang',
 'eling',
 'beting',
 'eating',
 'ering',
 'eking',
 'ebing',
 'eting',
 'geting',
 'etting',
 'ating',
 'enting',
 'eying',
 'meting',
 'epting',
 'etling',
 'ting',
 'sting',
 'elting',
 'eing',
 'etin',
 'kting',
 'ewing']

In [7]:
# !wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
# !unzip uncased_L-12_H-768_A-12.zip

In [8]:
BERT_VOCAB = 'uncased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = 'uncased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG = 'uncased_L-12_H-768_A-12/bert_config.json'

In [9]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling
import tensorflow as tf
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])





  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [10]:
tokenization.validate_case_matches_checkpoint(True,BERT_INIT_CHKPNT)
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=True)




In [11]:
text = 'scientist suggests eting berger can lead to obesity'
text_mask = text.replace('eting', '**mask**')
text_mask

'scientist suggests **mask** berger can lead to obesity'

In [12]:
def tokens_to_masked_ids(tokens, mask_ind):
    masked_tokens = tokens[:]
    masked_tokens[mask_ind] = "[MASK]"
    masked_tokens = ["[CLS]"] + masked_tokens + ["[SEP]"]
    masked_ids = tokenizer.convert_tokens_to_ids(masked_tokens)
    return masked_ids

In [13]:
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [14]:
class Model:
    def __init__(
        self,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=self.X,
            use_one_hot_embeddings=False)
        
        output_layer = model.get_sequence_output()
        embedding = model.get_embedding_table()
        
        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    output_layer,
                    units = bert_config.hidden_size,
                    activation = modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer = modeling.create_initializer(
                        bert_config.initializer_range
                    ),
                )
                input_tensor = modeling.layer_norm(input_tensor)
            
            output_bias = tf.get_variable(
            'output_bias',
            shape = [bert_config.vocab_size],
            initializer = tf.zeros_initializer(),
            )
            logits = tf.matmul(input_tensor, embedding, transpose_b = True)
            self.logits = tf.nn.bias_add(logits, output_bias)

In [15]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model()

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')




The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.


In [16]:
cls = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'cls')
cls

[<tf.Variable 'cls/predictions/transform/dense/kernel:0' shape=(768, 768) dtype=float32_ref>,
 <tf.Variable 'cls/predictions/transform/dense/bias:0' shape=(768,) dtype=float32_ref>,
 <tf.Variable 'cls/predictions/transform/LayerNorm/beta:0' shape=(768,) dtype=float32_ref>,
 <tf.Variable 'cls/predictions/transform/LayerNorm/gamma:0' shape=(768,) dtype=float32_ref>,
 <tf.Variable 'cls/predictions/output_bias:0' shape=(30522,) dtype=float32_ref>]

In [17]:
saver = tf.train.Saver(var_list = var_lists + cls)
saver.restore(sess, BERT_INIT_CHKPNT)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from uncased_L-12_H-768_A-12/bert_model.ckpt


In [18]:
replaced_masks = [text_mask.replace('**mask**', state) for state in possible_states]
replaced_masks

['scientist suggests edting berger can lead to obesity',
 'scientist suggests reting berger can lead to obesity',
 'scientist suggests etang berger can lead to obesity',
 'scientist suggests eling berger can lead to obesity',
 'scientist suggests beting berger can lead to obesity',
 'scientist suggests eating berger can lead to obesity',
 'scientist suggests ering berger can lead to obesity',
 'scientist suggests eking berger can lead to obesity',
 'scientist suggests ebing berger can lead to obesity',
 'scientist suggests eting berger can lead to obesity',
 'scientist suggests geting berger can lead to obesity',
 'scientist suggests etting berger can lead to obesity',
 'scientist suggests ating berger can lead to obesity',
 'scientist suggests enting berger can lead to obesity',
 'scientist suggests eying berger can lead to obesity',
 'scientist suggests meting berger can lead to obesity',
 'scientist suggests epting berger can lead to obesity',
 'scientist suggests etling berger can 

In [21]:
tokens = tokenizer.tokenize(replaced_masks[0])
input_ids = [tokens_to_masked_ids(tokens, i) for i in range(len(tokens))]
input_ids

[[101, 103, 6083, 3968, 3436, 16758, 2064, 2599, 2000, 24552, 102],
 [101, 7155, 103, 3968, 3436, 16758, 2064, 2599, 2000, 24552, 102],
 [101, 7155, 6083, 103, 3436, 16758, 2064, 2599, 2000, 24552, 102],
 [101, 7155, 6083, 3968, 103, 16758, 2064, 2599, 2000, 24552, 102],
 [101, 7155, 6083, 3968, 3436, 103, 2064, 2599, 2000, 24552, 102],
 [101, 7155, 6083, 3968, 3436, 16758, 103, 2599, 2000, 24552, 102],
 [101, 7155, 6083, 3968, 3436, 16758, 2064, 103, 2000, 24552, 102],
 [101, 7155, 6083, 3968, 3436, 16758, 2064, 2599, 103, 24552, 102],
 [101, 7155, 6083, 3968, 3436, 16758, 2064, 2599, 2000, 103, 102]]

In [22]:
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
tokens_ids

[7155, 6083, 3968, 3436, 16758, 2064, 2599, 2000, 24552]

In [24]:
def generate_ids(mask):
    tokens = tokenizer.tokenize(mask)
    input_ids = [tokens_to_masked_ids(tokens, i) for i in range(len(tokens))]
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    return tokens, input_ids, tokens_ids

In [26]:
ids = [get_score(mask) for mask in replaced_masks]
tokens, input_ids, tokens_ids = list(zip(*ids))

In [29]:
indices, ids = [], []
for i in range(len(input_ids)):
    indices.extend([i] * len(input_ids[i]))
    ids.extend(input_ids[i])

In [33]:
ids[0]

[101, 103, 6083, 3968, 3436, 16758, 2064, 2599, 2000, 24552, 102]

In [34]:
masked_padded = tf.keras.preprocessing.sequence.pad_sequences(ids,padding='post')
masked_padded.shape

(221, 11)

In [35]:
preds = sess.run(tf.nn.log_softmax(model.logits), feed_dict = {model.X: masked_padded})
preds.shape

(221, 11, 30522)

In [38]:
indices = np.array(indices)
scores = []

for i in range(len(tokens)):
    filter_preds = preds[indices == i]
    total = np.sum([filter_preds[k, k + 1, x] for k, x in enumerate(tokens_ids[i])])
    scores.append(total)
    
scores

[-70.87423,
 -63.164944,
 -62.3369,
 -63.397655,
 -69.86493,
 -45.841267,
 -62.576523,
 -57.582092,
 -73.42107,
 -71.33391,
 -70.08537,
 -67.14623,
 -67.53539,
 -62.374245,
 -61.71485,
 -60.225086,
 -73.1943,
 -73.97394,
 -67.466835,
 -63.56203,
 -67.8916,
 -65.7337,
 -67.74832,
 -73.778435,
 -62.557587]

In [39]:
prob_scores = np.array(scores) / np.sum(scores)
prob_scores

array([0.04307465, 0.03838924, 0.03788599, 0.03853067, 0.04246124,
       0.02786057, 0.03803162, 0.0349962 , 0.04462252, 0.04335402,
       0.04259521, 0.04080892, 0.04104543, 0.03790868, 0.03750793,
       0.03660251, 0.0444847 , 0.04495853, 0.04100376, 0.03863057,
       0.04126192, 0.03995043, 0.04117484, 0.04483971, 0.03802011],
      dtype=float32)

In [41]:
probs = list(zip(possible_states, prob_scores))
probs.sort(key = lambda x: x[1])  
probs

[('eating', 0.02786057),
 ('eking', 0.034996197),
 ('meting', 0.03660251),
 ('eying', 0.03750793),
 ('etang', 0.037885986),
 ('enting', 0.037908684),
 ('ewing', 0.03802011),
 ('ering', 0.03803162),
 ('reting', 0.03838924),
 ('eling', 0.038530674),
 ('sting', 0.038630575),
 ('eing', 0.039950434),
 ('etting', 0.040808916),
 ('ting', 0.041003764),
 ('ating', 0.04104543),
 ('etin', 0.04117484),
 ('elting', 0.041261923),
 ('beting', 0.042461235),
 ('geting', 0.04259521),
 ('edting', 0.04307465),
 ('eting', 0.043354023),
 ('epting', 0.044484697),
 ('ebing', 0.044622518),
 ('kting', 0.044839714),
 ('etling', 0.04495853)]