In [10]:
import logging
import os
import string

import pynput
import streamlit as st
import torch
import translators as ts
from pynput import keyboard
from transformers import BertTokenizer, BertForMaskedLM

# Initial top_k: number of candidate tokens requested from the model output
# (passed to .topk() in get_all_predictions) before decoding and filtering.
top_k = 5

def decode(tokenizer, pred_idx, top_clean):
  """Convert predicted token ids back into words.

  Each id is decoded on its own, any whitespace inside the decoded text is
  removed and the ``##`` marker is stripped. The ``[PAD]`` token, empty
  results and tokens made up solely of punctuation are dropped.

  Args:
      tokenizer (BertTokenizer): tokenizer of Bert, must provide ``decode``
      pred_idx (list): list of predicted word indexes, in ranking order
      top_clean (int): maximum number of words to keep

  Returns:
      string: the kept words joined with newlines
  """
  punctuation = set(string.punctuation)
  tokens = []
  for w in pred_idx:
    token = ''.join(tokenizer.decode(w).split())
    # Exact comparison: a substring test against punctuation + '[PAD]' would
    # also discard genuine words such as 'A', 'D' or 'PAD'.
    if token == '[PAD]':
      continue
    token = token.replace('##', '')
    # Drop empty strings and punctuation-only tokens ('.', '...', '!?').
    if not token or all(ch in punctuation for ch in token):
      continue
    tokens.append(token)
  return '\n'.join(tokens[:top_clean])

In [11]:
def encode(tokenizer, text_sentence, add_special_tokens=True):
  """Encode a sentence containing a ``<mask>`` placeholder into token ids.

  The ``<mask>`` placeholder is replaced by the tokenizer's own mask token
  before encoding.

  Args:
      tokenizer (BertTokenizer): tokenizer
      text_sentence (string): input string containing ``<mask>``
      add_special_tokens (bool, optional): forwarded to ``tokenizer.encode``. Defaults to True.

  Returns:
      tuple(torch.tensor, int): the token ids as a tensor with a leading
      batch dimension of 1, and the position of the first mask token within
      the encoded sequence

  Raises:
      IndexError: if the encoded sentence contains no mask token
  """
  text_sentence = text_sentence.replace('<mask>', tokenizer.mask_token)
  words = text_sentence.split()
  # if <mask> is the last token, append a "." so that models dont predict punctuation.
  # `words` may be empty for blank input, hence the truthiness check.
  if words and words[-1] == tokenizer.mask_token:
    text_sentence += ' .'

  # Encoding has to happen for every input, not only when the mask is the
  # last word; otherwise the return statement hits unassigned locals.
  input_ids = torch.tensor([tokenizer.encode(text_sentence, add_special_tokens=add_special_tokens)])
  mask_positions = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()
  if not mask_positions:
    raise IndexError('text_sentence does not contain a mask token')
  return input_ids, mask_positions[0]


In [12]:
def get_all_predictions(text_sentence, top_clean=5):
  """Predict candidate words for the masked position of a sentence.

  Uses the module-level ``bert_tokenizer`` and ``bert_model``.

  Args:
      text_sentence (string): string of input, containing a ``<mask>`` placeholder
      top_clean (int, optional): number of words suggested. Defaults to 5.

  Returns:
      dict: ``{'bert': words}`` where ``words`` is a newline-separated string
      of at most ``top_clean`` suggestions
  """

  input_ids, mask_idx = encode(bert_tokenizer, text_sentence)

  with torch.no_grad():
    predict = bert_model(input_ids)[0]

  scores = predict[0, mask_idx, :]
  # Request at least `top_clean` candidates: with the module-level `top_k`
  # alone, asking for more than `top_k` words could never be satisfied.
  # Clamp to the size of the last dimension so topk() cannot over-ask.
  k = min(max(top_k, top_clean), scores.size(-1))
  bert = decode(bert_tokenizer, scores.topk(k).indices.tolist(), top_clean)
  return {'bert': bert}

In [13]:

def get_prediction_eos(input_text):
  """Translate the input to English, predict the next word, translate back.

  The input is translated with ``ts.google(..., to_language='en')``, a
  ``<mask>`` is appended, the mask is predicted, and every suggestion is
  translated with ``ts.google(..., to_language='hy')``.

  Args:
      input_text (string): raw input

  Returns:
      dict or None: bert format predictions (model name mapped to a
      newline-separated string of translated suggestions), or None when
      translation or prediction fails
  """
  try:
    english_text = ts.google(input_text, to_language='en')
    res = get_all_predictions(english_text + ' <mask>', top_clean=5)

    fin_result = dict()
    for model_name, suggestions in res.items():
      translated = [
        ts.google(word, to_language='hy')
        for word in suggestions.split("\n")
        if word  # nothing to translate when no suggestion survived filtering
      ]
      fin_result[model_name] = '\n'.join(translated)
    return fin_result
  except Exception:
    # Best effort is kept (callers still receive None), but the cause is
    # logged with its traceback instead of being discarded.
    logging.exception('get_prediction_eos failed')
    return None

In [14]:
# Load the pretrained 'bert-base-uncased' tokenizer and masked-LM model.
# Both names are module-level globals read by get_all_predictions.
# .eval() is chained onto the loaded model, so bert_model holds its return value.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Ask the user for the text to complete; the result is passed to
# get_prediction_eos in the next cell.
input_text = input("Input Text")

In [16]:
# Get predictions for the input text.
# NOTE(review): get_prediction_eos returns None when anything inside it
# raises, so `res` may be None here.
res = get_prediction_eos(input_text)

In [17]:
# Collect the suggested words as a list.
# get_prediction_eos returns None when translation or prediction failed, so
# check for that instead of failing with a TypeError on res['bert'].
if res is None:
    answer = []
    print("No predictions available")
else:
    answer = res['bert'].split("\n")
    print(answer)

['հեռու', 'անկայուն', 'դուրս', 'արագ', 'ներսի']
