### Import libraries

In [2]:
from keras.models import load_model
import numpy as np

import pickle
import heapq

# Root folder that every data/model path below is appended to
# (relative path; presumably a mounted Google Drive folder in Colab - confirm).
current_path = 'drive/My Drive/Colab Notebooks/Next word prediction/for_github/'
# Number of context words fed to the model; reassigned further down from the
# saved parameters file.
number_of_previous_words = 4
# Empty placeholders; reassigned further down by load_words_from_file().
word2index = {}
index2word = {}
# Kept for reference only (commented out): `characters` and `persian_numbers`
# are read from the parameters file instead of being hard-coded here.
# characters = ['ض', 'ص', 'ث', 'ق', 'ف', 'غ', 'ع', 'ه', 'خ', 'ح', 'ج', 'چ', 'پ', 'ش', 'س', 'ی', 'ب', 'ل', 'أ' ,'ا', 'آ', 'ت',
#               'ن', 'م', 'ک', 'گ', 'ظ', 'ط', 'ز', 'ر', 'ژ', 'ذ', 'د', 'ئ', 'ء', 'و', 'إ', 'ؤ', 'ي', 'ة', '۱', '۲', '۳', '۴', '۵',
#               '۶', '۷', '۸', '۹', '۰', ' ']
# persian_numbers = ['\u06F0', '\u06F1', '\u06F2', '\u06F3', '\u06F4', '\u06F5', '\u06F6', '\u06F7', '\u06F8', '\u06F9']

### set parameters
Load the parameters from file and assign them to variables.

This parameters file is produced by the `preprocess_data` file.

In [3]:
def set_parameters(path):
    """
      Load the preprocessing parameters from a pickle file.

      `path` is appended to the module-level `current_path`. The unpickled object
      must support lookup of the keys 'number_of_previous_words', 'characters'
      and 'persian_numbers'; a missing key raises KeyError.

      Returns the tuple (number_of_previous_words, characters, persian_numbers).
    """
    # NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
    # The `with` block guarantees the file is closed even when unpickling fails.
    with open(current_path + path, 'rb') as f:
        parameters = pickle.load(f)
    return (
        parameters['number_of_previous_words'],
        parameters['characters'],
        parameters['persian_numbers'],
    )

# Replace the default set earlier with the values stored in the parameters file,
# and define `characters` / `persian_numbers` used by the cleaning helpers.
number_of_previous_words, characters, persian_numbers = set_parameters('data/parameters.p')

### load files
Load the files that were saved during data preprocessing.



In [4]:
def load_words_from_file():
    """
      Read the two pickled vocabulary lookups stored under `current_path`.

      Returns the tuple (word2index, index2word), in that order.
    """
    loaded = []
    for file_name in ('data/word2index.p', 'data/index2word.p'):
        with open(current_path + file_name, 'rb') as handle:
            loaded.append(pickle.load(handle))
    return loaded[0], loaded[1]

# Fill the word <-> index lookup tables from the pickled files.
word2index, index2word = load_words_from_file()

### load trained model

In [5]:
# Load the trained model stored as model/model.h5 under current_path.
model = load_model(current_path + 'model/model.h5')

### Test new input text 
The following functions prepare the input text and get the model's results for it.

In [6]:
def replace_arabic_chars(text):
    """
      Map Arabic digits and Arabic letter variants in `text` to their Persian forms.

      Every mapping is one character to one character, so the length of the
      text is preserved. Characters without a mapping are left untouched.

      Returns the converted string.
    """
    arabic_to_persian = {
        # Arabic-Indic digits (U+0660..U+0669) -> Extended Arabic-Indic digits (U+06F0..U+06F9)
        '\u0660': '\u06F0',
        '\u0661': '\u06F1',
        '\u0662': '\u06F2',
        '\u0663': '\u06F3',
        '\u0664': '\u06F4',
        '\u0665': '\u06F5',
        '\u0666': '\u06F6',
        '\u0667': '\u06F7',
        '\u0668': '\u06F8',
        '\u0669': '\u06F9',
        # Letter variants
        '\u0643': '\u06A9',  # ARABIC LETTER KAF -> KEHEH
        '\u0649': '\u06CC',  # ARABIC LETTER ALEF MAKSURA -> FARSI YEH
        '\u064A': '\u06CC',  # ARABIC LETTER YEH -> FARSI YEH
        '\u06D5': '\u0647',  # ARABIC LETTER AE -> HEH
    }
    # A single translate() pass applies all mappings at once.
    return text.translate(str.maketrans(arabic_to_persian))

def replace_english_chars(text):
    """
      Replace ASCII digits and the punctuation marks ; ? , with their Persian counterparts.

      Returns the converted string; all other characters are left as they are.
    """
    substitutions = {
        '0': '۰',
        '1': '۱',
        '2': '۲',
        '3': '۳',
        '4': '۴',
        '5': '۵',
        '6': '۶',
        '7': '۷',
        '8': '۸',
        '9': '۹',
        ';': '؛',
        '?': '؟',
        ',': '،',
    }
    return text.translate(str.maketrans(substitutions))

def replace_other_chars(text):
    """
      Drop newlines and turn zero-width non-joiners and non-breaking spaces into plain spaces.
    """
    table = {
        ord('\n'): None,     # newline is removed outright
        ord('\u200c'): ' ',  # zero-width non-joiner
        ord('\xa0'): ' ',    # non-breaking space
    }
    return text.translate(table)

def clean_text(text):
    """
      Strip every character of `text` that is not listed in the global `characters`.

      Returns the tuple (cleaned_text, has_new_char), where has_new_char is True
      when at least one character was removed.
    """
    allowed = set(characters)
    kept = [ch for ch in text if ch in allowed]
    # Something was removed exactly when fewer characters survive than came in.
    return ''.join(kept), len(kept) != len(text)

def prepare_input(text):
    """
      Build the model input vector for the given text.

      The text is normalised first (whitespace collapsed, Arabic/English characters
      mapped to Persian ones, unknown characters removed) and only then cut down to
      its last `number_of_previous_words` words. Truncating after normalisation
      matters because normalisation can change the word count: a zero-width
      non-joiner becomes a space (one word turns into two) and a word made only of
      unknown characters disappears.

      Returns an int array of shape (1, number_of_previous_words). Word indices are
      right-aligned; unused leading positions stay 0.

      Raises KeyError when one of the kept words is missing from `word2index`.
    """
    # Collapse all whitespace up front: replace_other_chars() deletes newlines, which
    # would otherwise glue together words that sit on different lines.
    text = ' '.join(text.split())
    text = replace_arabic_chars(text)
    text = replace_english_chars(text)
    text = replace_other_chars(text)
    text, _ = clean_text(text)
    # Keep at most n words so the offset below can never become negative.
    words = text.split()[-number_of_previous_words:]
    input_vector = np.zeros((1, number_of_previous_words), dtype=int)
    offset = number_of_previous_words - len(words)
    for t, word in enumerate(words):  # write the indices at the end of the vector
        input_vector[0, offset + t] = word2index[word]
    return input_vector

In [7]:
def sample(preds, top_n=3):
    """
      Select the `top_n` best predictions of the model.

      `preds` is a 1-D sequence of non-negative scores; it is rescaled so that it
      sums to 1 before the best entries are picked.

      Returns [indices, probabilities]: two lists ordered from the most to the
      least probable entry. On ties the lower index comes first.
    """
    preds = np.asarray(preds).astype('float64')
    # Renormalise directly. Going through log() and exp() changes nothing for
    # positive values and emits a divide-by-zero RuntimeWarning for exact zeros.
    preds = preds / np.sum(preds)
    res_index = heapq.nlargest(top_n, range(len(preds)), key=preds.take)
    res_prob = [preds[i] for i in res_index]
    return [res_index, res_prob]

In [9]:
def predict_completions(text, n=3):
    """
      Predict the n best next words for the input text.

      Uses prepare_input() to build the input vector and sample() to pick the best
      entries of the model output, then looks each index up in `index2word`.
    """
    input_vector = prepare_input(text)
    probabilities = model.predict(input_vector, verbose=0)[0]
    best_indices, _ = sample(probabilities, n)
    return [index2word[i] for i in best_indices]

Read an input text and predict the 3 best next words for it.

In [11]:
# Read a phrase from the user, then show it together with the three best next words.
test = input()
next_word = predict_completions(test, n=3)
print('input text: ', test, sep='')
print('next word: ', next_word, sep='')

مجلس شورای

input text: مجلس شورای
next word: ['اسلامی', 'شد', 'تواند']

