Import libraries and define the main parameters

In [9]:
from sklearn.model_selection import train_test_split
import numpy as np
import pickle

# Whitelist of characters kept by clean_text; every other character is removed from the text.
characters = ['ض', 'ص', 'ث', 'ق', 'ف', 'غ', 'ع', 'ه', 'خ', 'ح', 'ج', 'چ', 'پ', 'ش', 'س', 'ی', 'ب', 'ل', 'أ' ,'ا', 'آ', 'ت',
              'ن', 'م', 'ک', 'گ', 'ظ', 'ط', 'ز', 'ر', 'ژ', 'ذ', 'د', 'ئ', 'ء', 'و', 'إ', 'ؤ', 'ي', 'ة', '۱', '۲', '۳', '۴', '۵',
              '۶', '۷', '۸', '۹', '۰', ' ']
# Persian digits, code points U+06F0 through U+06F9 (stored with the saved parameters).
persian_numbers = ['\u06F0', '\u06F1', '\u06F2', '\u06F3', '\u06F4', '\u06F5', '\u06F6', '\u06F7', '\u06F8', '\u06F9']

# Number of preceding words used as the context for each next-word sample.
number_of_previous_words = 4
# Base directory prepended to the data file paths used below.
current_path = 'drive/My Drive/Colab Notebooks/Next word prediction/for_github/'

### Clear input text
Check every character of the text and remove any character that is not in our predefined character list.

In [10]:
# Arabic code points mapped to the Persian forms used by this project.
# No target character is itself a source, so a single translate pass is
# equivalent to replacing the characters one after another.
_ARABIC_TO_PERSIAN = str.maketrans({
    # Arabic-Indic digits (U+0660..U+0669) -> Persian digits (U+06F0..U+06F9)
    '\u0660': '\u06F0',
    '\u0661': '\u06F1',
    '\u0662': '\u06F2',
    '\u0663': '\u06F3',
    '\u0664': '\u06F4',
    '\u0665': '\u06F5',
    '\u0666': '\u06F6',
    '\u0667': '\u06F7',
    '\u0668': '\u06F8',
    '\u0669': '\u06F9',
    # Arabic letter variants -> Persian letters
    '\u0643': '\u06A9',  # "ك" to "ک"
    '\u0649': '\u06CC',  # "ى" to "ی"
    '\u064A': '\u06CC',  # "ي" to "ی"
    '\u06D5': '\u0647',  # "ە" to "ه"
})


def replace_arabic_chars(text):
    """Normalise Arabic digits and letter variants to their Persian forms.

    Args:
        text: Input string.

    Returns:
        A copy of ``text`` in which every Arabic-Indic digit is replaced by
        the matching Persian digit and the Arabic kaf, alef maksura, yeh and
        ae are replaced by the Persian kaf, yeh, yeh and heh respectively.
        All other characters are left untouched.
    """
    return text.translate(_ARABIC_TO_PERSIAN)

def replace_english_chars(text):
    """Convert ASCII digits and the ';', '?', ',' marks to their Persian forms.

    Args:
        text: Input string.

    Returns:
        A copy of ``text`` with the listed ASCII characters substituted;
        every other character is unchanged.
    """
    persian_for_ascii = {
        '0': '۰',
        '1': '۱',
        '2': '۲',
        '3': '۳',
        '4': '۴',
        '5': '۵',
        '6': '۶',
        '7': '۷',
        '8': '۸',
        '9': '۹',
        ';': '؛',
        '?': '؟',
        ',': '،',
    }
    return text.translate(str.maketrans(persian_for_ascii))
    
def replace_other_chars(text):
    """Remove newlines and turn ZWNJ / no-break space into plain spaces.

    Args:
        text: Input string.

    Returns:
        ``text`` with every newline deleted and every zero-width non-joiner
        (U+200C) or no-break space (U+00A0) replaced by a regular space.
    """
    substitutions = (
        ('\n', ''),
        ('\u200c', ' '),
        ('\xa0', ' '),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

def clean_text(text):
    """Strip every character that is not in the module-level ``characters`` list.

    Args:
        text: Input string.

    Returns:
        A ``(cleaned_text, has_new_char)`` tuple, where ``has_new_char`` is
        True when at least one character had to be removed.
    """
    unknown = set(text).difference(characters)
    if not unknown:
        return text, False
    kept = ''.join(ch for ch in text if ch not in unknown)
    return kept, True

### load data
Load data from a text file; for every line, clean the text and extract every n-gram of the size defined in the parameters section.

In [11]:
def read_data(path, line_limit):
    """Read a text file and build (context words, next word) training samples.

    Each of the first ``line_limit`` lines is normalised and cleaned, split on
    whitespace, and turned into sliding windows of
    ``number_of_previous_words`` words followed by the word that comes next.
    Lines with too few words to form a single window are skipped and do not
    contribute to the vocabulary.

    Args:
        path: Path of the UTF-8 encoded text file.
        line_limit: Slice bound applied to the list of lines read from the file.

    Returns:
        A tuple ``(previous_words, next_words, unique_words)``:
        ``previous_words`` is a list of word lists (the contexts),
        ``next_words`` is the list of target words aligned with it, and
        ``unique_words`` is a list of the distinct words of all used lines
        (built from a set, so its order is arbitrary).
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(path, 'r', encoding='UTF-8') as f:
        lines = f.readlines()[:line_limit]

    previous_words = []
    next_words = []
    unique_words = set()
    for line in lines:
        line = replace_arabic_chars(line)
        line = replace_english_chars(line)
        line = replace_other_chars(line)
        line, _ = clean_text(line)
        words = line.split()
        if len(words) < number_of_previous_words + 1:
            continue
        # Register the vocabulary once per line rather than once per window.
        unique_words.update(words)
        for i in range(len(words) - number_of_previous_words):
            previous_words.append(words[i: i + number_of_previous_words])
            next_words.append(words[i + number_of_previous_words])
    return previous_words, next_words, list(unique_words)

# Example of the results of the "read_data" function, limited to the first 100 lines of the training set
previous_worsds, next_words, unique_words = read_data(current_path + 'data/training_set.txt', 100)
print(previous_worsds[:5])  # first five context windows
print(next_words[0])  # target word of the first window
print(unique_words[:10])  # ten vocabulary entries (order is arbitrary, the list comes from a set)
print(len(unique_words))  # vocabulary size

[['بزرگترین', 'واحد', 'تولید', 'پارازایلین'], ['واحد', 'تولید', 'پارازایلین', 'خاورمیانه'], ['تولید', 'پارازایلین', 'خاورمیانه', 'به'], ['پارازایلین', 'خاورمیانه', 'به', 'زودی'], ['خاورمیانه', 'به', 'زودی', 'افتتاح']]
خاورمیانه
['نقالی', 'سپه', 'مستحق', 'وارد', 'یابند', 'هایده', 'نگاه', 'اصلاحی', 'توپ', 'هندو']
6218


### word indexing
Create two dictionaries: one mapping each unique word to its index, and one mapping each index back to its unique word.
"PAD" (index 0) is used to pad the matrix for short sentences.


In [12]:
def word_indexing(unique_words):
    """Build the word-to-index and index-to-word lookup tables.

    Words are numbered from 1 in iteration order; index 0 is reserved for the
    'PAD' token in both tables.

    Args:
        unique_words: Iterable of vocabulary words.

    Returns:
        A ``(word2index, index2word)`` tuple of dictionaries.
    """
    word2index = {word: idx for idx, word in enumerate(unique_words, start=1)}
    word2index['PAD'] = 0

    index2word = {idx: word for idx, word in enumerate(unique_words, start=1)}
    index2word[0] = 'PAD'

    return word2index, index2word

# Build both lookup tables from the vocabulary returned by read_data.
word2index, index2word = word_indexing(unique_words)

### train and test data
Split previous_worsds and next_words into train and test data and labels.

In [13]:
def create_train_test_data(previous_worsds, next_words, test_size=0.05, random_state=None):
    """Split the samples and their labels into train and test subsets.

    Args:
        previous_worsds: Sequence of context word lists.
        next_words: Sequence of target words aligned with ``previous_worsds``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.05, the
            value that was previously hard-coded.
        random_state: Forwarded to ``train_test_split``; pass a value to make
            the split reproducible. The default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        A tuple ``(data_train, data_test, labels_train, labels_test)``.
    """
    data_train, data_test, labels_train, labels_test = train_test_split(
        previous_worsds, next_words, test_size=test_size, random_state=random_state)
    return data_train, data_test, labels_train, labels_test

# Split the samples produced by read_data into train and test subsets.
data_train, data_test, labels_train, labels_test = create_train_test_data(previous_worsds, next_words)

### create matrices
Use numpy to create numerical matrices of data and labels for train and test.

Save these matrices and the main parameters as pickled objects.

In [14]:
def save_data(data, path):
    """Pickle ``data`` to the file at ``current_path + path``.

    Args:
        data: Object to serialise with ``pickle.dump``.
        path: File path relative to the module-level ``current_path``.
    """
    # The context manager closes the file even if pickling raises.
    with open(current_path + path, 'wb') as f:
        pickle.dump(data, f)

def create_matrices(previous_worsds, next_words, word2index, data_path, label_path):
    """Encode contexts and labels as integer matrices and pickle them.

    Args:
        previous_worsds: Sequence of context word lists, one per sample.
        next_words: Sequence of target words aligned with ``previous_worsds``.
        word2index: Mapping from word to integer index.
        data_path: Path (passed to ``save_data``) for the context matrix.
        label_path: Path (passed to ``save_data``) for the label matrix.
    """
    sample_count = len(previous_worsds)
    data = np.zeros((sample_count, number_of_previous_words), dtype=int)
    labels = np.zeros((len(next_words), 1), dtype=int)

    for row, context in enumerate(previous_worsds):
        # Cells not overwritten here keep their initial value of 0.
        for col, token in enumerate(context):
            data[row, col] = word2index[token]
        labels[row] = word2index[next_words[row]]

    for matrix, target_path in ((data, data_path), (labels, label_path)):
        save_data(matrix, target_path)

# Build and pickle the index matrices for the train and test splits.
create_matrices(data_train, labels_train, word2index, 'data/data_train.p', 'data/labels_train.p')
create_matrices(data_test, labels_test, word2index, 'data/data_test.p', 'data/labels_test.p')
# Persist the preprocessing parameters and both vocabulary mappings next to the matrices.
parameters = {'number_of_previous_words': number_of_previous_words, 'persian_numbers': persian_numbers, 'characters': characters}
save_data(parameters, 'data/parameters.p')
save_data(word2index, 'data/word2index.p')
save_data(index2word, 'data/index2word.p')