In [2]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM, Embedding, Dense
from keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [4]:
file = open('data/tir.txt', 'r', encoding='utf8')

lines = []
for i in file:
    lines.append(i)
    
# Convert list to string
data = ''
for i in lines:
    data = ' '.join(lines)
    
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('"', '').replace("'", '')

data = data.split()
data = ' '.join(data)
data[:500]

'ሓይሊ ኣየር እስራኤል ንዝተተኮሱ ሚሳይላትን ሞርታርን ግብረ መልሲ ኣብ ልዕሊ ካብ ጋዛ ናብ እስራኤል ዝወስዱ ናይ ውሽጣ ውሽጢ መተሓላለፍታት መጥቃዕቲ ፈንዩ። ብህላወ እግዚኣብሄር ዘይትኣምን ኣየርላንድ፣ እየሱስ ክርስቶስ፣ መሓመድ፣ ሳልማን ሩሽዲ፣ ማርክ ትዌይንን ሊቀጳጳስ ቤኔዲክት ዓሰርተ ሽድሽተን ዝብሉ ኣስማት ምሕታም ዝኽልክል ሓድሽ ሕጊ እንተሃለወ’ኳ ኮነይ ኢላ ኣብ መርበብ ገፃ ከምዝጠቀዐት ተፈሊጡ። ግብፃዊያን ተመራመርቲ ስነኳዕቲ ዛጊድ ካብ ዝተረኸቡ እቲ ዝዓበየ መቓብር ኣብ ሳኳራ ኔክሮፖሊስ ረኺቦም። ኣብ ሻንሺ ቻይና ነዳዲ ዲዝል ምስ ላሓዀ ናብቲ ብሚልዮናት ንዝቝጸሩ ሰባት ምንጪ ማይ ዝዀነ ብጫ ፈለግ በጺሑ። ስሎቫክያ ድሕሪ እቲ ኣብ ልዕሊ ሓደ ሰቪላዊ ሰብ ከም መፈተኒ ኢላ ዝተኸለቶ ተተኳሲ፣ ኣብ ዳብለን ኣየርላንድ ጎደና ዶርሰት ቦምባ ተተኺሉ ምህላው ዝሕብር ዓቢ መጠንቀቕታ '

In [5]:
# Tokenization
tokenizer = Tokenizer()
tir_token = tokenizer.fit_on_texts([data])

pickle.dump(tokenizer, open('token_tir.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[93, 50, 297, 4580, 4581, 4582, 247, 678, 1, 31, 4, 2718, 7, 297, 2719]

In [6]:
len(sequence_data)

44158

In [7]:
# index + 1 because index 0 will be reserved for padding

vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

13926


In [8]:
sequences = []
no_of_words = 3       # No of consecutive words used to predict

for i in range(no_of_words, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)
    
print('Length of Sequence: ', len(sequences))
sequences = np.array(sequences)
sequences[:10]

the length of sequence are:  44155


array([[  93,   50,  297, 4580],
       [  50,  297, 4580, 4581],
       [ 297, 4580, 4581, 4582],
       [4580, 4581, 4582,  247],
       [4581, 4582,  247,  678],
       [4582,  247,  678,    1],
       [ 247,  678,    1,   31],
       [ 678,    1,   31,    4],
       [   1,   31,    4, 2718],
       [  31,    4, 2718,    7]])

In [9]:
X = []
y = []

for i in sequences:
    X.append(i[0:3])
    y.append(i[3])
    
X = np.array(X)
y = np.array(y)

In [10]:
print(('Data: ', X[:10]))
print(('Response: ', y[:10]))

('Data: ', array([[  93,   50,  297],
       [  50,  297, 4580],
       [ 297, 4580, 4581],
       [4580, 4581, 4582],
       [4581, 4582,  247],
       [4582,  247,  678],
       [ 247,  678,    1],
       [ 678,    1,   31],
       [   1,   31,    4],
       [  31,    4, 2718]]))
('Response: ', array([4580, 4581, 4582,  247,  678,    1,   31,    4, 2718,    7]))


In [11]:
# convert from class vector to binary class metrix

y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [12]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=3))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             139260    
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 13926)             13939926  
                                                                 
Total params: 27,128,186
Trainable params: 27,128,186
Non-trainable params: 0
_________________________________________________________________


In [16]:
from tensorflow.keras.callbacks import ModelCheckpoint

cheackpoint = ModelCheckpoint('model.h5', monitor='loss', verbose=1, save_best_only=True)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))

model.fit(X, y, epochs=60, batch_size=64, callbacks=[cheackpoint])

Epoch 1/60
Epoch 1: loss improved from inf to 8.38280, saving model to model.h5
Epoch 2/60
Epoch 2: loss improved from 8.38280 to 8.06901, saving model to model.h5
Epoch 3/60
Epoch 3: loss improved from 8.06901 to 7.76685, saving model to model.h5
Epoch 4/60
Epoch 4: loss improved from 7.76685 to 7.38738, saving model to model.h5
Epoch 5/60
Epoch 5: loss improved from 7.38738 to 6.95747, saving model to model.h5
Epoch 6/60
Epoch 6: loss improved from 6.95747 to 6.44851, saving model to model.h5
Epoch 7/60
Epoch 7: loss improved from 6.44851 to 5.85687, saving model to model.h5
Epoch 8/60
Epoch 8: loss improved from 5.85687 to 5.21679, saving model to model.h5
Epoch 9/60
Epoch 9: loss improved from 5.21679 to 4.56447, saving model to model.h5
Epoch 10/60
Epoch 10: loss improved from 4.56447 to 3.89934, saving model to model.h5
Epoch 11/60
Epoch 11: loss improved from 3.89934 to 3.25537, saving model to model.h5
Epoch 12/60
Epoch 12: loss improved from 3.25537 to 2.66619, saving model to

<keras.callbacks.History at 0x7ff7c02d6cd0>

In [17]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

model = load_model('model.h5')
tokenizer = pickle.load(open('token_tir.pkl', 'rb'))

def predict_next_word(model, tokenizer, text):
    
    sequence = tokenizer.texts_to_sequences([text])
    sequence = np.array(sequence)
    preds = np.argmax(model.predict(sequence))
    predicted_word = ''
    pred_list = []
    
    for key, val in tokenizer.word_index.items():
        # All Match
        pred_list.append(key)
        if val == preds:
            # Best Match
            predicted_word = key
            break
        
    print(predicted_word, pred_list)
    return predicted_word, pred_list

In [None]:
# ነዊሕ ጕዕዞ በቲ ቀዳማይ ስጕምቲ ይጅምር ።
# ዓብዪ ተዛራባይ ዓብዪ ሓሳዊ እዩ ።
# ምስላ ኣብ ነዊሕ ተመክሮ እተመስረተ ሓጺር ምሉእ ሓሳብ እዩ ።
# ሓንቲ ኣደ ዘላቶ እሞ ገና እትበኪ ዘላ ፡ ብዘይ ኣደ እተበኪ እያ እትኸውን ።

while(True):    
    text = input('Enter your line... ')
    
    if text == '0':
        print('Execution completed...')
        break
        
    else:
        try:
            text = text.split(' ')
            text = text[-3:]
            print(text)
            
            predict_next_word(model, tokenizer, text)
            
        except Exception as e:
            print('Error occurred: ', e)
            continue

Enter your line... ነዊሕ ጕዕዞ በቲ ቀዳማይ
['ጕዕዞ', 'በቲ', 'ቀዳማይ']
ሃገራት ['ኣብ', 'እቲ', 'ናይ', 'ካብ', 'እዩ።', 'ኢትዮጵያ', 'ናብ', 'ሰባት', 'ምስ', 'ከም', 'ሃገር', 'ሓደ', 'ድማ', '።', 'ድሕሪ', 'እታ', 'ኣብቲ', 'ዓመት', 'ግዜ', 'ነቲ', 'ም', 'ጥዕና', 'ዓለም', 'ፈ', 'እውን', 'ዓ', 'ኣሎ።', 'እዚ', 'መንግስቲ', 'ዝኾነ', 'ልዕሊ', 'ዘሎ', 'ቤት', 'ከተማ', 'እዩ', 'ሚኒስትር', 'ኣፍሪካ', 'ስራሕ', 'እቶም', 'ቀዳማይ', 'ክልተ', 'ኣሜሪካ', 'ኣዲስ', 'ብሰንኪ', 'እዮም።', 'ዝሓለፈ', 'ኮይኑ', 'እንትኾን', 'ትካል', 'ኣየር', 'ኣበባ', 'ደቡብ', 'ኩባንያ', 'እዋን', 'ግጥም', 'ኣብዚ', 'ውሽጢ', 'ሚልዮን', 'ሰብ', 'እግሪ', 'ክሳብ', 'ውድድር', 'ሚኒስቴር', 'መዓልቲ', 'ውድብ', 'ኩዕሶ', 'ዝተኻየደ', 'ሕቡራት', 'ቅድሚ', 'ሰለስተ', 'ሎሚ', 'ንቲ', 'ስልጣን', 'ዝርከብ', 'ዝነበረ', 'በዓል', 'ሃገራዊ', 'ዋና', 'ክልል', 'ቫይረስ', 'ባሕሪ', 'ጋንታ', 'ሃገራት']
Enter your line... ዓብዪ ተዛራባይ ዓብዪ
['ዓብዪ', 'ተዛራባይ', 'ዓብዪ']
ኢትዮጵያ ['ኣብ', 'እቲ', 'ናይ', 'ካብ', 'እዩ።', 'ኢትዮጵያ']
Enter your line... ምስላ ኣብ ነዊሕ ተመክሮ
['ኣብ', 'ነዊሕ', 'ተመክሮ']
ኔቫዳ ['ኣብ', 'እቲ', 'ናይ', 'ካብ', 'እዩ።', 'ኢትዮጵያ', 'ናብ', 'ሰባት', 'ምስ', 'ከም', 'ሃገር', 'ሓደ', 'ድማ', '።', 'ድሕሪ', 'እታ', 'ኣብቲ', 'ዓመት', 'ግዜ', 'ነቲ', 'ም', 'ጥዕና', 'ዓለም', 'ፈ', 'እውን', 'ዓ', 'ኣሎ።', 'እዚ', 'መንግስቲ', 'ዝ