# Language Model (ITA)

Reference tutorial: https://machinelearningmastery.com/develop-word-based-neural-language-models-python-keras/

In [97]:
import numpy as np
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import Model
from keras.layers import Dense, Input, Dropout, LSTM, Embedding, Activation


In [98]:
# Source text: a small Italian corpus. The same string is used further down
# both as the vocabulary and as the text to encode for training.
sentence = """dovendo fare delle lunghe passeggiate
       meglio indossare scarpe comode.
       Per colazione ho mangiato pane e marmellata con un succo di arancia,
       Lucia invece ha preso la torta con un succo alla pesca,
       Giovanni un cornetto e spremuta di arancia e un succo alla pera.
       L'arancia, la pesca e la pera sono frutti.
       l'albero della mela.
       """

### Tokenize the sentence
Keras provides the `Tokenizer` class to perform this encoding. First, the `Tokenizer` is fit on the vocabulary text to build a mapping from each word to a unique integer. Then any text can be converted into a sequence of integers by calling `texts_to_sequences()`.

In [99]:
def tokenize(sentence, vocab):
    """Encode `sentence` as word indices using a tokenizer fitted on `vocab`.

    Args:
        sentence: text to convert into a sequence of word indices.
        vocab: text the tokenizer is fitted on (defines the word -> index map).

    Returns:
        A tuple `(encoded, vocab_size, tokenizer)`: the word indices of
        `sentence` as a NumPy array, the number of entries in the
        tokenizer's `word_index`, and the fitted tokenizer itself.
    """
    # Characters stripped before splitting; the apostrophe is included in the
    # list, so it is removed like the other punctuation.
    stripped_chars = '!"#$%&()\'*+,-./:;<=>?@[\\]^_`{|}~\t\n'

    tokenizer = Tokenizer(filters=stripped_chars, lower=True, split=' ')
    tokenizer.fit_on_texts([vocab])

    # texts_to_sequences works on a batch of texts; we pass one and take it back.
    word_ids = tokenizer.texts_to_sequences([sentence])[0]
    n_words = len(tokenizer.word_index)

    return np.array(word_ids), n_words, tokenizer

In [100]:
# The vocabulary is built from the very same text that gets encoded,
# so every word of `sentence` has an index.
vocab = sentence
encoded, vocab_size, tokenizer = tokenize(sentence, vocab)
print('Vocab size: ', vocab_size)
print('Shape of encoded: ', encoded.shape)

# Bare last expression: the notebook displays the encoded index array.
encoded

Vocab size:  39
Shape of encoded:  (57,)


array([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  1, 26,  6,
        2,  3,  7,  4, 27, 28, 29, 30,  5, 31,  6,  2,  3,  8,  9, 32,  2,
       33,  1, 34,  7,  4,  1,  2,  3,  8, 10, 11,  4,  5,  9,  1,  5, 10,
       35, 36, 11, 37, 38, 39])

### Create the sequences

In [101]:
def create_sequences(encoded, sequence_length, drop_incomplete=False):
    """Slice an encoded text into overlapping windows of `sequence_length` tokens.

    Row `i` of the result holds `encoded[i:i + sequence_length]`. Windows that
    run past the end of the text are right-padded with zeros.

    Args:
        encoded: 1-D array of word indices.
        sequence_length: number of tokens per window (inputs plus target).
        drop_incomplete: when True, return only the windows that are entirely
            filled with tokens (no zero padding). Defaults to False, which
            keeps one row per token.

    Returns:
        A 2-D float array (the default dtype of `np.zeros`) of shape
        `(len(encoded), sequence_length)`, or with fewer rows when
        `drop_incomplete` is True.
    """
    encoded = np.asarray(encoded)
    n_tokens = len(encoded)

    sequences = np.zeros((n_tokens, sequence_length))
    # Iterate over every token, including the last one: stopping one short
    # would leave the final row as all zeros instead of [last_token, 0, ...].
    for start in range(n_tokens):
        window = encoded[start:start + sequence_length]
        # Windows near the end are shorter; the remaining columns stay zero.
        sequences[start, :len(window)] = window

    if drop_incomplete:
        n_complete = max(n_tokens - sequence_length + 1, 0)
        sequences = sequences[:n_complete]

    return sequences

In [102]:
# Each row holds `sequence_length` word indices: the leading ones are the
# model input and the last one is the word to predict.
sequence_length = 3 # (2 input, 1 output)
sequences = create_sequences(encoded, sequence_length)
print('Shape of sequences: ', sequences.shape)

Shape of sequences:  (57, 3)


### Create X_train and Y_train

In [103]:
# All columns but the last are the input words; the last column is the
# target word the model has to predict.
X_train = sequences[:,:-1]
Y_train = sequences[:,-1]
print('Shape of X_train: ', X_train.shape)
print('Shape of Y_train: ',Y_train.shape)

Shape of X_train:  (57, 2)
Shape of Y_train:  (57,)


### One-Hot encoding
Encode the target word indices as one-hot vectors.

In [104]:
# Tokenizer word indices start at 1, so one extra class is reserved for
# index 0, which `sequences` uses as padding.
n_classes = vocab_size + 1
# Encode straight from `sequences` instead of from `Y_train` itself so this
# cell is idempotent: re-running it no longer one-hot encodes an array that
# is already one-hot encoded.
Y_train = keras.utils.to_categorical(sequences[:, -1], num_classes = n_classes)

### Create the model

In [105]:
def create_model(input_shape, n_classes, vocab_len, emb_dim, lstm_units=128, dropout_rate=0.5):
    """Build a two-layer stacked-LSTM classifier over word indices.

    Args:
        input_shape: shape of one input sample, without the batch axis.
        n_classes: size of the final softmax layer.
        vocab_len: vocabulary size; the embedding table gets `vocab_len + 1`
            rows.
        emb_dim: dimension of each word embedding.
        lstm_units: hidden-state size of both LSTM layers (default 128).
        dropout_rate: dropout probability applied after each LSTM layer
            (default 0.5).

    Returns:
        An uncompiled Keras `Model` mapping a batch of integer word indices
        to a batch of `n_classes`-dimensional softmax vectors.
    """
    # Integer word indices enter the network here.
    input_layer = Input(shape=input_shape, dtype=np.int32)

    # Look up one embedding vector per input word index.
    embeddings = Embedding(vocab_len + 1, emb_dim)(input_layer)

    # First LSTM returns the full sequence so the second LSTM can consume it.
    X = LSTM(lstm_units, return_sequences=True)(embeddings)
    X = Dropout(dropout_rate)(X)

    # Second LSTM returns only its final hidden state, not a sequence.
    X = LSTM(lstm_units)(X)
    X = Dropout(dropout_rate)(X)

    # Project the hidden state through a softmax over the output classes.
    X = Dense(n_classes, activation='softmax')(X)

    # Model instance which converts input_layer into X.
    model = Model(input_layer, X)

    return model

In [106]:
# Size of each learned word embedding vector.
embedding_dim = 30

# The input width is the number of context words per training sample.
model = create_model(
    input_shape=(X_train.shape[1],),
    n_classes=n_classes,
    vocab_len=vocab_size,
    emb_dim=embedding_dim,
)

model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 2)]               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 2, 30)             1200      
_________________________________________________________________
lstm_14 (LSTM)               (None, 2, 128)            81408     
_________________________________________________________________
dropout_14 (Dropout)         (None, 2, 128)            0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_15 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 40)                5160

### Compile and fit the model

In [107]:
# Targets are one-hot vectors, hence categorical cross-entropy as the loss.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the network on the whole corpus; verbose=2 logs one line per epoch.
# NOTE(review): no random seed is set, so results vary between runs.
model.fit(X_train, Y_train, epochs=500, verbose=2)

Epoch 1/500
2/2 - 3s - loss: 3.6892 - accuracy: 0.0175
Epoch 2/500
2/2 - 0s - loss: 3.6874 - accuracy: 0.1053
Epoch 3/500
2/2 - 0s - loss: 3.6858 - accuracy: 0.1228
Epoch 4/500
2/2 - 0s - loss: 3.6843 - accuracy: 0.1228
Epoch 5/500
2/2 - 0s - loss: 3.6818 - accuracy: 0.1228
Epoch 6/500
2/2 - 0s - loss: 3.6808 - accuracy: 0.0702
Epoch 7/500
2/2 - 0s - loss: 3.6784 - accuracy: 0.0877
Epoch 8/500
2/2 - 0s - loss: 3.6776 - accuracy: 0.0877
Epoch 9/500
2/2 - 0s - loss: 3.6743 - accuracy: 0.1404
Epoch 10/500
2/2 - 0s - loss: 3.6736 - accuracy: 0.0351
Epoch 11/500
2/2 - 0s - loss: 3.6696 - accuracy: 0.1404
Epoch 12/500
2/2 - 0s - loss: 3.6663 - accuracy: 0.1228
Epoch 13/500
2/2 - 0s - loss: 3.6654 - accuracy: 0.1404
Epoch 14/500
2/2 - 0s - loss: 3.6611 - accuracy: 0.1404
Epoch 15/500
2/2 - 0s - loss: 3.6553 - accuracy: 0.0702
Epoch 16/500
2/2 - 0s - loss: 3.6513 - accuracy: 0.1053
Epoch 17/500
2/2 - 0s - loss: 3.6464 - accuracy: 0.1228
Epoch 18/500
2/2 - 0s - loss: 3.6451 - accuracy: 0.1053
E

<keras.callbacks.History at 0x7fa872845630>

### Evaluate

In [108]:
# Prompt whose next word the model should predict.
in_text = "succo alla"

# Encode the prompt and add a leading batch axis -> shape (1, n_words).
test_encoded = np.array(tokenizer.texts_to_sequences([in_text])[0])[np.newaxis, :]

Y = model.predict(test_encoded, verbose=0)
print(test_encoded.shape)
print (in_text)

# Indices of the five most probable next words, most probable first.
top5 = np.argsort(Y)[0, -5:][::-1]
for n in top5:
    word = str(tokenizer.index_word.get(n)).ljust(10)
    probability = '{:.2f}%'.format(Y[0,n] * 100)
    print ('     -->', word, probability)


(1, 2)
succo alla
     --> pera       51.62%
     --> pesca      47.51%
     --> un         0.24%
     --> di         0.23%
     --> arancia    0.09%


In [109]:
# Number of context words fed to the model (everything but the target).
j = sequence_length - 1
for i in range(encoded.shape[0]):
    test_encoded = encoded[i:i+j]
    # Windows at the very end of the text are too short to be a full context.
    if test_encoded.shape[0] != j:
        continue

    test_encoded = test_encoded.reshape(1, j)
    Y = model.predict(test_encoded, verbose=0)

    # Rebuild the context as text, each word followed by a space.
    s = ''.join(tokenizer.index_word.get(test_encoded[0, k]) + ' ' for k in range(j))

    # Show the context next to the single most probable next word.
    print (s, '-->', tokenizer.index_word.get(np.argmax(Y)))


dovendo fare  --> delle
fare delle  --> lunghe
delle lunghe  --> passeggiate
lunghe passeggiate  --> meglio
passeggiate meglio  --> indossare
meglio indossare  --> scarpe
indossare scarpe  --> comode
scarpe comode  --> per
comode per  --> colazione
per colazione  --> ho
colazione ho  --> mangiato
ho mangiato  --> pane
mangiato pane  --> e
pane e  --> marmellata
e marmellata  --> con
marmellata con  --> un
con un  --> succo
un succo  --> alla
succo di  --> arancia
di arancia  --> lucia
arancia lucia  --> invece
lucia invece  --> ha
invece ha  --> preso
ha preso  --> la
preso la  --> torta
la torta  --> con
torta con  --> un
con un  --> succo
un succo  --> alla
succo alla  --> pera
alla pesca  --> giovanni
pesca giovanni  --> un
giovanni un  --> cornetto
un cornetto  --> e
cornetto e  --> spremuta
e spremuta  --> di
spremuta di  --> arancia
di arancia  --> lucia
arancia e  --> un
e un  --> succo
un succo  --> alla
succo alla  --> pera
alla pera  --> l
pera l  --> arancia
l arancia  --> l