# Language Model (ITA)

https://machinelearningmastery.com/develop-word-based-neural-language-models-python-keras/

In [148]:
import numpy as np
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import Model
from keras.layers import Dense, Input, Dropout, LSTM, Embedding, Activation


In [149]:
# set tu True to use pretrained word2vec embedding matrix
USE_WORD2VEC = False

In [150]:
# source text
sentence = """dovendo fare delle lunghe passeggiate
       meglio indossare scarpe comode.
       Per colazione ho mangiato pane e marmellata con un succo di arancia,
       Lucia invece ha preso la torta con succo alla pesca,
       Giovanni un cornetto e spremuta di arancia e un succo alla pera.
       L'arancia, la pesca e la pera sono frutti.
       l'albero della mela.
       """

### Read the Word2Vec file

In [151]:
def read_word2vec_matrix(word2vec_file):
    with open(word2vec_file, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        words = set()
        word_to_vec_map = {}

        line = f.readline().strip().split()
        m = line[0] # number of rows
        n = line[1] # number of cols

        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)

    return words, word_to_vec_map

### Tokenize the sentence
Keras provides the Tokenizer class that can be used to perform this encoding. First, the Tokenizer is fit on the words list (the vocabulary) to develop the mapping from words to unique integers. Then sequences of text can be converted to sequences of integers by calling the texts_to_sequences() function.

In [152]:
def tokenize(sentence, vocab):

    tokenizer = Tokenizer(
         filters='!"#$%&()\'*+,-./:;<=>?@[\\]^_`{|}~\t\n',
         lower=True, 
         split=' '
    )
    tokenizer.fit_on_texts([vocab])
    encoded = tokenizer.texts_to_sequences([sentence])[0]

    return np.array(encoded), len(tokenizer.word_index), tokenizer

In [153]:
if USE_WORD2VEC :
    # words: set of words in the vocabulary
    # word_to_vec_map: dictionary mapping words to their Word2Vec vector representation
    words, word_to_vec_map = read_word2vec_matrix('../../../nlp/model.50000.txt')
    vocab = list(words)
else :
    vocab = sentence


In [154]:
encoded, vocab_size, tokenizer = tokenize(sentence, vocab)
print('Vocab size: ', vocab_size)
print('Shape of encoded: ', encoded.shape)

encoded

Vocab size:  39
Shape of encoded:  (56,)


array([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  1, 26,  6,
        2,  3,  7,  4, 27, 28, 29, 30,  5, 31,  6,  3,  8,  9, 32,  2, 33,
        1, 34,  7,  4,  1,  2,  3,  8, 10, 11,  4,  5,  9,  1,  5, 10, 35,
       36, 11, 37, 38, 39])

### Create the sequences

In [155]:
def create_sequences(encoded, sequence_length):
    
    sequences = np.zeros((len(encoded), sequence_length))
    for i in range(0, len(encoded) - 1):
        sequence = encoded[i:i+sequence_length]
        missing_words = sequence_length - sequence.shape[0]
        if missing_words > 0:
            sequences[i,:-missing_words] = sequence
        else:
            sequences[i,:] = sequence

    return sequences

In [156]:
sequence_length = 3
sequences = create_sequences(encoded, sequence_length)
print('Shape of sequences: ', sequences.shape)

Shape of sequences:  (56, 3)


### Create X_train and Y_Train

In [157]:
X_train = sequences[:,:-1]
Y_train = sequences[:,-1]
print('Shape of X_train: ', X_train.shape)
print('Shape of Y_train: ',Y_train.shape)

Shape of X_train:  (56, 2)
Shape of Y_train:  (56,)


### One-Hot encoding
Encode output to one-hot vector

In [158]:
n_classes = vocab_size + 1
Y_train = keras.utils.to_categorical(Y_train, num_classes = n_classes)

In [159]:
def exists_word(word):

    try:
        word_to_vec_map[word]
        found = True
    except:
        found = False

    return found


In [160]:
w = 'il'
exists = exists_word(w)
if exists: print('the word "', w, '" exists in the vocabulary') 
else : print('the word "', w, '" doesen\'t exist in the vocabulary')



the word " il " doesen't exist in the vocabulary


### Create the pretrained embedding layer

In [161]:
def pretrained_embedding_layer(word_to_vec_map, vocab_len, emb_dim):
       
    emb_matrix = np.zeros((vocab_len + 1, emb_dim))
    
    # Set each row "index" of the embedding matrix to be the word vector representation of the "index"th word of the vocabulary
    index = 1
    for word in word_to_vec_map:
        emb_matrix[index, :] = word_to_vec_map[word]
        index += 1

    # Define Keras embedding layer with the correct output/input sizes, make it trainable. Use Embedding(...). Make sure to set trainable=False. 
    embedding_layer = Embedding(vocab_len + 1, emb_dim)
    embedding_layer.trainable = False

    # Build the embedding layer, it is required before setting the weights of the embedding layer. Do not modify the "None".
    embedding_layer.build((None,))
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

### Create the embedding layer

In [162]:
def embedding_layer(vocab_len, emb_dim):   
       
    # Define Keras embedding layer
    embedding_layer = Embedding(vocab_len + 1, emb_dim)
    embedding_layer.trainable = True
       
    return embedding_layer

### Create the model

In [163]:
def create_model(input_shape, n_classes, embedding_layer):

        # Create the input layer
    input_layer = Input(shape=input_shape, dtype=np.int32)
        
    # Propagate input_layer through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(input_layer)   
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a batch of sequences.
    X = LSTM(128, return_sequences=True)(embeddings)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a single hidden state, not a batch of sequences.
    X = LSTM(128)(X)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X through a Dense layer with softmax activation to get back a batch of n-dimensional vectors.
    X = Dense(n_classes, activation='softmax')(X)
    
    # Create Model instance which converts input_layer into X.
    model = Model(input_layer, X)
    
   
    return model

In [164]:
if(USE_WORD2VEC):
    model = create_model((X_train.shape[1],), n_classes, pretrained_embedding_layer(word_to_vec_map, vocab_size, 100))
else :
    model = create_model((X_train.shape[1],), n_classes, embedding_layer(vocab_size, 10))
    
model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 2)]               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 2, 10)             400       
_________________________________________________________________
lstm_14 (LSTM)               (None, 2, 128)            71168     
_________________________________________________________________
dropout_14 (Dropout)         (None, 2, 128)            0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_15 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 40)                5160

### Compile and fit the model

In [165]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit network
model.fit(X_train, Y_train, epochs=500, verbose=2)

Epoch 1/500
2/2 - 3s - loss: 3.6891 - accuracy: 0.0179
Epoch 2/500
2/2 - 0s - loss: 3.6877 - accuracy: 0.0714
Epoch 3/500
2/2 - 0s - loss: 3.6865 - accuracy: 0.0536
Epoch 4/500
2/2 - 0s - loss: 3.6853 - accuracy: 0.0536
Epoch 5/500
2/2 - 0s - loss: 3.6836 - accuracy: 0.0357
Epoch 6/500
2/2 - 0s - loss: 3.6822 - accuracy: 0.0714
Epoch 7/500
2/2 - 0s - loss: 3.6811 - accuracy: 0.0893
Epoch 8/500
2/2 - 0s - loss: 3.6795 - accuracy: 0.0714
Epoch 9/500
2/2 - 0s - loss: 3.6778 - accuracy: 0.0893
Epoch 10/500
2/2 - 0s - loss: 3.6763 - accuracy: 0.0893
Epoch 11/500
2/2 - 0s - loss: 3.6735 - accuracy: 0.0893
Epoch 12/500
2/2 - 0s - loss: 3.6731 - accuracy: 0.0714
Epoch 13/500
2/2 - 0s - loss: 3.6709 - accuracy: 0.1071
Epoch 14/500
2/2 - 0s - loss: 3.6679 - accuracy: 0.0714
Epoch 15/500
2/2 - 0s - loss: 3.6655 - accuracy: 0.0714
Epoch 16/500
2/2 - 0s - loss: 3.6613 - accuracy: 0.0714
Epoch 17/500
2/2 - 0s - loss: 3.6596 - accuracy: 0.0893
Epoch 18/500
2/2 - 0s - loss: 3.6564 - accuracy: 0.0714
E

<keras.callbacks.History at 0x7f0eae499c88>

### Evaluate

In [170]:
in_text = "succo alla"
test_encoded = tokenizer.texts_to_sequences([in_text])[0]
test_encoded = np.array(test_encoded)
test_encoded = np.reshape(test_encoded, (1,2))

Y = model.predict(test_encoded, verbose=0)
print(test_encoded.shape)
print (in_text)
top5 = np.argsort(Y)[0,-5:]
top5 = top5[::-1]
for n in top5:
    print ('     -->', str(tokenizer.index_word.get(n)).ljust(10), '{:.2f}%'.format(Y[0,n] * 100))


(1, 2)
succo alla
     --> pesca      51.48%
     --> pera       44.08%
     --> e          1.84%
     --> arancia    0.85%
     --> sono       0.78%


In [167]:
encoded
for i in range(encoded.shape[0]):
    test_encoded = encoded[i:i+2]
    if test_encoded.shape[0] == 2:
        test_encoded = np.reshape(test_encoded, (1, 2))
        Y = model.predict(test_encoded, verbose=0)
        print (tokenizer.index_word.get(test_encoded[0,0]), tokenizer.index_word.get(test_encoded[0,1]), '-->', tokenizer.index_word.get(np.argmax(Y)))


dovendo fare --> delle
fare delle --> lunghe
delle lunghe --> passeggiate
lunghe passeggiate --> meglio
passeggiate meglio --> indossare
meglio indossare --> scarpe
indossare scarpe --> comode
scarpe comode --> per
comode per --> colazione
per colazione --> ho
colazione ho --> mangiato
ho mangiato --> pane
mangiato pane --> e
pane e --> marmellata
e marmellata --> con
marmellata con --> un
con un --> succo
un succo --> di
succo di --> arancia
di arancia --> lucia
arancia lucia --> invece
lucia invece --> ha
invece ha --> preso
ha preso --> la
preso la --> torta
la torta --> con
torta con --> succo
con succo --> alla
succo alla --> pesca
alla pesca --> giovanni
pesca giovanni --> un
giovanni un --> cornetto
un cornetto --> e
cornetto e --> spremuta
e spremuta --> di
spremuta di --> arancia
di arancia --> lucia
arancia e --> un
e un --> succo
un succo --> di
succo alla --> pesca
alla pera --> l
pera l --> arancia
l arancia --> la
arancia la --> pesca
la pesca --> e
pesca e --> la
e la --