I want to make a quick study into how a neural network can input & output **words**

We will explore these types of representation:
- One Hot encoding
- Raw integer
- Embeddings

!! The dataset contains word pairs with multiple possible outputs; for example, 'abalanzar' will be associated with multiple synonyms, so we will just pick the first one for the sake of simplicity (unless random noise is added to the input, the network will have a very hard time trying to associate a single input with multiple possible outputs) 

In [52]:
import re
# First, load the synonym pairs from disk.
#
# Every line of "sinonimos.txt" is stripped of punctuation and split on
# whitespace; its first two tokens form a pair. A pair is kept only when
# NEITHER of its words has been collected before, and every kept pair is
# stored in both directions (a -> b and b -> a).

pairs = []
words = []
seen_words = set()  # constant-time membership test; `words` keeps the insertion order

# The word list contains accented characters, so the encoding is stated
# explicitly instead of relying on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open("sinonimos.txt", "r", encoding="utf-8") as document:
    for line in document:
        raw_string = re.sub(r'[^\w\s]', '', line)
        pair = raw_string.split()[:2]
        if len(pair) < 2:
            # Blank or single-word line: there is no pair to extract.
            continue
        if pair[0] not in seen_words and pair[1] not in seen_words:
            pairs.append(pair)
            pairs.append(pair[::-1])
            words.extend(pair)
            seen_words.update(pair)

print("Pair count:", len(pairs))
print("Word count:", len(words))
pairs[:10]

Pair count: 4502
Word count: 4502


[['abalanzar', 'equilibrar'],
 ['equilibrar', 'abalanzar'],
 ['abecedario', 'silabario'],
 ['silabario', 'abecedario'],
 ['abertura', 'rendija'],
 ['rendija', 'abertura'],
 ['ablandar', 'molificar'],
 ['molificar', 'ablandar'],
 ['abogar', 'patrocinar'],
 ['patrocinar', 'abogar']]

In [53]:
from collections import Counter

# Tally how many times each word appears in `words`, then show the tally
# ordered from most to least frequent.
word_counter = Counter(words)
ranked_words = word_counter.most_common()
print("Unique words:", len(ranked_words))
ranked_words

Unique words: 4502


[('abalanzar', 1),
 ('equilibrar', 1),
 ('abecedario', 1),
 ('silabario', 1),
 ('abertura', 1),
 ('rendija', 1),
 ('ablandar', 1),
 ('molificar', 1),
 ('abogar', 1),
 ('patrocinar', 1),
 ('abolir', 1),
 ('derogar', 1),
 ('abominar', 1),
 ('detestar', 1),
 ('aborigen', 1),
 ('nativo', 1),
 ('abortar', 1),
 ('malparir', 1),
 ('abrasar', 1),
 ('quemar', 1),
 ('abrazar', 1),
 ('ceñir', 1),
 ('abrir', 1),
 ('perforar', 1),
 ('absorber', 1),
 ('embeber', 1),
 ('abstenerse', 1),
 ('privarse', 1),
 ('abstracción', 1),
 ('ensimismamiento', 1),
 ('abultar', 1),
 ('acrecentar', 1),
 ('abundar', 1),
 ('sobrar', 1),
 ('abusar', 1),
 ('atropellar', 1),
 ('abyección', 1),
 ('Infamia', 1),
 ('abyecto', 1),
 ('despreciable', 1),
 ('acabar', 1),
 ('terminar', 1),
 ('academia', 1),
 ('escuela', 1),
 ('acaecer', 1),
 ('suceder', 1),
 ('acalorarse', 1),
 ('exaltarse', 1),
 ('acatar', 1),
 ('obedecer', 1),
 ('acción', 1),
 ('hecho', 1),
 ('acendrar', 1),
 ('depurar', 1),
 ('acervo', 1),
 ('cúmulo', 1),
 ('a

In [54]:
from keras.preprocessing.text import Tokenizer
# Build the word -> integer dictionary from the pairs, then translate every
# pair into its integer form.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pairs)
tokenized_pairs = tokenizer.texts_to_sequences(pairs)

vocabulary = tokenizer.word_index
print("Number of words: ", len(vocabulary))
tokenized_pairs[:10]

Number of words:  4500


[[3, 4],
 [4, 3],
 [5, 6],
 [6, 5],
 [7, 8],
 [8, 7],
 [9, 10],
 [10, 9],
 [11, 12],
 [12, 11]]

In [55]:
# Helper that turns token indices back into words, to make outputs readable
def indexToWord(indices):
    """Return the words stored in `tokenizer.index_word` for `indices`.

    The result is a list in the same order as `indices`.
    """
    return [tokenizer.index_word[token_id] for token_id in indices]

indexToWord([12, 15, 34, 21])

['patrocinar', 'abominar', 'acrecentar', 'abrasar']

## in: One Hot - out: One Hot

In [56]:
from keras.utils import to_categorical
import numpy as np

# One-hot encode every token of every pair, then split the result into the
# input word (position 0 of each pair) and the target word (position 1).
one_hot_pairs = to_categorical(tokenized_pairs)
one_hot_x, one_hot_y = one_hot_pairs[:, 0], one_hot_pairs[:, 1]

print(one_hot_x.shape, one_hot_y.shape)
first_input, first_target = np.argmax(one_hot_x[0]), np.argmax(one_hot_y[0])
indexToWord([first_input, first_target])

(4502, 4501) (4502, 4501)


['abalanzar', 'equilibrar']

In [82]:
# Let's now do a very simple model that will try to learn using One Hot input and output
from keras.models import Sequential
from keras.layers import Dense

# Take the layer sizes from the encoded data instead of hard-coding 4501, so
# the cell keeps working if the dataset (and therefore the vocabulary) changes.
input_size = one_hot_x.shape[1]
output_size = one_hot_y.shape[1]

model = Sequential()
model.add(Dense(300, activation="relu", input_shape=(input_size,)))
model.add(Dense(output_size, activation="softmax"))
model.summary()

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(one_hot_x, one_hot_y, shuffle=True, epochs=5, batch_size=256)

# Test trained model on a few fixed training samples
sample_rows = [0, 100, 200, 300]
tokenized_y = np.array(tokenized_pairs)[:, 1]
expected = indexToWord(tokenized_y[sample_rows])

pred = model.predict(one_hot_x[sample_rows])
pred_tokenized = [np.argmax(p) for p in pred]
pred_words = indexToWord(pred_tokenized)

print("Expected:", expected)
print("Predicted:", pred_words)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_80 (Dense)             (None, 300)               1350600   
_________________________________________________________________
dense_81 (Dense)             (None, 4501)              1354801   
Total params: 2,705,401
Trainable params: 2,705,401
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Expected: ['equilibrar', 'infortunio', 'consciente', 'apariencia']
Predicted: ['equilibrar', 'infortunio', 'consciente', 'apariencia']


We can see that by using one-hot encodings the model can learn, but after 5 epochs it only achieves 65% accuracy. This is to be expected, as a vector of size 4501 with all zeros except for a single 1 has a VERY low activation power and it's very hard for it to learn... :(  

## in: Raw integer - out: One Hot

In [75]:
# Model
# Size the softmax layer from the targets rather than hard-coding 4501.
output_size = one_hot_y.shape[1]

model = Sequential()
model.add(Dense(300, activation="relu", input_shape=(1,))) # same as before, but it accepts a single value as input: the raw token
model.add(Dense(output_size, activation="softmax"))
model.summary()

tokenized_x = np.array(tokenized_pairs)[:, 0]

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(tokenized_x, one_hot_y, shuffle=True, epochs=5, batch_size=256)

## Test trained model
# Recompute the expected words here so this cell does not depend on a
# variable left behind by a previously executed cell.
sample_rows = [0, 100, 200, 300]
expected = indexToWord(np.array(tokenized_pairs)[sample_rows, 1])

pred = model.predict(tokenized_x[sample_rows])
pred_tokenized = [np.argmax(p) for p in pred]
pred_words = indexToWord(pred_tokenized)

print("Expected:", expected)
print("Predicted:", pred_words)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_66 (Dense)             (None, 300)               600       
_________________________________________________________________
dense_67 (Dense)             (None, 4501)              1354801   
Total params: 1,355,401
Trainable params: 1,355,401
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Expected: ['equilibrar', 'infortunio', 'consciente', 'apariencia']
Predicted: ['asomar', 'asomar', 'asomar', 'asomar']


If the input is a raw integer the model doesn't seem to learn anything

## in: Embeddings - out: One Hot

In [83]:
from keras.layers import Embedding, Reshape
# Model
# One size for both the embedding table and the softmax layer, taken from the
# one-hot targets instead of hard-coding 4501.
vocab_size = one_hot_y.shape[1]

model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=1)) # the input shape is the same, but we transform it to an embedding before passing it further
model.add(Reshape((100,)))
model.add(Dense(300, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))
model.summary()

tokenized_x = np.array(tokenized_pairs)[:, 0]

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(tokenized_x, one_hot_y, shuffle=True, epochs=5, batch_size=256)

## Test trained model
# Recompute the expected words here so this cell does not depend on a
# variable left behind by a previously executed cell.
sample_rows = [0, 100, 200, 300]
expected = indexToWord(np.array(tokenized_pairs)[sample_rows, 1])

pred = model.predict(tokenized_x[sample_rows])
pred_tokenized = [np.argmax(p) for p in pred]
pred_words = indexToWord(pred_tokenized)

print("Expected:", expected)
print("Predicted:", pred_words)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 1, 100)            450100    
_________________________________________________________________
reshape_12 (Reshape)         (None, 100)               0         
_________________________________________________________________
dense_82 (Dense)             (None, 300)               30300     
_________________________________________________________________
dense_83 (Dense)             (None, 4501)              1354801   
Total params: 1,835,201
Trainable params: 1,835,201
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Expected: ['equilibrar', 'infortunio', 'consciente', 'apariencia']
Predicted: ['equilibrar', 'impericia', 'consciente', 'impericia']


Not so good :( only 42% accuracy
Maybe the big bottleneck here is the output... let's start fiddling with that

## in: One Hot - out: Raw integer

In [84]:
# Regression variant: the network receives a one-hot word and must output the
# raw token index of its synonym as a single number.
input_size = one_hot_x.shape[1]  # taken from the data instead of hard-coding 4501

model = Sequential()
model.add(Dense(300, activation="relu", input_shape=(input_size,)))
model.add(Dense(1)) # a single linear unit (no activation), because the output must be a plain value
model.summary()

# "accuracy" is not a meaningful metric for a regression target, so the mean
# absolute error (in token-index units) is reported instead.
model.compile(loss="mse", optimizer="adam", metrics=["mae"])
tokenized_y = np.array(tokenized_pairs)[:, 1]
model.fit(one_hot_x, tokenized_y, epochs=5, batch_size=256)

# Test trained model
sample_rows = [0, 100, 200, 300]
expected = indexToWord(tokenized_y[sample_rows])

pred = model.predict(one_hot_x[sample_rows])
# Round to the nearest token index and clamp it to the valid range: the
# regression output is unbounded and an out-of-range index has no word.
# Index 0 is excluded because the tokenizer numbers words starting at 1.
max_token = input_size - 1
pred_tokenized = [int(x[0]) for x in np.clip(np.rint(pred), 1, max_token)]
print(pred_tokenized)
pred_words = indexToWord(pred_tokenized)

print("Expected:", expected)
print("Predicted:", pred_words)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_84 (Dense)             (None, 300)               1350600   
_________________________________________________________________
dense_85 (Dense)             (None, 1)                 301       
Total params: 1,350,901
Trainable params: 1,350,901
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[4, 4, 4, 4]
Expected: ['equilibrar', 'infortunio', 'consciente', 'apariencia']
Predicted: ['equilibrar', 'equilibrar', 'equilibrar', 'equilibrar']


Using a raw integer as output also doesn't work :( The model has a very high loss that seems to be decreasing, but very VERY slowly. Also, the model risks outputting the wrong word just because the value was 34.4 instead of 34.6 when the expected value is 35

## in: One Hot - out: Embeddings

In [86]:
# Layer sizes come from the data instead of a hard-coded 4501.
vocab_size = one_hot_x.shape[1]
# Integer class labels; defined before fit() so the cell does not rely on a
# variable left behind by a previously executed cell.
tokenized_y = np.array(tokenized_pairs)[:, 1]

model = Sequential()
model.add(Dense(300, activation="relu", input_shape=(vocab_size,)))
# The output is still a softmax over the whole vocabulary. The only difference
# from the one-hot model is that sparse_categorical_crossentropy accepts the
# targets as integer class ids instead of one-hot vectors.
model.add(Dense(vocab_size, activation="softmax"))
model.summary()

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(one_hot_x, tokenized_y, shuffle=True, epochs=5, batch_size=256)

# Test trained model
sample_rows = [0, 100, 200, 300]
expected = indexToWord(tokenized_y[sample_rows])

pred = model.predict(one_hot_x[sample_rows])
pred_tokenized = [np.argmax(p) for p in pred]
pred_words = indexToWord(pred_tokenized)

print("Expected:", expected)
print("Predicted:", pred_words)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_88 (Dense)             (None, 300)               1350600   
_________________________________________________________________
dense_89 (Dense)             (None, 4501)              1354801   
Total params: 2,705,401
Trainable params: 2,705,401
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Expected: ['equilibrar', 'infortunio', 'consciente', 'apariencia']
Predicted: ['equilibrar', 'infortunio', 'consciente', 'apariencia']


This performs very similarly to the One Hot - One Hot model

## in: Embeddings - out: Embeddings

In [125]:
# Model
# Vocabulary size for both the embedding table and the softmax layer, instead
# of a hard-coded 4501. The +1 is needed because the tokenizer numbers words
# starting at 1, so the largest token id equals the number of words.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(vocab_size, 200, input_length=1)) # the input shape is the same, but we transform it to an embedding before passing it further
model.add(Reshape((200,)))
model.add(Dense(300, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))
model.summary()

# Build inputs and integer targets here so the cell does not rely on
# variables left behind by previously executed cells.
tokenized_x = np.array(tokenized_pairs)[:, 0]
tokenized_y = np.array(tokenized_pairs)[:, 1]

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(tokenized_x, tokenized_y, epochs=15, batch_size=256)

## Test trained model
sample_rows = [0, 100, 200, 300]
expected = indexToWord(tokenized_y[sample_rows])

pred = model.predict(tokenized_x[sample_rows])
pred_tokenized = [np.argmax(p) for p in pred]
pred_words = indexToWord(pred_tokenized)

print("Expected:", expected)
print("Predicted:", pred_words)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 1, 200)            900200    
_________________________________________________________________
reshape_16 (Reshape)         (None, 200)               0         
_________________________________________________________________
dense_96 (Dense)             (None, 300)               60300     
_________________________________________________________________
dense_97 (Dense)             (None, 4501)              1354801   
Total params: 2,315,301
Trainable params: 2,315,301
Non-trainable params: 0
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Expected: ['equilibrar', 'infortunio', 'consciente', 'apariencia']
Predicted: ['equilibrar', 

This is definitely the best model, as we achieved 82% accuracy after only 5 epochs

In [138]:
# Ask the trained model for the synonym of a single word.
word = 'precisión'
# The Tokenizer lower-cases words by default when it builds its vocabulary,
# so the query is normalised the same way before the lookup.
word_index = tokenizer.word_index.get(word.lower())
if word_index is None:
    # Same exception type as a plain dict lookup, but with a readable message.
    raise KeyError("%r is not in the tokenizer vocabulary" % word)
synonim = model.predict(np.array([word_index]))[0]
synonim = indexToWord([np.argmax(synonim)])

print(word, " ", synonim)

precisión   ['exactitud']
