# Magnitude word embeddings + Keras LSTM

https://colab.research.google.com/drive/1lOcAhIffLW8XC6QsKzt5T_ZqPP4Y9eS4#scrollTo=eHiu14ba0Jkz

In [11]:
from ipynb.fs.full.data_loader import load_train_test_data
from ipynb.fs.full.utils import eval_keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GaussianNoise, LSTM, Bidirectional, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer

from pymagnitude import Magnitude, MagnitudeUtils
from nltk.tokenize import word_tokenize

from math import ceil
import numpy as np

from __future__ import print_function

---

In [12]:
MAX_WORDS = 50 # TODO make at least 50; The maximum number of words the sequence model will consider
STD_DEV = 0.01 # Deviation of noise for Gaussian Noise applied to the embeddings
HIDDEN_UNITS = 100 # The number of hidden units from the LSTM
DROPOUT_RATIO = .8 # The ratio to dropout
BATCH_SIZE = 100 # The number of examples per train/validation step
EPOCHS = 50 # The number of times to repeat through all of the training data
LEARNING_RATE = .01 # The learning rate for the optimizer

vectors = Magnitude('~/.magnitude/wiki-news-300d-1M.magnitude', pad_to_length = MAX_WORDS)

In [13]:
train_X, test_X, train_y, test_y = load_train_test_data()

In [14]:
# tokenizer = CountVectorizer(stop_words="english").build_analyzer()
# train_X_tokenized = [tokenizer(review) for review in train_X]

In [15]:
def create_model_accuracy():
    model = Sequential()

    model.add(GaussianNoise(STD_DEV, input_shape=(MAX_WORDS, vectors.dim)))
    model.add(Bidirectional(LSTM(HIDDEN_UNITS, activation='tanh'), merge_mode='concat'))
    model.add(Dropout(DROPOUT_RATIO))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=LEARNING_RATE),
        metrics=['accuracy'])
    
    return model

In [16]:
# TODO how does handle 3D input from magnitude (each word is list)?
# vectors.query(["cat", "dog"])[1]

def create_model_categorical_accuracy():
    model = Sequential()

    model.add(GaussianNoise(STD_DEV, input_shape=(MAX_WORDS, vectors.dim)))
    model.add(Bidirectional(LSTM(HIDDEN_UNITS, activation='tanh'), merge_mode='concat'))
    model.add(Dropout(DROPOUT_RATIO))
    model.add(Dense(2, activation='softmax'))
    
    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(lr=LEARNING_RATE),
        metrics=['categorical_accuracy'])
    
    return model

In [17]:
def embed(df):
    tokenized = [word_tokenize(line) for line in df]
    embeddings = vectors.query(tokenized)
    return embeddings

In [18]:
model_acc = create_model_accuracy()
model_acc.fit(embed(train_X), train_y, epochs=5, batch_size=32, validation_split=0.1)
eval_keras(model_acc, embed(test_X), test_y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.1629543900489807
Test accuracy: 0.9488818049430847


In [19]:
model_acc.predict(embed(["sharika https://"]))

array([[0.9153353]], dtype=float32)

In [20]:
model_cat_acc = create_model_categorical_accuracy()
model_cat_acc.fit(embed(train_X), MagnitudeUtils.to_categorical(train_y, 2), epochs=5, batch_size=32, validation_split=0.1)
eval_keras(model_cat_acc, embed(test_X), MagnitudeUtils.to_categorical(test_y, 2))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.1688094437122345
Test accuracy: 0.9297124743461609


In [21]:
model_cat_acc.predict(embed(["sharika https://"]))

array([[0.00447831, 0.99552166]], dtype=float32)

In [22]:
# TODO weird accuracy
def create_model3():
    model = Sequential()

    model.add(GaussianNoise(STD_DEV, input_shape=(MAX_WORDS, vectors.dim)))
    model.add(Bidirectional(LSTM(HIDDEN_UNITS, activation='tanh'), merge_mode='concat'))
    model.add(Dropout(DROPOUT_RATIO))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=LEARNING_RATE),
        metrics=['categorical_accuracy'])
    
    return model
    
    return model

In [23]:
model3 = create_model3()
model3.fit(embed(train_X), train_y, epochs=5, batch_size=32, validation_split=0.1)
eval_keras(model3, embed(test_X), test_y)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test loss: 0.22809937596321106
Test accuracy: 1.0


In [24]:
model3.predict(embed(test_X[:10]))

array([[0.99906766],
       [0.0307785 ],
       [0.5196377 ],
       [0.28241655],
       [0.16814643],
       [0.06639221],
       [0.999691  ],
       [0.9733412 ],
       [0.99023247],
       [0.00595078]], dtype=float32)