Reference link for using Word Embeddings from genesim library - https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

Handling Out-of-Vocabulary Words in Natural Language Processing based on Context - Better way than the one I followed below - [Link](https://medium.com/@shabeelkandi/handling-out-of-vocabulary-words-in-natural-language-processing-based-on-context-4bbba16214d5)

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

## Importing Google's Word2Vec: (300 dimensional)

In [79]:
from gensim.models import KeyedVectors 
filename = r'C:\Users\Monish Kumar\Python projects\# Natural Language Proccessing\GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(filename, binary=True)

## Training custom Word2Vec on imdb dataset: 300 dimensional
#### Attributes of genism Word2Vec:
- size: (default 100) The number of dimensions of the embedding, e.g. the length of the dense vector to represent each token (word).
- window: (default 5) The maximum distance between a target word and words around the target word.
- min_count: (default 5) The minimum count of words to consider when training the model; words with an occurrence less than this count will be ignored.
- workers: (default 3) The number of threads to use while training.
- sg: (default 0 or CBOW) The training algorithm, either CBOW (0) or skip gram (1).
- negative = 10 - for negative sampling


In [40]:
from gensim.models import Word2Vec
from keras.datasets import imdb

# Loading the imdb dataset
vocabulary_size = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
print('Loaded imdb dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

# Converting the words from numbers to words
word_index = imdb.get_word_index()
id2word = {i: word for word, i in word_index.items()}
imdb_dataset = pd.Series(X_train).apply(lambda x: [id2word.get(i) for i in x ]) 

# Train our custom model on the imdb dataset
embedding_size=300
word2vec_custom = Word2Vec(imdb_dataset, min_count = 1, size = embedding_size, window = 5)

# Saving the model
word2vec_custom.save('word2vec_custom_imdb.bin')


Loaded dataset with 25000 training samples, 25000 test samples


In [None]:
# This word (diane keaton) is present in imdb dataset but not in Googles's Word2Vec
word2vec_imdb['keaton']

## Defining the LSTM model:

In [None]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

def train_lstm(X_train, y_train, embedding_matrix, embedding_trainable, embedding_size=300, batch_size = 64, num_epochs = 3):
    
    embedding_size=300 

    model=Sequential()
    model.add(Embedding(vocabulary_size + 1, embedding_size, 
                        weights=[embedding_matrix], input_length=max_words, 
                        trainable = embedding_trainable))
    model.add(LSTM(100))
    # model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    print(model.summary())

    batch_size = 64
    num_epochs = 3

    from sklearn.model_selection import train_test_split
    X_train2, X_valid, y_train2, y_valid = train_test_split(X_train,y_train, test_size = 0.33, random_state = 123)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train2, y_train2, validation_data=(X_valid, y_valid), batch_size=batch_size, epochs=num_epochs)

## Loading and padding the dataset:

In [64]:
from keras.datasets import imdb
vocabulary_size = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

Loaded dataset with 25000 training samples, 25000 test samples


In [41]:
word_index = imdb.get_word_index()
id2word = {i: word for word, i in word_index.items()}
print("'Review':",[id2word.get(i, ' ') for i in X_train[0]])
print("\n'Label':",y_train[0])

'Review': ['the', 'as', 'you', 'with', 'out', 'themselves', 'powerful', 'lets', 'loves', 'their', 'becomes', 'reaching', 'had', 'journalist', 'of', 'lot', 'from', 'anyone', 'to', 'have', 'after', 'out', 'atmosphere', 'never', 'more', 'room', 'and', 'it', 'so', 'heart', 'shows', 'to', 'years', 'of', 'every', 'never', 'going', 'and', 'help', 'moments', 'or', 'of', 'every', 'chest', 'visual', 'movie', 'except', 'her', 'was', 'several', 'of', 'enough', 'more', 'with', 'is', 'now', 'current', 'film', 'as', 'you', 'of', 'mine', 'potentially', 'unfortunately', 'of', 'you', 'than', 'him', 'that', 'with', 'out', 'themselves', 'her', 'get', 'for', 'was', 'camp', 'of', 'you', 'movie', 'sometimes', 'movie', 'that', 'with', 'scary', 'but', 'and', 'to', 'story', 'wonderful', 'that', 'in', 'seeing', 'in', 'character', 'to', 'of', '70s', 'musicians', 'with', 'heart', 'had', 'shadows', 'they', 'of', 'here', 'that', 'with', 'her', 'serious', 'to', 'have', 'does', 'when', 'from', 'why', 'what', 'have', '

In [5]:
print('Maximum review length: {}'.format(len(max((X_train + X_test), key=len))))
print('Minimum review length: {}'.format(len(min((X_train + X_test), key=len))))

Maximum review length: 2697
Minimum review length: 70


In [55]:
from keras.preprocessing.sequence import pad_sequences
max_words = 1000

# By default Pre-Sequence Truncation is followed - if you want to use Post-Sequence Truncation use truncating='post'
# By default Pre-Sequence Padding is followed - if you want to use Post Padding use padding='post'
X_train = pad_sequences(X_train, maxlen=max_words)
X_test = pad_sequences(X_test, maxlen=max_words)

## Method 1: Not removing the words not available in the dictionary

In [70]:
# Using a vector of zeros for every missing word
embedding_dim = len(word2vec['hi']) # 300!

word_index = imdb.get_word_index()
embedding_matrix = np.zeros((vocabulary_size + 1, embedding_dim))
id2word = {i: word for word, i in word_index.items()}

count = 0
for i in range(1,vocabulary_size + 1):
    word = id2word.get(i)
    try: 
        embedding_matrix[i] = word2vec[word]
    except:  # if word not present in the dictionary - leave the row of the embedding matrix to be empty
        count +=1
        
print('No. of words not available in the dictionary:', count)

No. of words not available in the dictionary: 714


### Training LSTM model:

In [69]:
train_lstm(X_train, y_train, embedding_matrix = embedding_matrix, embedding_trainable = True)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1000, 300)         3000300   
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 101       
Total params: 3,160,801
Trainable params: 3,160,801
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 16750 samples, validate on 8250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x235fe59db08>

In [70]:
train_lstm(X_train, y_train, embedding_matrix = embedding_matrix, embedding_trainable = False)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 1000, 300)         3000300   
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 3,160,801
Trainable params: 160,501
Non-trainable params: 3,000,300
_________________________________________________________________
None
Train on 16750 samples, validate on 8250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x2348f3277c8>

## Method 2: Removing the words not available in the dictionary

In [2]:
import pandas as pd
import numpy as np

from keras.datasets import imdb
vocabulary_size = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = vocabulary_size)
print('Loaded dataset with {} training samples, {} test samples'.format(len(X_train), len(X_test)))

Using TensorFlow backend.


Loaded dataset with 25000 training samples, 25000 test samples


In [None]:
# Taking a hell of a time (infinite) to process

word_index = imdb.get_word_index()
id2word = {i: word for word, i in word_index.items()}

word2vec_vocab = list(word2vec.vocab.keys())
X_removed = pd.Series(X_train).apply(lambda x: [id2word.get(i) for i in x if id2word.get(i) in word2vec_vocab])

## Method 3: Using the custom embeddings (imdb dataset) for unknown words

In [78]:
# Loading our custom model(embedding) trained on imdb dataset
word2vec_imdb = Word2Vec.load('word2vec_custom_imdb.bin')

embedding_dim = 300
word_index = imdb.get_word_index()
embedding_matrix = np.zeros((vocabulary_size + 1, embedding_dim))
id2word = {i: word for word, i in word_index.items()}

for i in range(1,vocabulary_size + 1):
    word = id2word.get(i)
    try: 
        embedding_matrix[i] = word2vec[word]
    except:  
        try:
            # if word not present in the dictionary - use custom trained embedding
            embedding_matrix[i] = word2vec_imdb[word]
        except:
            print('Why is this word not found in imdb dataset?','Index:',i,', Word:',word)

Why is this word not found in imdb dataset? Index: 3 , Word: a


In [75]:
train_lstm(X_train, y_train, embedding_matrix = embedding_matrix, embedding_trainable = True)

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 1000, 300)         3000300   
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 3,160,801
Trainable params: 3,160,801
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 16750 samples, validate on 8250 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


## Doubts:

In [None]:
# word2vec['< UNK >'] - train them
# model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))