# Model trained using 300-dimensional pretrained FastText English word vectors released by [Facebook](https://www.kaggle.com/yekenot/fasttext-crawl-300d-2m).

In [1]:
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout,Embedding,Bidirectional
from keras.callbacks import ModelCheckpoint,ReduceLROnPlateau,EarlyStopping # warto zobaczyć w oryginalnym notebooku użycie dodatkowych callbacków
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
maxlen = 400
n_embeddings = 300
n_features = 5000

W kodzie istotnym obiektem był `tokenizer.word_index`, czyli mapowanie ze słów do id słowa zapisanego w sekwencjach zbioru danych. My też musimy uzyskać taki obiekt. W dokumentacji można było znaleźć funkcję https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/get_word_index, którą najłatwiej użyć wykorzystując gotowy fragment kodu skopiowany i uzupełniony poniżej:

In [3]:
start_char = 1
oov_char = 2
index_from = 3

from tensorflow.keras.datasets import imdb
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    start_char=start_char, oov_char=oov_char, index_from=index_from,
    num_words=5000, maxlen=maxlen, 
    path="/kaggle/input/keras-imdb/imdb.npz"
)

word_index = imdb.get_word_index(path="/kaggle/input/imdb-word-index/imdb_word_index.json")

# filtrowanie tylko pierwszych 5000 słów
word_index = {k: v for k,v in word_index.items() if v<n_features}

inverted_word_index = dict(
    (i + index_from, word) for (word, i) in word_index.items()
)
inverted_word_index[start_char] = "[START]"
inverted_word_index[oov_char] = "[OOV]"

In [4]:
# sprawdźmy, jakie są rozmiary word_index i inverted_word_index:
len(word_index), len(inverted_word_index)

(4999, 5001)

In [5]:
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

n_train = 3000
n_test = 1000
X_train = X_train[:n_train]
y_train = y_train[:n_train]
X_test = X_test[:n_test]
y_test = y_test[:n_test]

In [6]:
# test sieci z uczonymi embeddingami
model = Sequential()
model.add(Embedding(input_dim=n_features,output_dim=n_embeddings,input_length=maxlen))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(64, activation="tanh"))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
print(model.summary())

early_stopping = EarlyStopping(patience=3)
model.fit(X_train, y_train, epochs=100, callbacks=[early_stopping], validation_split=0.2)

model.evaluate(X_test, y_test)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 400, 300)          1500000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                85248     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,589,473
Trainable params: 1,589,473
Non-trainable params: 0
_________________________________________________________________
None


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2400 samples, validate on 600 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


[0.7020515403747558, 0.7929999828338623]

In [7]:
# Sprawdźmy, czy inverted_word_index, którego chcemy wykorzystać przy embeddingach poprawnie mapuje liczbę na słowo 
X_train[0][182:], [inverted_word_index[x] for x in X_train[0][182:]]

(array([   1,   14,   22,   16,   43,  530,  973, 1622, 1385,   65,  458,
        4468,   66, 3941,    4,  173,   36,  256,    5,   25,  100,   43,
         838,  112,   50,  670,    2,    9,   35,  480,  284,    5,  150,
           4,  172,  112,  167,    2,  336,  385,   39,    4,  172, 4536,
        1111,   17,  546,   38,   13,  447,    4,  192,   50,   16,    6,
         147, 2025,   19,   14,   22,    4, 1920, 4613,  469,    4,   22,
          71,   87,   12,   16,   43,  530,   38,   76,   15,   13, 1247,
           4,   22,   17,  515,   17,   12,   16,  626,   18,    2,    5,
          62,  386,   12,    8,  316,    8,  106,    5,    4, 2223,    2,
          16,  480,   66, 3785,   33,    4,  130,   12,   16,   38,  619,
           5,   25,  124,   51,   36,  135,   48,   25, 1415,   33,    6,
          22,   12,  215,   28,   77,   52,    5,   14,  407,   16,   82,
           2,    8,    4,  107,  117,    2,   15,  256,    4,    2,    7,
        3766,    5,  723,   36,   71, 

In [8]:
# z wykorzystywanego notebooka - wczytywanie gotowych embeddingów:
import numpy as np

embedding_path = "../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec"

def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embedding_index = dict(get_coefs(*o.strip().split(" ")) for o in open(embedding_path))

In [9]:
max(inverted_word_index.keys()), X_train.max()

(5002, 4999)

In [10]:
inverted_word_index[5000], inverted_word_index[5001], inverted_word_index[5002]

('resulting', 'spain', 'bergman')

In [11]:
# 3 słowa: ('resulting', 'spain', 'bergman') są nieużywane w X_train, bo X_train zawiera jedynie słowa o id równym 4999 i żadnych o większym id
# zatem można je wyrzucić z naszego inverted_word_index
inverted_word_index.pop(5000)
inverted_word_index.pop(5001)
inverted_word_index.pop(5002)

'bergman'

In [12]:
max(inverted_word_index.keys())

4999

In [13]:
word_index
nb_words = max(inverted_word_index.keys())
embedding_matrix = np.zeros((nb_words+1, n_embeddings))
for i, word in inverted_word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [14]:
model = Sequential()
# trainable=False - dzięki temu kod będzie trenował się szybciej, jest to jeden z głównych powodów używania gotowych embeddingów
model.add(Embedding(input_dim=n_features,output_dim=n_embeddings,input_length=maxlen,
                    weights = [embedding_matrix], trainable = False)) #using pre-trained embeddings
model.add(Bidirectional(LSTM(32)))
model.add(Dense(64, activation="tanh"))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
print(model.summary())

early_stopping = EarlyStopping(patience=3)
model.fit(X_train, y_train, epochs=100, callbacks=[early_stopping], validation_split=0.2)

model.evaluate(X_test, y_test)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 400, 300)          1500000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                85248     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 1,589,473
Trainable params: 89,473
Non-trainable params: 1,500,000
_________________________________________________________________
None
Train on 2400 samples, validate on 600 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


[0.36977984714508055, 0.8560000061988831]