LEARNING WORD EMBEDDINGS WITH THE EMBEDDING LAYER

In [1]:
from keras.datasets import imdb
from keras import preprocessing
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
import pandas as pd
import numpy as np

In [2]:
# Load the IMDB review dataset as integer-encoded sequences; `num_words`
# caps the vocabulary at 10000 word indices.
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=10000,
)

In [3]:
# Turn the variable-length lists of word indices into one 2-D integer array
# of shape (samples, maxlen).
# NOTE: with pad_sequences' default `truncating='pre'` / `padding='pre'`,
# reviews longer than 20 tokens keep only their LAST 20 tokens, and shorter
# reviews are left-padded with zeros.
x_train, x_test = (
    pad_sequences(sequences, maxlen=20) for sequences in (x_train, x_test)
)

In [4]:
# Display the padded training array (NumPy abbreviates large arrays).
x_train

array([[  65,   16,   38, ...,   19,  178,   32],
       [  23,    4, 1690, ...,   16,  145,   95],
       [1352,   13,  191, ...,    7,  129,  113],
       ...,
       [  11, 1818, 7561, ...,    4, 3586,    2],
       [  92,  401,  728, ...,   12,    9,   23],
       [ 764,   40,    4, ...,  204,  131,    9]])

In [5]:
# Sanity check: one row per review, one column per kept token position.
x_train.shape

(25000, 20)

In [6]:
# Small binary classifier on top of a learned word embedding.
model = Sequential([
    # 10000-index vocabulary -> 8-dimensional vectors, for 20-token inputs;
    # the output per sample has shape (20, 8).
    Embedding(10000, 8, input_length=20),
    # Collapse the (20, 8) embedding matrix into a single 160-value vector.
    Flatten(),
    # One sigmoid unit: a score in (0, 1) for the binary label.
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [7]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 8)             80000     
                                                                 
 flatten (Flatten)           (None, 160)               0         
                                                                 
 dense (Dense)               (None, 1)                 161       
                                                                 
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Train for 10 epochs in mini-batches of 32; `validation_split=0.2` sets
# aside 20% of the training data for validation. The returned object is
# kept in `history`.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [9]:
# Score the trained model on the held-out test set; the result lists the
# loss followed by the compiled 'acc' metric.
model.evaluate(x=x_test, y=y_test)



[0.5298969149589539, 0.750760018825531]

--------------------------

Predicting on the test set

In [10]:
# Run the model over every test review; each row of the result holds the
# single sigmoid output for that review.
predicted_test = model.predict(x=x_test)



In [11]:
# Display the raw model outputs (one float32 score per test review).
predicted_test

array([[0.3148501 ],
       [0.79632163],
       [0.18995926],
       ...,
       [0.07835809],
       [0.28157496],
       [0.5552151 ]], dtype=float32)

In [12]:
# Start a comparison table with the true test labels in a single column.
a = pd.DataFrame({'test_real': y_test})

In [13]:
# Attach the model's scores as a plain 1-D column. `predicted_test` comes
# back from the model as one score per row in a single column, so
# flattening it is enough; no intermediate DataFrame is needed.
a['test_predicted'] = predicted_test.ravel()

# Binarise with a 0.5 decision threshold (score < 0.5 -> 0, otherwise 1).
# The comparison is vectorised and reuses `a`'s own index, instead of
# mapping a Python lambda over every element and re-wrapping in a new Series.
a['test_predicted_binary'] = (a['test_predicted'] >= 0.5).astype('int64')

In [14]:
# Display true labels next to the raw and binarised predictions.
a

Unnamed: 0,test_real,test_predicted,test_predicted_binary
0,0,0.314850,0
1,1,0.796322,1
2,1,0.189959,0
3,0,0.576364,1
4,1,0.997197,1
...,...,...,...
24995,1,0.983962,1
24996,1,0.676212,1
24997,0,0.078358,0
24998,0,0.281575,0


In [15]:
# Tally how many binarised predictions match the true labels
# (False = misprediction, True = correct prediction).
# In the recorded run: 6231 mispredictions and 18769 correct predictions.
np.unique(
    np.array(a['test_predicted_binary'] == a['test_real']),
    return_counts=True,
)

(array([False,  True]), array([ 6231, 18769], dtype=int64))