# Loading the raw IMDB movie data.

In [1]:
import os
imdb_dir="aclImdb"
train_dir=os.path.join(imdb_dir,"train")

# Parallel lists: texts[k] is the raw text of one review file and labels[k]
# is 0 when it was read from the "neg" folder, 1 when read from "pos".
labels=[]
texts=[]
for label_value,label_type in enumerate(["neg","pos"]):
    dir_name=os.path.join(train_dir,label_type)
    # os.listdir returns entries in arbitrary order; sorting makes the
    # loaded order (and therefore texts[0]) the same on every run.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith(".txt"):
            continue
        # The context manager closes the file even if read() raises.
        with open(os.path.join(dir_name,fname),encoding="UTF-8") as f:
            texts.append(f.read())
        labels.append(label_value)
print(labels[0])
print(texts[0])

0
Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.


In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen=20        # every review is padded/truncated to this many tokens
max_words=10000  # vocabulary size handed to the tokenizer
shuffle_seed=42  # fixed seed so the shuffled order is reproducible

tokenizer=Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences=tokenizer.texts_to_sequences(texts)

data=pad_sequences(sequences,maxlen=maxlen)
labels=np.asarray(labels)

# The reviews were loaded grouped by label ("neg" first, then "pos"), so the
# samples are shuffled before training. A dedicated seeded generator is used
# so that re-running the notebook yields the same order without touching
# NumPy's global random state.
indexs=np.arange(data.shape[0])
np.random.RandomState(shuffle_seed).shuffle(indexs)
x_train=data[indexs]
y_train=labels[indexs]

Using TensorFlow backend.


# Load the GloVe vectors

In [4]:
glove_dir="glove.6B.100d"
# Maps each word (first field of a line) to its coefficient vector
# (remaining fields of that line) as a float32 array.
embedding_index={}
# The context manager guarantees the file is closed even if a line
# fails to parse part-way through the loop.
with open(os.path.join(glove_dir,"glove.6B.100d.txt"),encoding="UTF-8") as f:
    for line in f:
        values=line.split()
        word=values[0]
        # Convert the string fields to float32 numbers.
        coefs=np.asarray(values[1:],dtype="float32")
        embedding_index[word]=coefs
print("Found %s words. "%len(embedding_index))

Found 400000 words. 


In [5]:
embedding_dim=100

# Row k of the matrix receives the pretrained vector of the word whose
# tokenizer index is k. Rows for words missing from embedding_index, and
# rows never assigned, keep their initial zeros.
embedding_matrix=np.zeros((max_words,embedding_dim))
word_index=tokenizer.word_index
for token,token_id in word_index.items():
    vector=embedding_index.get(token)
    # Skip indices that fall outside the matrix and words without a vector.
    if token_id>=max_words or vector is None:
        continue
    embedding_matrix[token_id]=vector

# Define the network

In [6]:
from keras.models import Sequential
from keras.layers import Embedding,Flatten,Dense

# Embedding -> Flatten -> single sigmoid unit, declared as one layer list.
model=Sequential([
    Embedding(max_words,embedding_dim,input_length=maxlen),
    Flatten(),
    # Dense(32,activation="relu"),   # optional hidden layer, currently disabled
    Dense(1,activation="sigmoid"),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 20, 100)           1000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 2000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 2001      
Total params: 1,002,001
Trainable params: 1,002,001
Non-trainable params: 0
_________________________________________________________________


# Load the GloVe weights into the embedding layer and freeze it

In [7]:
# Copy the prepared embedding matrix into the model's first layer, then mark
# that layer as non-trainable.
embedding_layer=model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable=False

In [8]:
# Binary classification setup: sigmoid output paired with binary cross-entropy.
model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["acc"],
)

# Train for 10 epochs in batches of 32, holding out 20% of the samples
# for validation; the returned history object is kept for later inspection.
history=model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
