In [1]:
import pandas as pd
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D
from keras.layers.merge import Concatenate
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
BECHDEL_PATH = '../data/movies.csv'
IMDB_PATH = '../data/imdb_data.json'

movies_df = pd.read_csv(BECHDEL_PATH, delimiter=',')
movies_df = movies_df[['imdb','year', 'title', 'budget', 'intgross', 'binary']]

movies_df['binary'] = movies_df['binary'].str.replace('FAIL', '0')
movies_df['binary'] = movies_df['binary'].str.replace('PASS', '1')
movies_df['binary'] = movies_df['binary'].astype('int64')

imdb_df = pd.read_json(IMDB_PATH)
imdb_df = imdb_df[imdb_df['Type'] == 'movie']

dataset_df =  imdb_df.join(movies_df, how='inner')
print(", ".join([k for k in dataset_df.keys()]))

Actors, Awards, Country, Director, Error, Genre, Language, Metascore, Plot, Poster, Rated, Released, Response, Runtime, Title, Type, Writer, Year, imdbID, imdbRating, imdbVotes, imdb, year, title, budget, intgross, binary


In [3]:
dataset_df.head()

Unnamed: 0,Actors,Awards,Country,Director,Error,Genre,Language,Metascore,Plot,Poster,...,Year,imdbID,imdbRating,imdbVotes,imdb,year,title,budget,intgross,binary
0,"Jennifer Shirley, Blake Woodruff, Michael Rook...",,"USA, Canada",Stewart Hendler,,"Crime, Drama, Horror",English,,Sinister things begin happening to kidnappers ...,http://ia.media-imdb.com/images/M/MV5BMTM1Njgw...,...,2007,tt0435528,5.8,6584,tt1711425,2013,21 &amp; Over,13000000,42195766.0,0
1,"Michel Piccoli, Jerzy Stuhr, Renato Scarpa, Fr...",9 wins & 12 nominations.,"Italy, France",Nanni Moretti,,"Comedy, Drama","Italian, German, Latin, English, Spanish, Poli...",64.0,A story centered on the relationship between t...,http://ia.media-imdb.com/images/M/MV5BMTQ4MjYz...,...,2011,tt1456472,6.8,8556,tt1343727,2012,Dredd 3D,45000000,40868994.0,1
2,"Aml Ameen, Red Madrell, Noel Clarke, Adam Deacon",2 wins & 1 nomination.,UK,Menhaj Huda,,Drama,English,,A day in the life of a group of troubled 15-ye...,http://ia.media-imdb.com/images/M/MV5BMzg2Nzc2...,...,2006,tt0435680,6.8,12097,tt2024544,2013,12 Years a Slave,20000000,158607035.0,0
3,"Paget Brewster, Jeff Branson, Jess Weixler, Ra...",4 wins & 1 nomination.,USA,Ishai Setton,,Comedy,English,,A group of Connecticut locals enroll in an adu...,http://ia.media-imdb.com/images/M/MV5BMTg5OTQy...,...,2006,tt0460721,6.5,1209,tt1272878,2013,2 Guns,61000000,132493015.0,0
4,"Keira Knightley, Viggo Mortensen, Michael Fass...",Nominated for 1 Golden Globe. Another 17 wins ...,"UK, Germany, Canada, Switzerland",David Cronenberg,,"Biography, Drama, Thriller",English,76.0,A look at how the intense relationship between...,http://ia.media-imdb.com/images/M/MV5BMTU5Mjk3...,...,2011,tt1571222,6.5,61249,tt0453562,2013,42,40000000,95020213.0,0


In [4]:
dataset_df.shape

(1784, 27)

In [5]:
training = dataset_df.head(1500)
X_train = training['Plot'].tolist()
y_train = training['binary'].tolist()

testing = dataset_df.tail(284)
X_test = testing['Plot'].tolist()

y_test = testing['binary'].tolist()

### Pad the data
Each sentece has different lenghts. But our CNN models needs a fixed-size input. Hence, we include zeros at the ende so each sentence will have the same number of words.

In [6]:
def max_words(sentences):
    """ Return the maximum number of words in the dataset """
    max_num_words = -1
    for sentence in sentences:
        len_sentence = len(sentence)
        if len_sentence > max_num_words:
            max_num_words =len_sentence
    return max_num_words

def pad(sentences, max_sequence=None):
    """ Pad all the sententences in order to have sequences of same length """
    if not max_sequence:
        max_sequence = max_words(sentences)
        
    padded_dataset = []
    for sentence in sentences:
        sentence = sentence.split()
        len_padding = max_sequence - len(sentence)
        padded_sentence = sentence + ['</pad>'] * len_padding
        assert len(padded_sentence) == max_sequence
        padded_dataset.append(padded_sentence)
    
    assert len(padded_dataset) == len(sentences)
    return padded_dataset, max_sequence


In [7]:
train_pad, max_sequence = pad(X_train)
test_pad, _ = pad(X_test, max_sequence)

In [8]:
X_train[0]

'Sinister things begin happening to kidnappers who are holding a young boy for ransom in a remote cabin.'

In [9]:
' '.join(train_pad[0])

'Sinister things begin happening to kidnappers who are holding a young boy for ransom in a remote cabin. </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad> </pad>

### Create the lookup matrix and the vocabulary

In [10]:
from collections import Counter
from itertools import chain

def build_vocab(dataset, to_lower=False):
    """ Create a lookup table and a list with the vocabulary.

    Args:
        dataset (list): a matrix with the words from the dataset

    Returns:
        vocab_sorted (list): list of the words sorted by its frequency
        lookup (dict): a dictionary with the lookup table. The keys are
            the words and the values are the indexes.
    """
    # Count how many times a word appear in the dataset
    word_counts = Counter(chain(*dataset))
    # Create a list with the most common words sorted.
    # The position will be the index of the lookup table.
    vocab_sorted = []
    for word, _ in word_counts.most_common():
        if to_lower:
            word = word.lower()
        if word not in vocab_sorted:
            vocab_sorted.append(word)
    vocab_sorted.append('<oov>')
    # Create a lookup table using a dictionary. Map each index with a word
    lookup = {word: index for index, word in enumerate(vocab_sorted)}

    # TODO: Move this to the unittest
    assert len(list(lookup.keys())) == len(vocab_sorted)

    return vocab_sorted, lookup

In [11]:
vocab_sorted, lookup = build_vocab(train_pad)

In [12]:
print("Vocabulary Size: {:d}".format(len(vocab_sorted)))

Vocabulary Size: 10060


In [13]:
def to_indexes(sentences, vocabulary):
    """ Convert a list of sentences to its corresponding indices.

    Args:
        sentences: a list where each element is a lists of words (sentences)
        vocabulary: a dictionary of words and its corresponding index

    Returns:
        a list where each element is a list of indexes (sentences)

    """
    senteces_idx = []
    for sentence in sentences:
        aux_idx = []
        for word in sentence:
            # If the word is in the vocabulary get its index otherwise use the <oov> index
            if word in vocabulary:
                word_idx = vocabulary[word]
            else:
                word_idx = vocabulary['<oov>']
            aux_idx.append(word_idx)
        senteces_idx.append(aux_idx)
    return np.array(senteces_idx)

In [14]:
train_idx = to_indexes(train_pad, lookup)
test_idx = to_indexes(test_pad, lookup)

In [15]:
train_idx[0]

array([3209,  311,  635, 1793,    3, 3210,   16,   23,  958,    1,   25,
        100,   12, 3211,    6,    1,  312, 3212,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [16]:
# Model Hyperparameters
sequence_length = max_sequence
embedding_dim = 50
num_filters = 100
dropout_prob = (0.5,)
hidden_dims = (1024, 128)
filters_h = (3, 4, 5)

batch_size = 100
num_epochs = 10

output_classes =  1


In [17]:
def cnn_model_rand(sequence_length, embedding_dim, num_filters, filtes_h, 
              dropout_prob, hidden_dims, output_neurons, verbose=False):
    # Input
    embedding_weights = None
    input_shape = (sequence_length,)
    model_input = Input(shape=input_shape)
    embedding_input = Embedding(len(vocab_sorted), 
                                embedding_dim, 
                                input_length=sequence_length, 
                                weights=embedding_weights, name="embedding")(model_input)

    print("Embedding dim:", len(vocab_sorted), embedding_dim)
        
    # CNNs
    convs = []
    for fh in filtes_h:
        conv = Convolution1D(filters=num_filters,
                             kernel_size=fh,
                             padding='valid',
                             activation='relu',
                             strides=1)(embedding_input)
        pool = MaxPooling1D(pool_size=2)(conv)
        flatten = Flatten()(pool)
        convs.append(flatten)

    concat_layer = Concatenate(axis=-1)(convs)
    dropout_cnn = Dropout(dropout_prob[0])(concat_layer)

    h1 = Dense(hidden_dims[0], activation="relu")(dropout_cnn)
    h2 = Dense(hidden_dims[1], activation="relu")(h1)
    model_output = Dense(output_neurons, activation="sigmoid")(h2)

    model = Model(inputs=model_input, outputs=model_output)
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    if verbose:
        print(model.summary())
    return model

In [18]:
verbose = True
model = cnn_model_rand(sequence_length, embedding_dim, num_filters, filters_h, 
                         dropout_prob, hidden_dims, output_classes, verbose)

Embedding dim: 10060 50
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 386)           0                                            
____________________________________________________________________________________________________
embedding (Embedding)            (None, 386, 50)       503000      input_1[0][0]                    
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 384, 100)      15100       embedding[0][0]                  
____________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, 383, 100)      20100       embedding[0][0]                  
___________________________________________________________________

In [19]:
model.fit(train_idx, np.array(y_train), batch_size=batch_size, epochs=num_epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1261e6278>

In [20]:
loss, acc = model.evaluate(test_idx, np.array(y_test), verbose=0)
print(acc)

0.626760560862
