In [3]:
import tensorflow as tf

In [4]:
tf.__version__

'1.0.1'

In [5]:
import requests
import json
#import imdb
import time
import itertools
import wget
import os
import pickle

import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import precision_recall_curve
import scipy

sns.set_style('white')

import tensorflow as tf
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.models import load_model

# image processing 
from PIL import Image
from keras.preprocessing.image import ImageDataGenerator
from scipy.misc import imresize
import matplotlib.image as mpimg
import io
import urllib

Using TensorFlow backend.


In [6]:
keras.__version__

'2.0.2'

In [7]:
import warnings
# Check for a GPU
if not tf.test.gpu_device_name():
    warnings.warn('No GPU found. Please use a GPU to train your neural network.')
else:
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

Default GPU Device: /gpu:0


# Load Data

In [8]:
############# RUNNING LOCALLY: Read in data FROM FILE ################

# train = pd.read_csv('../data/train_data_with_sampling.csv')
# test = pd.read_csv('../data/test_data.csv')

############# FOR AWS: Read in data FROM GITHUB ################

train = pd.read_csv('https://raw.githubusercontent.com/nikhilaravi/movies/master/data/train_data_with_sampling.csv')
test = pd.read_csv('https://raw.githubusercontent.com/nikhilaravi/movies/master/data/test_data.csv')

############# Convert to test and train set ################

X_train_ids = train[['tmdb_id']].values
y_train = train[['group1', 'group2', 'group3', 'group4', 'group5', 'group6', 'group7']].as_matrix()
X_test_ids = test[['tmdb_id']].values
y_test = test[['group1', 'group2', 'group3', 'group4', 'group5', 'group6', 'group7']].as_matrix()

############## Function to read poster image from github given tmdb id ###################

def loadImageFromId(tmdb_id):
    url = 'https://raw.githubusercontent.com/nikhilaravi/movies/master/posters/' + tmdb_id + '.jpg'
    file = io.BytesIO(urllib.request.urlopen(url).read())
    img = Image.open(file)
    return img

############# Load in posters ################

def load_posters_from_ids(id_array, n, y):
    
    ### Use these three lines to subset the training and testing data when running locally (can't load 7000 images)
    
    # start = np.random.choice(range(len(id_array)-n))
    # end = start + n
    # Y = y[:n]
    Y = y
    posters = []
    ids = []
    errors = 0
    
    for poster in id_array:
        
        # uncomment this line if running on AWS 
        img = loadImageFromId(str(int(poster[0])))
        
        # uncomment this line if running locally
#         img = Image.open('../posters/' + str(int(poster[0])) + '.jpg') 
        
        try:
            # original scale 500 x 741 but need to reduce resolution 
            scaled = img.resize(size=(100, int(741/5)))
            posters.append(np.asarray(scaled))
            ids.append(poster)
            
        except ValueError:
            errors +=1
            continue
            
    posters_array = np.asarray(posters)
    X = np.array(posters_array)
    
    print ('posters shape: ', X.shape)
    print ('errors: ', errors)
    
    # rescale pixel values
    
    return X/255, Y

In [9]:
#### load testing and training data 
X_train, Y_train = load_posters_from_ids(X_train_ids, 500, y_train)
X_test, Y_test = load_posters_from_ids(X_test_ids, 100, y_test)

posters shape:  (7814, 148, 100, 3)
errors:  0
posters shape:  (3700, 148, 100, 3)
errors:  0


# CNN from scratch

First we build a basic CNN from scratch.

In [9]:
# Build and compile CNN

from keras.callbacks import TensorBoard

log_dir_string = "cv"

model = Sequential()
model.add(Conv2D(32, (3,3), input_shape=(X_train.shape[1], X_train.shape[2], 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(3,3)))
log_dir_string += "_1conv"

model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
log_dir_string += "_2conv"

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
log_dir_string += "_3conv"

model.add(Flatten())
model.add(Dense(units=64, kernel_initializer='uniform', activation='relu'))
log_dir_string += "_1fc"

model.add(Dense(units=7, kernel_initializer='uniform', activation='sigmoid'))

model.compile(loss='categorical_crossentropy',
             optimizer='adam')

tensorboard = TensorBoard(log_dir='./logs/cv/'+log_dir_string, histogram_freq=1, write_graph=True, write_images=False)

In [None]:
# Fit model
batch_size = 16
epochs = 20

model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[tensorboard])

# SCP to local
# scp -i /Users/leonardloo/.ssh/deep-learning.pem -r ubuntu@ec2-54-86-80-43.compute-1.amazonaws.com:/home/ubuntu/notebooks/logs /Users/leonardloo/Desktop/Harvard\ Classes/Spring\ 2017/AC209b/movies/logs

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

In [20]:
# Define three different error measures to compare our models

from sklearn.metrics import hamming_loss

def error_measures(ypred, ytest):
    ypred = np.array(ypred)
    ytest = np.array(ytest)
    
    # Hamming loss
    h_loss = hamming_loss(ytest, ypred)

    # Percent exact matches
    y_pred_str = np.array([str(yi) for yi in ypred])
    y_test_str = np.array([str(yi) for yi in ytest])
    percent_exact = np.sum(y_pred_str == y_test_str) * 1. / ytest.shape[0]
    
    # Percent at least one match (at least one of the genres are both 1)
    atleastone_count = 0
    for ind in range(len(ypred)):
        yi_pred = ypred[ind]
        yi_test = ytest[ind]
        for i in range(len(yi_pred)):
            if yi_pred[i] == 1 and yi_test[i] == 1:
                atleastone_count += 1
                break
    percent_atleastone = atleastone_count * 1. / ytest.shape[0]
    
    return h_loss, percent_exact, percent_atleastone

In [15]:
# Make predictions on the test set
y_pred_proba = model.predict(X_test, batch_size=batch_size)

# Predict after casting probabilities to labels
y_pred = []
for yi in y_pred_proba:
    yi_new = np.array([1 if yii >= 0.5 else 0 for yii in yi])
    y_pred.append(yi_new)
y_pred = np.array(y_pred)

# compute error measures

h_loss, percent_exact, percent_atleastone = error_measures(y_pred, Y_test)
print ('CNN error measures \n=====================\n')
print ('Hamming Loss: ', h_loss)
print ('Percent exact: ', percent_exact)
print ('Percent at least one: ', percent_atleastone)

CNN error measures 

Hamming Loss:  0.15355212355212355
Percent exact:  0.438648648649
Percent at least one:  0.9481081081081081


We can use grid search to fine-tune our CNN from scratch.

In [None]:
# Let's use grid search to fine tune our network

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import TensorBoard
from keras import metrics

def build_classifier(optimizer, n_features):
    log_dir_string = "cv" + str(optimizer) + str(n_features)
    model = Sequential()
    model.add(Conv2D(n_features, (3,3), input_shape=(X_train.shape[1], X_train.shape[2], 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(3,3)))
    model.add(Dropout(0.2))

    model.add(Conv2D(n_features, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(n_features*2, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2,2)))
    model.add(Dropout(0.2))

    model.add(Flatten())
    model.add(Dense(units=n_features*2, kernel_initializer='uniform', activation='relu'))

    model.add(Dense(units=7, kernel_initializer='uniform', activation='sigmoid'))

    model.compile(loss='categorical_crossentropy',
                 optimizer=optimizer, metrics = [metrics.mae, metrics.categorical_accuracy])
    
    tensorboard = TensorBoard(log_dir='./logs/cv/'+log_dir_string, histogram_freq=1, write_graph=True, write_images=False)
    return model

classifier = KerasClassifier(build_fn=build_classifier)
parameters = {'batch_size': [16,32],
              'epochs': [20,50],
              'optimizer': ['adam','rmsprop'],
              'n_features': [64,128]}
grid_search = GridSearchCV(estimator=classifier, param_grid=parameters, scoring='accuracy', cv=4, verbose=False)
grid_search = grid_search.fit(X_train, Y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20

Now we use our better params to tune our CNN:

In [17]:
# Build and compile tuned CNN

from keras.callbacks import TensorBoard

log_dir_string = "cv-tuned"

model = Sequential()
model.add(Conv2D(64, (3,3), input_shape=(X_train.shape[1], X_train.shape[2], 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(3,3)))
model.add(Dropout(0.2))
log_dir_string += "_1conv"

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))
log_dir_string += "_2conv"

model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Dropout(0.2))
log_dir_string += "_3conv"

model.add(Flatten())
model.add(Dense(units=128, kernel_initializer='uniform', activation='relu'))
log_dir_string += "_1fc"
model.add(Dropout(0.2))
model.add(Dense(units=7, kernel_initializer='uniform', activation='sigmoid'))

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', metrics = [metrics.mae, metrics.categorical_accuracy])

tensorboard = TensorBoard(log_dir='./logs/cv/'+log_dir_string, histogram_freq=1, write_graph=True, write_images=False)

In [None]:
# Fit model, also using image augmentation
datagen = ImageDataGenerator(rescale=1./255,
                             shear_range = 0.2,
                             zoom_range = 0.2,
                             horizontal_flip = True)
batch_size = 16
epochs = 50

history = model.fit_generator(datagen.flow(X_train, Y_train, batch_size=batch_size),
                    steps_per_epoch=(Y_train.shape[0])/batch_size,
                    epochs=epochs,
                    verbose=1, callbacks=[tensorboard])

# SCP to local
# scp -i /Users/leonardloo/.ssh/deep-learning.pem -r ubuntu@ec2-54-86-80-43.compute-1.amazonaws.com:/home/ubuntu/notebooks/logs /Users/leonardloo/Desktop/Harvard\ Classes/Spring\ 2017/AC209b/movies/logs

INFO:tensorflow:Summary name conv2d_7/kernel:0 is illegal; using conv2d_7/kernel_0 instead.
INFO:tensorflow:Summary name conv2d_7/bias:0 is illegal; using conv2d_7/bias_0 instead.
INFO:tensorflow:Summary name conv2d_8/kernel:0 is illegal; using conv2d_8/kernel_0 instead.
INFO:tensorflow:Summary name conv2d_8/bias:0 is illegal; using conv2d_8/bias_0 instead.
INFO:tensorflow:Summary name conv2d_9/kernel:0 is illegal; using conv2d_9/kernel_0 instead.
INFO:tensorflow:Summary name conv2d_9/bias:0 is illegal; using conv2d_9/bias_0 instead.
INFO:tensorflow:Summary name dense_5/kernel:0 is illegal; using dense_5/kernel_0 instead.
INFO:tensorflow:Summary name dense_5/bias:0 is illegal; using dense_5/bias_0 instead.
INFO:tensorflow:Summary name dense_6/kernel:0 is illegal; using dense_6/kernel_0 instead.
INFO:tensorflow:Summary name dense_6/bias:0 is illegal; using dense_6/bias_0 instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/

In [21]:
# Make predictions on the test set
y_pred_proba = model.predict(X_test, batch_size=batch_size)

# Predict after casting probabilities to labels
y_pred = []
for yi in y_pred_proba:
    yi_new = np.array([1 if yii >= 0.5 else 0 for yii in yi])
    y_pred.append(yi_new)
y_pred = np.array(y_pred)

# compute error measures

h_loss, percent_exact, percent_atleastone = error_measures(y_pred, Y_test)
print ('CNN error measures \n=====================\n')
print ('Hamming Loss: ', h_loss)
print ('Percent exact: ', percent_exact)
print ('Percent at least one: ', percent_atleastone)

CNN error measures 

Hamming Loss:  0.3519305019305019
Percent exact:  0.00567567567568
Percent at least one:  0.9972972972972973
