In [6]:
from __future__ import print_function

import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dropout, Dense, Activation, Conv2D, MaxPooling2D, Flatten
from keras.optimizers import SGD

from keras import backend as K

# vggnet pretrained model 
from keras import applications

import pandas as pd
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')

import numpy as np
from PIL import Image
from keras.preprocessing.image import ImageDataGenerator
from scipy.misc import imresize
import matplotlib.image as mpimg

In [7]:
############# Read in data ################

train = pd.read_csv('../data/train_data_with_sampling.csv')
test = pd.read_csv('../data/test_data.csv')

############# Convert to test and train set ################

# The seven label columns used as prediction targets; listed once so the
# train and test selections cannot drift apart.
LABEL_COLUMNS = ['group1', 'group2', 'group3', 'group4', 'group5', 'group6', 'group7']

# Double brackets keep the ids as an (n_rows, 1) column; load_posters_from_ids
# reads the id of each row as poster[0].
X_train_ids = train[['tmdb_id']].values
# .values replaces DataFrame.as_matrix(), which is deprecated and removed in
# pandas 1.0; it returns the same NumPy array and matches the id lines above.
y_train = train[LABEL_COLUMNS].values
X_test_ids = test[['tmdb_id']].values
y_test = test[LABEL_COLUMNS].values

############# Function to convert color to grayscale ################

def rgb2gray(rgb):
    """Collapse the colour channels of an image array into one grayscale value.

    The first three entries along the last axis are combined as a weighted
    sum with weights 0.299, 0.587 and 0.114; any further entries on that axis
    (for example a fourth channel) are ignored.

    Parameters
    ----------
    rgb : numpy.ndarray
        Array whose last axis holds at least three channel values.

    Returns
    -------
    numpy.ndarray
        Array with the last axis removed, holding the weighted sums.
    """
    channel_weights = [0.299, 0.587, 0.114]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, channel_weights)

############# Read in posters for training set ################

def load_posters_from_ids(id_array, n, y):
    """Load a random contiguous window of ``n`` posters with their label rows.

    A start offset is drawn at random, the posters for
    ``id_array[start:start + n]`` are read from ``../posters/<id>.jpg`` and
    resized to 500x741 (width x height), and the label rows at the same
    positions of ``y`` are returned alongside them.

    Parameters
    ----------
    id_array : array-like of shape (num_rows, 1)
        Poster ids; only ``row[0]`` of each row is used to build the file name.
    n : int
        Number of consecutive rows to load.
    y : array-like
        Label rows, assumed to be row-aligned with ``id_array``.

    Returns
    -------
    X : numpy.ndarray
        Stacked poster arrays, one per poster that was resized successfully.
    Y : numpy.ndarray
        The rows of ``y`` belonging to exactly the posters in ``X``.

    Raises
    ------
    ValueError
        If ``n`` is larger than ``len(id_array)``.
    """
    if n > len(id_array):
        raise ValueError('cannot load %d posters from %d ids' % (n, len(id_array)))

    # randint's upper bound is exclusive, so the last valid start
    # (len(id_array) - n) is included and n == len(id_array) also works.
    start = np.random.randint(0, len(id_array) - n + 1)
    end = start + n

    posters = []
    kept = []  # offsets inside the window of the posters that loaded
    errors = 0
    for offset, poster in enumerate(id_array[start:end]):
        path = '../posters/' + str(int(poster[0])) + '.jpg'
        # The context manager closes the file once the resized copy exists.
        with Image.open(path) as img:
            try:
                scaled = img.resize(size=(500, 741))
                posters.append(np.asarray(scaled))
                kept.append(offset)
            except ValueError:
                errors += 1
                continue
    X = np.asarray(posters)

    # Labels must come from the same window as the posters (not the first n
    # rows), and rows whose poster was skipped are dropped so X and Y line up.
    Y = np.asarray(y)[start:end][np.asarray(kept, dtype=int)]
    print('posters shape: ', X.shape)
    print('errors: ', errors)
    return X, Y

In [8]:
# Fix NumPy's global seed so the randomly chosen poster windows, and the
# features computed from them, are the same on every Restart & Run All.
np.random.seed(42)

# Sample 2000 training and 1000 test posters together with their label rows.
X_train, Y_train = load_posters_from_ids(X_train_ids, 2000, y_train)
X_test, Y_test = load_posters_from_ids(X_test_ids, 1000, y_test)

posters shape:  (2000, 741, 500, 3)
errors:  0
posters shape:  (1000, 741, 500, 3)
errors:  0


In [9]:
###################### load pre-trained model ######################
# Mini-batch size, referenced when the fully connected head is fitted.
batch_size = 16

# VGG16 with ImageNet weights; include_top=False leaves out the top layer
# so the network is used only to extract features.
model = applications.VGG16(weights='imagenet', include_top=False)

In [None]:
# pass training data once through pretrained network
# NOTE(review): the posters are fed in exactly as loaded; confirm whether
# keras.applications.vgg16.preprocess_input should be applied first.
bottleneck_features_train = model.predict(X_train)
# Hand np.save the path itself: it opens the file in binary mode and closes
# it again, whereas open(..., 'w') gave a text-mode handle that was never
# closed and fails on Python 3.
np.save('bottleneck_features_train.npy', bottleneck_features_train)

# pass validation data once through network
bottleneck_features_test = model.predict(X_test)
np.save('bottleneck_features_test.npy', bottleneck_features_test)

In [None]:
############## Load inputs to fully connected layers ###################

# np.load is given the path directly so the file is opened in binary mode
# and closed again (a bare open() handle is text-mode and leaks).
train_data = np.load('bottleneck_features_train.npy')
train_labels = Y_train

# The held-out features are written to bottleneck_features_test.npy by the
# feature-extraction cell; no cell in this notebook writes a
# "bottleneck_features_validation.npy" file.
validation_data = np.load('bottleneck_features_test.npy')
validation_labels = Y_test

############### Fully connected network ####################

# Small classifier on top of the cached features: flatten, one hidden
# layer with dropout, then seven sigmoid outputs (one per label column).
# Note that this rebinds `model`, which previously held the VGG16 base.
feature_shape = train_data.shape[1:]
model = Sequential([
    Flatten(input_shape=feature_shape),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(7, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# Train for 50 epochs, evaluating on the held-out features after each epoch.
held_out = (validation_data, validation_labels)
model.fit(
    train_data,
    train_labels,
    batch_size=batch_size,
    epochs=50,
    validation_data=held_out,
)

# Persist the trained head's weights.
model.save_weights('bottleneck_fc_model.h5')