Goal of this notebook: generate an entry for the Kaggle galaxy classification challenge.

- [x] split out a validation set

- [x] parse the solution csv file

- [x] write a generator to find the picture for each solution

- [x] use fit_generator to train model

In [54]:
import csv, random, os, bcolz, glob
import numpy as np
from scipy.ndimage import imread
from scipy.misc import imresize
import matplotlib.pyplot as plt
from vgg16 import Vgg16, Dense, Adam, Sequential
from utils import *
from __future__ import print_function

In [2]:
%matplotlib inline

In [3]:
%pwd

u'/home/ubuntu/courses/deeplearning1/nbs'

In [4]:
# Every path is anchored on the directory the notebook kernel was started in.
LESSON_HOME_DIR = current_dir = os.getcwd()
# Root of the galaxy dataset; the train/, valid/, test/ and results/ paths used
# further down are all built from it.
DATA_HOME_DIR = LESSON_HOME_DIR + "/data/galaxy"
# Solutions CSV; its first row is read later to build the submission header.
solutions_csv = DATA_HOME_DIR + "/solns.csv"

In [5]:
def save_array(fname, arr):
    """Persist `arr` as a bcolz carray rooted at directory `fname`.

    The carray is opened with mode='w' and flushed before returning.
    Returns None.
    """
    bcolz.carray(arr, rootdir=fname, mode='w').flush()


def load_array(fname):
    """Open the bcolz store at `fname` and return its full contents (`[:]` slice)."""
    stored = bcolz.open(fname)
    return stored[:]

## make validation set

In [6]:
# Load the previously saved solution tables for the validation and training splits.
# Later cells treat column 0 as the galaxy id and columns 1: as the targets.
valid_array = load_array(DATA_HOME_DIR + "/valid/valid_solutions")
train_array = load_array(DATA_HOME_DIR + "/train/training_solutions")

In [7]:
# Sanity check: dimensions of the validation table.
print(valid_array.shape)
# Disabled debugging prints (Python 2 print-statement syntax), kept for reference:
#print valid_array[:5, :]
#print valid_array[0, 1:]

(12159, 38)


In [8]:
# Column 0 of each solutions table holds the galaxy id.
# Build real lists rather than relying on `map`: later cells index train_ids[0]
# and call len(train_ids), which only works on a `map` result under Python 2.
train_ids = [int(gid) for gid in train_array[:, 0]]
valid_ids = [int(gid) for gid in valid_array[:, 0]]

In [102]:
# Switch into the test-image folder so the glob pattern below can be relative.
%cd $DATA_HOME_DIR/test
# NOTE(review): `glob` is called as a function although the import cell imports the
# `glob` module; presumably `from utils import *` rebinds it to glob.glob — confirm.
# NOTE(review): glob order is not sorted; check that the cached test features were
# computed over the files in this same order before pairing ids with predictions.
g = glob("*.jpg")
# File names are "<id>.jpg": strip the 4-character extension to get the integer id.
test_ids = [int(filename[:-4]) for filename in g]
print(test_ids[:5])
print(g[:5])

/home/ubuntu/courses/deeplearning1/nbs/data/galaxy/test
[436614, 241401, 204696, 717923, 173724]
['436614.jpg', '241401.jpg', '204696.jpg', '717923.jpg', '173724.jpg']


## make samples

In [9]:
from shutil import copyfile

In [10]:
# Per-split sample sizes. Nothing in the visible cells consumes them yet.
train_sample_size, valid_sample_size, test_sample_size = 200, 100, 200

## test an idea for generator

## make batched generators

In [13]:
def get_pic_by_id(uid, folder = DATA_HOME_DIR + "/train/unknown/"):
    """Read `<folder><uid>.jpg`, resize it to 224x224 and move its last axis to the front.

    uid    -- numeric image id, formatted with %d to build the file name.
    folder -- directory prefix (must end with a path separator).

    Returns the resized image with axes reordered (0, 1, 2) -> (2, 0, 1);
    the smoke-test cell below shows a (3, 224, 224) result for a training image.
    """
    pic_path = folder + "%d.jpg" % uid
    resized = imresize(imread(pic_path), (224, 224))
    # A single transpose gives the same axis order as swapaxes(1, 2) then swapaxes(0, 1).
    return np.transpose(resized, (2, 0, 1))

In [14]:
# Smoke test: load the first training image and inspect the resulting array shape.
get_pic_by_id(train_ids[0]).shape

(3, 224, 224)

In [95]:
# predict_generator needs a generator that won't continue past the end of the underlying dataset

def img_gen(idlist, batchsize, folder = DATA_HOME_DIR + "/train/unknown/"):
    """Yield image batches for the ids in `idlist`, one pass only.

    idlist    -- iterable of image ids, consumed in order.
    batchsize -- maximum number of images per yielded batch.
    folder    -- directory prefix handed to get_pic_by_id.

    Each yielded value is the images of one batch stacked along a new axis 0.
    The final batch may hold fewer than `batchsize` images. Once every id has
    been consumed the generator simply finishes.
    """
    pics = (get_pic_by_id(uid, folder) for uid in idlist)
    while True:
        batch = []
        for _ in range(batchsize):
            try:
                batch.append(next(pics))
            except StopIteration:
                # Source exhausted: stop filling instead of polling it again.
                break
        if not batch:
            # Finish with a plain return. Raising StopIteration inside a generator
            # body is turned into RuntimeError on Python 3.7+ (PEP 479).
            return
        # No blanket `except ValueError` here, so a real stacking problem
        # (e.g. mismatched image shapes) surfaces instead of ending iteration.
        yield np.stack(batch, axis=0)

In [16]:
def data_gen(solnarray, batchsize, folder = DATA_HOME_DIR + "/train/unknown/"):
    """Endlessly yield (images, solutions) batches built from `solnarray`.

    solnarray -- 2-D table; in each row, element 0 is the image id and the
                 remaining elements are the targets.
    batchsize -- number of rows per yielded batch.
    folder    -- directory prefix handed to get_pic_by_id.

    Yields a tuple (all_img_out, all_soln_out), each stacked along axis 0.
    When the rows run out the generator restarts from the first row, so a
    batch can straddle the end of one pass and the start of the next.

    Raises ValueError if `solnarray` has no rows. Without this guard the
    restart would leak StopIteration out of the generator body.
    """
    def _pairs():
        # Fresh lazy pass over the rows; each image is loaded only when requested.
        return ((get_pic_by_id(rr[0], folder), rr[1:]) for rr in list(solnarray))

    imgen = _pairs()
    while True:
        img_out = []
        soln_out = []
        for _ in range(batchsize):
            try:
                this_img, this_soln = next(imgen)
            except StopIteration:
                # End of a pass: start over from the first row.
                imgen = _pairs()
                try:
                    this_img, this_soln = next(imgen)
                except StopIteration:
                    raise ValueError("data_gen: solnarray is empty, cannot build a batch")
            img_out.append(this_img)
            soln_out.append(this_soln)
        all_img_out = np.stack(img_out, axis=0)
        all_soln_out = np.stack(soln_out, axis=0)
        yield (all_img_out, all_soln_out)

## alter the vgg16 model

In [17]:
# Instantiate the Vgg16 wrapper imported from the vgg16 module; its `.model` is reused below.
vgg = Vgg16()

In [18]:
# Hyper-parameters for the replacement head.
lr = 0.001
hidden_size = 64
# One output per solutions column, excluding the leading id column.
output_size = train_array.shape[1] - 1

# Take the wrapper's underlying model and drop its last layer.
model = vgg.model
model.pop()

# Freeze every remaining layer; only the layers added below stay trainable.
for layer in model.layers:
    layer.trainable = False

# New head: one ReLU hidden layer, then an output layer with no activation.
model.add(Dense(hidden_size, activation='relu'))
model.add(Dense(output_size, activation=None))
model.compile(loss='mse', optimizer=Adam(lr=lr))

## precompute the cnn layers

In [68]:
# All layers of `model` except the last two, wrapped as their own Sequential.
# NOTE(review): this assumes the last two layers are the Dense layers added in the
# model-setup cell, i.e. that cell ran exactly once before this one — confirm.
pre_layers = model.layers[:-2]
pre_model = Sequential(pre_layers)
#pre_model.summary()
# Batch size reused later for fc_model.predict.
pred_batch_size = 32
num_train_samples = len(train_ids)
print(num_train_samples)

49419


In [75]:
# Print the layer-by-layer summary of the truncated model.
pre_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lambda_1 (Lambda)                (None, 3, 224, 224)   0           lambda_input_1[0][0]             
____________________________________________________________________________________________________
zeropadding2d_1 (ZeroPadding2D)  (None, 3, 226, 226)   0           lambda_1[0][0]                   
____________________________________________________________________________________________________
convolution2d_1 (Convolution2D)  (None, 64, 224, 224)  0           zeropadding2d_1[5][0]            
____________________________________________________________________________________________________
zeropadding2d_2 (ZeroPadding2D)  (None, 64, 226, 226)  0           convolution2d_1[5][0]            
___________________________________________________________________________________________

In [106]:
# Load cached feature arrays for each split from the results/ folder.
# NOTE(review): the cell that wrote these .bc stores is not in this notebook;
# presumably they are pre_model outputs — confirm before regenerating.
valid_features = load_array(DATA_HOME_DIR + '/results/valid_convlayer_features.bc')
train_features = load_array(DATA_HOME_DIR + '/results/train_convlayer_features.bc')
test_features = load_array(DATA_HOME_DIR + '/results/test_convlayer_features.bc')

## Set up just the last few layers to train

In [157]:
# Widths of the two hidden layers and of the output layer.
hidden1_size = hidden2_size = 128
output_size = 37

# Small dense network that maps 4096-wide feature vectors to the 37 outputs.
# The last layer has no activation.
fc_model = Sequential([
    Dense(hidden1_size, activation='relu', input_dim=4096),
    Dense(hidden2_size, activation='relu'),
    Dense(output_size, activation=None),
])

# Mean-squared-error objective, optimised with RMSprop.
fc_model.compile(loss='mse', optimizer=RMSprop(lr=0.001, rho=0.7))

## train the model

In [158]:
# Targets only: drop column 0 (the id) from each solutions table.
train_solns = train_array[:, 1:]
valid_solns = valid_array[:, 1:]
print(train_solns.shape)

(49419, 37)


In [159]:
# Mini-batch size shared by every fit call in this section.
fit_batchsize = 64

# First training round of the dense head on the cached features,
# evaluated against the validation split.
fc_model.fit(x = train_features,
             y = train_solns,
             validation_data = (valid_features, valid_solns),
             batch_size = fit_batchsize,
             nb_epoch = 5)

Train on 49419 samples, validate on 12159 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f89cd57ee10>

In [164]:
# NOTE(review): this rebinds the optimizer's `lr` attribute to a plain float. If the
# training function has already been built around a backend variable, the change may
# not take effect; updating that variable's value in place may be needed — confirm.
fc_model.optimizer.lr=0.001

In [165]:
# Another training round; keep the History object so the final validation loss can be read.
this_hist = fc_model.fit(x = train_features,
             y = train_solns,
             validation_data = (valid_features, valid_solns),
             batch_size = fit_batchsize,
             nb_epoch = 5)

Train on 49419 samples, validate on 12159 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [166]:
# The loss is MSE, so the square root of the last epoch's val_loss is the validation RMSE.
print(np.sqrt(this_hist.history["val_loss"][-1]))

0.127835715089


In [131]:
# NOTE(review): the execution count shows this cell ran before the cells above it, so
# it may be a leftover experiment. As with the earlier lr assignment, rebinding `lr`
# to a float may not affect an already-built training function — confirm.
fc_model.optimizer.lr=0.01

In [132]:
# Same fit call as the earlier training cells, repeated for another five epochs.
fc_model.fit(x = train_features,
             y = train_solns,
             validation_data = (valid_features, valid_solns),
             batch_size = fit_batchsize,
             nb_epoch = 5)

Train on 49419 samples, validate on 12159 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8a0d74eed0>

## generate test predictions

In [167]:
# Run the dense head over the cached test features; the second positional
# argument is the prediction batch size set earlier.
test_predicts = fc_model.predict(test_features, pred_batch_size)
print(test_predicts.shape)

(79975, 37)


In [168]:
# Prepend the ids as a first column so each row reads: id, then the predictions.
# NOTE(review): rows are paired purely by position; this assumes test_ids is in the
# same order as the rows of test_features — verify against how the cache was built.
kaggle_predicts = np.hstack([np.reshape(np.array(test_ids), (-1, 1)), test_predicts])
print(kaggle_predicts.shape)

(79975, 38)


In [169]:
# Row format for the submission: one integer id, then 37 floats with five
# decimals, comma-separated and without a trailing comma.
fmt_str = ','.join(['%d'] + ['%.5f'] * 37)
print(fmt_str)

%d,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f,%.5f


In [170]:
# Read just the first row of the solutions CSV (the column names) and rebuild it
# as a single comma-separated string to use as the submission header.
# NOTE(review): 'rb' is the Python 2 convention for csv files; on Python 3 the
# file would need to be opened in text mode instead.
with open(solutions_csv, 'rb') as csvfile:
    solutions_reader = csv.reader(csvfile, delimiter=',')
    # Built-in next() instead of the Python-2-only .next() method.
    solns_header = ",".join(next(solutions_reader))

print(solns_header)

GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,Class5.1,Class5.2,Class5.3,Class5.4,Class6.1,Class6.2,Class7.1,Class7.2,Class7.3,Class8.1,Class8.2,Class8.3,Class8.4,Class8.5,Class8.6,Class8.7,Class9.1,Class9.2,Class9.3,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6


In [171]:
# Switch to the dataset folder so the submission is written there under a relative name.
%cd $DATA_HOME_DIR
submission_file_name = 'galaxy_submission_2.csv'
# comments='' stops np.savetxt from prefixing the header line with its default '# '.
np.savetxt(submission_file_name, kaggle_predicts, fmt=fmt_str, header=solns_header, comments='')

/home/ubuntu/courses/deeplearning1/nbs/data/galaxy


In [172]:
from IPython.display import FileLink
# Return to the notebook's home directory so the relative link below resolves.
%cd $LESSON_HOME_DIR
# Render a link to the submission file written in the previous cell.
FileLink("data/galaxy/" + submission_file_name)

/home/ubuntu/courses/deeplearning1/nbs
