Keras + CV

Thanks @anokas for the starter code at https://www.kaggle.com/anokas/planet-understanding-the-amazon-from-space/simple-keras-starter/

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint

import cv2
from tqdm import tqdm

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import fbeta_score
import time


Using Theano backend.


Pre-processing the train and test data

In [None]:
# Load the training/test images, shrink them and build the multi-hot label matrix.
IMG_SIZE = (32, 32)   # (width, height) handed to cv2.resize
N_TRAIN = 20000       # only the first N_TRAIN rows of train.csv are loaded

df_train = pd.read_csv('../data/train_csv/train.csv')
df_test = pd.read_csv('../data/sample_submission_csv/sample_submission.csv')

flatten = lambda l: [item for sublist in l for item in sublist]
# Sort the unique tags: iterating a bare set of strings follows hash order,
# which changes between interpreter sessions, so an unsorted list would give
# every kernel restart a different label -> column assignment.
labels = sorted(set(flatten([l.split(' ') for l in df_train['tags'].values])))

label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}


def load_image(path, size=IMG_SIZE):
    """Read the image at `path` with OpenCV and resize it to `size`.

    Raises:
        IOError: if OpenCV could not read the file (cv2.imread returns None
            instead of raising, which would otherwise only fail inside resize).
    """
    img = cv2.imread(path)
    if img is None:
        raise IOError('Could not read image: {}'.format(path))
    return cv2.resize(img, size)


x_train = []
y_train = []
for f, tags in tqdm(df_train.values[:N_TRAIN], miniters=1000):
    # One slot per known tag; set to 1 for every tag attached to this image.
    targets = np.zeros(len(labels))
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    x_train.append(load_image('../data/train-jpg/train-jpg/{}.jpg'.format(f)))
    y_train.append(targets)

x_test = []
for f, tags in tqdm(df_test.values, miniters=1000):
    x_test.append(load_image('../data/test-jpg/test-jpg/{}.jpg'.format(f)))

y_train = np.array(y_train, np.uint8)
# Scale 8-bit pixel values into [0, 1].
x_train = np.array(x_train, np.float32) / 255.
x_test = np.array(x_test, np.float32) / 255.

assert len(x_train) == len(y_train), 'images and label rows are out of sync'

print(x_train.shape)
print(y_train.shape)

100%|████████████████████████████████████| 20000/20000 [07:36<00:00, 43.84it/s]  0%|                                                | 0/20000 [00:20<?, ?it/s]
100%|████████████████████████████████████| 40669/40669 [11:20<00:00, 59.73it/s]


(20000, 32, 32, 3)
(20000, 17)


In [9]:
# Sanity check: display the second training image (index 1) after scaling.
x_train[1]

array([[[ 0.25098041,  0.27843139,  0.23529412],
        [ 0.20392157,  0.25490198,  0.20392157],
        [ 0.25490198,  0.30980393,  0.25490198],
        ..., 
        [ 0.22352941,  0.24705882,  0.2       ],
        [ 0.27843139,  0.35294119,  0.29019609],
        [ 0.25882354,  0.28235295,  0.24705882]],

       [[ 0.25882354,  0.30980393,  0.27058825],
        [ 0.25882354,  0.29019609,  0.24313726],
        [ 0.24313726,  0.29019609,  0.24705882],
        ..., 
        [ 0.24705882,  0.25490198,  0.21960784],
        [ 0.26274511,  0.30588236,  0.27450982],
        [ 0.25490198,  0.34509805,  0.27843139]],

       [[ 0.25098041,  0.29411766,  0.25098041],
        [ 0.27058825,  0.32156864,  0.27450982],
        [ 0.25490198,  0.29803923,  0.25882354],
        ..., 
        [ 0.23137255,  0.27058825,  0.22352941],
        [ 0.25490198,  0.26666668,  0.23529412],
        [ 0.25882354,  0.32156864,  0.26666668]],

       ..., 
       [[ 0.32156864,  0.43529412,  0.41176471],
        

Transpose the data to channels-first order if using the Theano backend

In [10]:
# Move the channel axis to the front (N, H, W, C) -> (N, C, H, W), but only when
# the Keras backend is configured for channels-first data AND the arrays are
# still channels-last. The shape guard keeps the cell idempotent: re-running it
# must not permute the axes a second time.
if k.backend.image_data_format() == 'channels_first':
    if x_train.ndim == 4 and x_train.shape[-1] == 3:
        x_train = x_train.transpose((0, 3, 1, 2))
    if x_test.ndim == 4 and x_test.shape[-1] == 3:
        x_test = x_test.transpose((0, 3, 1, 2))

In [19]:
# Sanity check: display the first test image.
x_test[0]

array([[[ 0.25882354,  0.28235295,  0.24313726],
        [ 0.27450982,  0.31764707,  0.27843139],
        [ 0.27058825,  0.30588236,  0.25882354],
        ..., 
        [ 0.28235295,  0.32156864,  0.28627452],
        [ 0.29411766,  0.33333334,  0.31764707],
        [ 0.3019608 ,  0.33725491,  0.30980393]],

       [[ 0.26666668,  0.29803923,  0.25490198],
        [ 0.27450982,  0.30980393,  0.27843139],
        [ 0.25490198,  0.27450982,  0.23529412],
        ..., 
        [ 0.28627452,  0.3137255 ,  0.28627452],
        [ 0.28235295,  0.31764707,  0.27450982],
        [ 0.29019609,  0.32549021,  0.29411766]],

       [[ 0.27058825,  0.3019608 ,  0.28235295],
        [ 0.26274511,  0.3019608 ,  0.27058825],
        [ 0.27843139,  0.31764707,  0.28235295],
        ..., 
        [ 0.26274511,  0.29411766,  0.25490198],
        [ 0.27843139,  0.29019609,  0.24705882],
        [ 0.27843139,  0.29019609,  0.25490198]],

       ..., 
       [[ 0.27058825,  0.29411766,  0.26666668],
        

Run n-fold cross-validation

In [None]:
# Train one CNN per fold, report its validation F2 score and collect its
# predictions on the full training array and on the test array.
nfolds = 3

num_fold = 0
sum_score = 0

yfull_test = []
yfull_train = []


def build_model(input_shape, n_labels):
    """Return a new, compiled CNN for multi-label tagging.

    Args:
        input_shape: shape of one image, without the batch axis.
        n_labels: number of sigmoid outputs (one per tag).
    """
    model = Sequential()
    # Kernel sizes are passed as explicit tuples: in the current Conv2D
    # signature the third positional argument is `strides`, not the kernel width.
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(48, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    # Independent sigmoid per tag, trained with binary cross-entropy.
    model.add(Dense(n_labels, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model


# KFold here is the class imported from sklearn.cross_validation: it takes the
# number of samples and is iterated directly to obtain the index pairs.
kf = KFold(len(y_train), n_folds=nfolds, shuffle=True, random_state=1)

for train_index, test_index in kf:
    # Recorded at the start of each fold; not reported anywhere in this cell.
    start_time_model_fitting = time.time()

    X_train = x_train[train_index]
    Y_train = y_train[train_index]
    X_valid = x_train[test_index]
    Y_valid = y_train[test_index]

    num_fold += 1
    print('Start KFold number {} from {}'.format(num_fold, nfolds))
    print('Split train: ', len(X_train), len(Y_train))
    print('Split valid: ', len(X_valid), len(Y_valid))

    kfold_weights_path = os.path.join('', 'weights_kfold_' + str(num_fold) + '.h5')

    # Take the input shape and the output width from the data, so the network
    # always matches the arrays actually being fed to it.
    model = build_model(X_train.shape[1:], Y_train.shape[1])

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=2, verbose=0),
        ModelCheckpoint(kfold_weights_path, monitor='val_loss', save_best_only=True, verbose=0)]

    model.fit(x=X_train, y=Y_train, validation_data=(X_valid, Y_valid),
              batch_size=128, verbose=2, epochs=7, callbacks=callbacks)

    # Restore the checkpoint with the lowest validation loss for this fold.
    if os.path.isfile(kfold_weights_path):
        model.load_weights(kfold_weights_path)

    p_valid = model.predict(X_valid, batch_size=128, verbose=2)
    fold_score = fbeta_score(Y_valid, np.array(p_valid) > 0.2, beta=2, average='samples')
    sum_score += fold_score
    print(fold_score)

    # Predictions on the whole training array, including the samples this
    # fold's model was fitted on.
    p_train = model.predict(x_train, batch_size=128, verbose=2)
    yfull_train.append(p_train)

    p_test = model.predict(x_test, batch_size=128, verbose=2)
    yfull_test.append(p_test)

if num_fold:
    print('Mean F2 over {} folds: {}'.format(num_fold, sum_score / num_fold))

Start KFold number 1 from 3
Split train:  13333 13333
Split valid:  6667 6667




Train on 13333 samples, validate on 6667 samples
Epoch 1/7
22s - loss: 0.3280 - acc: 0.8761 - val_loss: 0.2525 - val_acc: 0.9047
Epoch 2/7
23s - loss: 0.2453 - acc: 0.9063 - val_loss: 0.2253 - val_acc: 0.9102
Epoch 3/7
23s - loss: 0.2271 - acc: 0.9126 - val_loss: 0.2112 - val_acc: 0.9174
Epoch 4/7
24s - loss: 0.2110 - acc: 0.9169 - val_loss: 0.1975 - val_acc: 0.9219
Epoch 5/7
24s - loss: 0.2021 - acc: 0.9212 - val_loss: 0.2086 - val_acc: 0.9208
Epoch 6/7
29s - loss: 0.1981 - acc: 0.9222 - val_loss: 0.1862 - val_acc: 0.9279
Epoch 7/7
53s - loss: 0.1913 - acc: 0.9247 - val_loss: 0.1840 - val_acc: 0.9285
0.817821450644
Start KFold number 2 from 3
Split train:  13333 13333
Split valid:  6667 6667




Train on 13333 samples, validate on 6667 samples
Epoch 1/7
32s - loss: 0.3128 - acc: 0.8870 - val_loss: 0.2420 - val_acc: 0.9063
Epoch 2/7
36s - loss: 0.2426 - acc: 0.9079 - val_loss: 0.2351 - val_acc: 0.9091
Epoch 3/7
41s - loss: 0.2267 - acc: 0.9132 - val_loss: 0.2169 - val_acc: 0.9163
Epoch 4/7
47s - loss: 0.2177 - acc: 0.9156 - val_loss: 0.2176 - val_acc: 0.9148
Epoch 5/7
45s - loss: 0.2101 - acc: 0.9175 - val_loss: 0.2019 - val_acc: 0.9192
Epoch 6/7
36s - loss: 0.2052 - acc: 0.9196 - val_loss: 0.2009 - val_acc: 0.9225
Epoch 7/7
43s - loss: 0.1989 - acc: 0.9224 - val_loss: 0.1912 - val_acc: 0.9255
0.810891837524
Start KFold number 3 from 3
Split train:  13334 13334
Split valid:  6666 6666
Train on 13334 samples, validate on 6666 samples
Epoch 1/7


Average the predictions from each fold

In [None]:
# Average the per-fold test predictions into one probability table.
# Use the folds that were actually collected rather than assuming `nfolds`
# entries, so an interrupted cross-validation run does not raise IndexError.
if not yfull_test:
    raise ValueError('yfull_test is empty - run the cross-validation cell first')
if len(yfull_test) != nfolds:
    print('Warning: averaging {} of {} folds'.format(len(yfull_test), nfolds))

mean_test_probs = np.mean(np.stack(yfull_test), axis=0)
# One column per tag, in the same order as the label indices used for training.
result = pd.DataFrame(mean_test_probs, columns=labels)
result.head()


Output prediction for submission

In [None]:
# Turn each row of averaged probabilities into a space-separated tag string.
# A tag is emitted when its probability is strictly greater than the threshold;
# tags keep the column order of `result`. Rows with no tag above the threshold
# produce an empty string.
TAG_THRESHOLD = 0.2

tag_names = np.asarray(result.columns)
above_threshold = result.values > TAG_THRESHOLD   # boolean mask, one row per test image
preds = [' '.join(tag_names[row_mask]) for row_mask in above_threshold]


In [None]:
# Overwrite the 'tags' column of the test frame with the predicted tag
# strings (one entry of `preds` per row), then display the frame.
df_test['tags'] = preds
df_test

In [8]:
# Write the frame to CSV without the pandas index column.
df_test.to_csv('submission_keras.csv', index=False)