In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

import cv2

# import tensorflow
# import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint

from tqdm import tqdm

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import fbeta_score
import time

Using TensorFlow backend.


In [2]:
x_train = []
x_test = []
y_train = []

df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/sample_submission.csv')


In [3]:
# df_train['tags'].values.shape
# flatten = lambda l: [item for sublist in l for item in sublist]
# labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))
labels = ['haze',
 'artisinal_mine',
 'blooming',
 'habitation',
 'cultivation',
 'primary',
 'clear',
 'water',
 'road',
 'slash_burn',
 'cloudy',
 'agriculture',
 'partly_cloudy',
 'conventional_mine',
 'bare_ground',
 'selective_logging',
 'blow_down']

label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}
print(inv_label_map)

{0: 'haze', 1: 'artisinal_mine', 2: 'blooming', 3: 'habitation', 4: 'cultivation', 5: 'primary', 6: 'clear', 7: 'water', 8: 'road', 9: 'slash_burn', 10: 'cloudy', 11: 'agriculture', 12: 'partly_cloudy', 13: 'conventional_mine', 14: 'bare_ground', 15: 'selective_logging', 16: 'blow_down'}


In [None]:
img = cv2.imread('../input/train-jpg/train_0.jpg')
img.shape

In [4]:
for f, tags in tqdm(df_train.values, miniters=1000):
    img = cv2.imread('../input/train-jpg/{}.jpg'.format(f))
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    x_train.append(cv2.resize(img, (64, 64)))
    y_train.append(targets)

for f, tags in tqdm(df_test.values, miniters=1000):
    img = cv2.imread('../input/test-jpg/{}.jpg'.format(f))
    x_test.append(cv2.resize(img, (64, 64)))
    
y_train = np.array(y_train, np.uint8)
x_train = np.array(x_train, np.float32) / 255.
x_test  = np.array(x_test, np.float32) / 255.

print(x_train.shape)
print(y_train.shape)

100%|██████████| 40479/40479 [01:28<00:00, 459.13it/s]
100%|██████████| 40669/40669 [01:02<00:00, 649.71it/s]


(40479, 64, 64, 3)
(40479, 17)


In [10]:
from keras.layers import Activation

def load_model(weights_path=None):
    model = Sequential()
    
#     model.add(Conv2D(32, 3, 3, activation='relu', input_shape=(48, 48, 3)))
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Conv2D(48, 3, 3, activation='relu'))
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Dropout(0.25))
#     model.add(Conv2D(64, 3, 3, activation='relu'))
#     model.add(MaxPooling2D(pool_size=(2, 2)))
#     model.add(Dropout(0.25))
#     model.add(Flatten())
#     model.add(Dense(128, activation='relu'))
#     model.add(Dropout(0.5))
#     model.add(Dense(17, activation='softmax'))

    model.add(Conv2D(32, (3, 3), padding='same',
                 input_shape=(64, 64, 3)))
    model.add(Activation('relu'))
    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Conv2D(64, (3, 3), padding='same'))
    model.add(Activation('relu'))
    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(17))
    model.add(Activation('softmax'))

    model.compile(loss='binary_crossentropy', 
                  optimizer='adam',
                  metrics=['accuracy'])
    if weights_path:
        pass
    return model


In [11]:
nfolds = 5

num_fold = 0
sum_score = 0

yfull_test = []
yfull_train =[]
historys = []
f_scores = [{}, {}, {}, {}, {}]

kf = KFold(len(y_train), n_folds=nfolds, shuffle=True, random_state=1)

for train_index, test_index in kf:
        start_time_model_fitting = time.time()
#         print(train_index, test_index)
        X_train = x_train[train_index]
        Y_train = y_train[train_index]
        X_valid = x_train[test_index]
        Y_valid = y_train[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))
        
        kfold_weights_path = os.path.join('', 'weights_kfold_' + str(num_fold) + '.h5')
        
        model = load_model()

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=2, verbose=0),
            ModelCheckpoint(kfold_weights_path, monitor='val_loss', save_best_only=True, save_weights_only=True, verbose=0)]
        
        history = model.fit(x = X_train, y= Y_train, validation_data=(X_valid, Y_valid),
                  batch_size=32,verbose=2, epochs=50,callbacks=callbacks,
                  shuffle=True)
        historys.append(history)
        
        if os.path.isfile(kfold_weights_path):
            model.load_weights(kfold_weights_path)
        
        p_valid = model.predict(X_valid, batch_size = 128, verbose=2)
        for i in range(5, 20):
            threshold = (i + 1)/100.
            score = fbeta_score(Y_valid, np.array(p_valid) > threshold, beta=2, average='samples')
            f_scores[num_fold - 1][threshold] = score
        
#         print(fbeta_score(Y_valid, np.array(p_valid) > 0.2, beta=2, average='samples'))
#         print(fbeta_score(Y_valid, np.array(p_valid) > 0.3, beta=2, average='samples'))
#         print(fbeta_score(Y_valid, np.array(p_valid) > 0.4, beta=2, average='samples'))

#         p_test = model.predict(x_train, batch_size = 128, verbose=2)
#         yfull_train.append(p_test)
        
        p_test = model.predict(x_test, batch_size = 128, verbose=2)
        yfull_test.append(p_test)

Start KFold number 1 from 5
Split train:  32383 32383
Split valid:  8096 8096
Train on 32383 samples, validate on 8096 samples
Epoch 1/50
83s - loss: 0.3359 - acc: 0.8376 - val_loss: 0.3154 - val_acc: 0.8308
Epoch 2/50
79s - loss: 0.3084 - acc: 0.8445 - val_loss: 0.2961 - val_acc: 0.8522
Epoch 3/50
79s - loss: 0.2975 - acc: 0.8461 - val_loss: 0.2918 - val_acc: 0.8455
Epoch 4/50
79s - loss: 0.2913 - acc: 0.8466 - val_loss: 0.2856 - val_acc: 0.8351
Epoch 5/50


KeyboardInterrupt: 

In [14]:
f_scores

[{0.06: 0.76484472219402877,
  0.07: 0.75966737560105335,
  0.08: 0.74097948585974727,
  0.09: 0.72528084049806052,
  0.1: 0.71572704432242795},
 {0.06: 0.72401422535394511,
  0.07: 0.70540042374317447,
  0.08: 0.68673168784407024,
  0.09: 0.67504208374727059,
  0.1: 0.65558002015540784},
 {0.06: 0.73087513059753428,
  0.07: 0.72321915232450373,
  0.08: 0.71038291648254948,
  0.09: 0.69754987076165198,
  0.1: 0.68579500906922719},
 {},
 {}]

In [1]:
import matplotlib.pyplot as plt

plot_num = len(historys)
for i in range(plot_num):
    history_dict = historys[i].history
    
    # history_dict = history.history
    loss_values = history_dict['loss']
    val_loss_values = history_dict['val_loss']
    epochs = range(1, len(loss_values) + 1)
    plt.subplot(plot_num, 1, i + 1)
    # "bo" is for "blue dot"
    plt.plot(epochs, loss_values, 'bo')
    # b+ is for "blue crosses"
    plt.plot(epochs, val_loss_values, 'b+')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')

    plt.show()


NameError: name 'historys' is not defined

In [None]:
for i in range(plot_num):
    history_dict = historys[i].history
    acc_values = history_dict['acc']
    val_acc_values = history_dict['val_acc']
#     print(acc_values)
    epochs = range(1, len(loss_values) + 1)
    
    plt.subplot(plot_num, 1, i+1)
    
    plt.plot(epochs, acc_values, 'bo')
    plt.plot(epochs, val_acc_values, 'b+')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')

    plt.show()