In [1]:
import numpy as np # linear algebra
import pandas as pd
import cv2, gc
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import fbeta_score
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Flatten
from keras.applications.vgg19 import VGG19
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [2]:
input_size = 128
epoch = 20
batch_size = 128
input_shape = (input_size, input_size, 3)
path = '/kaggle/input/planets-dataset/planet/planet/'

In [4]:
gc.collect()
train_classes = pd.read_csv(f'{path}train_classes.csv')
train_classes = shuffle(train_classes, random_state=0)
sample_submission = pd.read_csv(f'{path}sample_submission.csv')
trad_sample_df = sample_submission[sample_submission.image_name.str.contains('file_')].copy()
sample_submission = sample_submission[sample_submission.image_name.str.contains('test_')]

In [5]:
s = train_classes.tags.str.split(' ').explode()
lb = MultiLabelBinarizer()
encoded = lb.fit_transform(s.values[:, None])
one_hot_df = pd.DataFrame(encoded.tolist(), columns=np.ravel(lb.classes_), dtype='int') \
                .groupby(s.index) \
                .sum()
one_hot_df['image_name'] = train_classes["image_name"].apply(lambda fn: fn+".jpg")
cols = ['image_name'] + list(np.ravel(lb.classes_))
train_classes = one_hot_df[cols].copy()
del one_hot_df, s, encoded, lb
trad_sample_df['image_name'] = trad_sample_df["image_name"].apply(lambda fn: fn+".jpg")
sample_submission['image_name'] = sample_submission["image_name"].apply(lambda fn: fn+".jpg")

In [6]:
datagen = ImageDataGenerator(horizontal_flip=True, vertical_flip=True,
                             zoom_range=0.5, rotation_range=90,
                             rescale=1./255.)

In [7]:
def VGG19_Amazon_Model(input_shape=input_shape):
    gc.collect()
    base_model = VGG19(include_top=False, weights='imagenet',
                       input_shape=input_shape)
    model = Sequential()
    model.add(BatchNormalization(input_shape=input_shape))
    model.add(base_model)
    model.add(Flatten())
    model.add(Dense(17, activation='sigmoid'))
   
    return model


def return_model_name(k):
    return '/kaggle/working/model_'+str(k)+'.h5'


def train_model(df, k=5):
    gc.collect()
    model = VGG19_Amazon_Model()
    kf = KFold(n_splits=k, random_state=1, shuffle=True)
    fold = 1

    for train_index, val_index in kf.split(df.image_name):
        
        training_data = df.iloc[train_index]
        validation_data = df.iloc[val_index]
        
        train_generator=datagen.flow_from_dataframe(
                                            dataframe=training_data, directory=f'{path}/train-jpg/',
                                            x_col="image_name", y_col=cols[1:], batch_size=batch_size,
                                            seed=42, shuffle=True, class_mode="raw",
                                            target_size=(input_size, input_size))
        
        val_generator=datagen.flow_from_dataframe(
                                            dataframe=validation_data, directory=f'{path}/train-jpg/',
                                            x_col="image_name", y_col=cols[1:], batch_size=batch_size,
                                            seed=42, shuffle=True, class_mode="raw",
                                            target_size=(input_size, input_size))
        
        opt = Adam(lr=0.0001)
        model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

        callback = [EarlyStopping(monitor='val_accuracy', patience=4, verbose=1),
                    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2,
                                   cooldown=2, verbose=1),
                    ModelCheckpoint(return_model_name(fold), monitor='val_accuracy', 
                                    verbose=1, save_best_only=True, mode='max')]
        history = model.fit_generator(train_generator, 
                                      validation_data=val_generator,
                                      callbacks=callback, verbose=1, epochs=epoch) 
        
        #pred_val = model.predict_generator(val_generator, verbose=1)
        #preds = np.array(1*(pred_val > 0.18))
        #print("F BETA Score: {}".format(fbeta_score(val_generator.labels, preds, beta=2, 
        #                                              average='samples')))
        
        fold += 1
        
    return val_generator


def predict_model(test_gen, k=5, batch_size=batch_size):
    model = VGG19_Amazon_Model()
    full_test = []

    for nfold in range(1,k+1):
        model.load_weights(filepath=return_model_name(nfold))
        p_test = model.predict_generator(test_gen, verbose=1)
        full_test.append(p_test)
    
    result = np.array(full_test[0])
    for i in range(1, k):
        result += np.array(full_test[i])
    result = result / k
    
    return result

In [7]:
val_generator = train_model(train_classes)
gc.collect()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5
Found 32383 validated image filenames.
Found 8096 validated image filenames.
Epoch 1/20
Epoch 00001: val_accuracy improved from -inf to 0.05089, saving model to /kaggle/working/model_1.h5
Epoch 2/20
Epoch 00002: val_accuracy improved from 0.05089 to 0.13463, saving model to /kaggle/working/model_1.h5
Epoch 3/20
Epoch 00003: val_accuracy did not improve from 0.13463
Epoch 4/20
Epoch 00004: val_accuracy did not improve from 0.13463
Epoch 5/20
Epoch 00005: val_accuracy improved from 0.13463 to 0.13785, saving model to /kaggle/working/model_1.h5
Epoch 6/20
Epoch 00006: val_accuracy did not improve from 0.13785
Epoch 7/20
Epoch 00007: val_accuracy did not improve from 0.13785
Epoch 8/20
Epoch 00008: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.

Epoch 00008: val_accuracy did not improve from 0.13785
Epoch 9/20
Epoch 00009: val_accura

Epoch 2/20
Epoch 00002: val_accuracy improved from 0.08833 to 0.11217, saving model to /kaggle/working/model_5.h5
Epoch 3/20
Epoch 00003: val_accuracy did not improve from 0.11217
Epoch 4/20
Epoch 00004: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.

Epoch 00004: val_accuracy did not improve from 0.11217
Epoch 5/20
Epoch 00005: val_accuracy did not improve from 0.11217
Epoch 6/20
Epoch 00006: val_accuracy did not improve from 0.11217
Epoch 00006: early stopping


4384

In [42]:
pred_val = predict_model(val_generator, 5)        
#pred_val = (pred_val > 0.18)
#preds = pred_val.astype(int)
#vals = np.array(val_generator.labels, np.int8)
#print('F2 = {}'.format(fbeta_score(vals, np.array(pred_val) > 0.18, beta=2, average='samples')))



In [55]:
test_datagen=ImageDataGenerator(rescale=1./255.)

test2_generator=test_datagen.flow_from_dataframe(
                                            dataframe=sample_submission, directory=f'{path}/test-jpg/',
                                            x_col="image_name", y_col=None, batch_size=16,
                                            seed=42, shuffle=False, class_mode=None, 
                                            target_size=(input_size, input_size))

Found 40669 validated image filenames.


In [58]:
pred_test = predict_model(test1_generator, 5)
pred_bool = (pred_test > 0.18)
result1 = pred_bool.astype(int)
result1 = pd.DataFrame(result1, columns=cols[1:])



ValueError: Shape of passed values is (40669, 17), indices imply (40669, 18)

In [None]:
test2_generator=test_datagen.flow_from_dataframe(
                                            dataframe=trad_sample_df, 
                                            directory='../input/planets-dataset/test-jpg-additional/test-jpg-additional/',
                                            x_col="image_name", y_col=None, batch_size=16,
                                            seed=42, shuffle=False, class_mode=None, 
                                            target_size=(input_size, input_size))

In [None]:
pred_test = predict_model(test2_generator, 5)
pred_bool = (pred_test > 0.18)
result2 = pred_bool.astype(int)
result2 = pd.DataFrame(result2, columns=cols[1:])

In [60]:
results = result1.append(result2)
results["image_name"]=test_generator.filenames
results = results[cols] #To get the same column order
results.to_csv("submission.csv",index=False)