In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import seaborn as sns
from skimage import io
from tqdm import tqdm
import cv2
import gc
%matplotlib inline

In [2]:
# Load the training labels; later cells read its `image_name` and `tags` columns.
TRAIN_CSV_PATH = "../input/planet-understanding-the-amazon-from-space/train_v2.csv/train_v2.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)
print(train_df)

In [3]:
# Collect the distinct tags, kept in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order.
label_list = list(dict.fromkeys(
    label
    for tag_str in train_df.tags.values
    for label in tag_str.split(' ')
))
print(label_list)

In [4]:
# Add one 0/1 indicator column per label.
# The tag strings are split once up front and reused for every label.
split_tags = train_df['tags'].apply(lambda tag_str: tag_str.split(' '))
for label in label_list:
    train_df[label] = split_tags.apply(lambda tags: 1 if label in tags else 0)
print(train_df)

In [5]:
# Bar chart of how many rows carry each label, sorted from least to most frequent.
label_counts = train_df[label_list].sum().sort_values()
label_counts.plot.bar()

In [6]:
# Build and plot a co-occurrence matrix (how often each pair of tags appears together)
def make_cooccurence_matrix(labels):
    """Compute and plot the label co-occurrence matrix for `labels`.

    Reads the indicator columns named in `labels` from the notebook-level
    `train_df`, multiplies the transposed indicator table by itself, draws the
    result as a seaborn heatmap on the current axes, and returns it.

    Args:
        labels: list of column names of `train_df` to include.

    Returns:
        DataFrame indexed and labelled by `labels`; entry (i, j) is the dot
        product of indicator columns i and j.
    """
    indicators = train_df[labels]
    cooccurrence = indicators.T @ indicators
    sns.heatmap(cooccurrence, cmap=sns.cm.rocket_r)
    return cooccurrence

In [7]:
# Compute and plot the co-occurrence matrix for all labels
make_cooccurence_matrix(label_list)

In [8]:
# Split the tags into three groups: weather, land use, and everything else ("rare")
weather_labels = ['clear', 'partly_cloudy', 'cloudy', 'haze']
land_labels = ['primary', 'water', 'habitation', 'agriculture', 'road', 'cultivation', 'bare_ground']
# Anything not named in the two lists above counts as rare
common_labels = set(weather_labels) | set(land_labels)
rare_labels = [tag for tag in label_list if tag not in common_labels]

In [9]:
# Compute and plot the co-occurrence matrix for the weather labels
make_cooccurence_matrix(weather_labels)

In [10]:
# Compute and plot the co-occurrence matrix for the land labels
make_cooccurence_matrix(land_labels)

In [11]:
# Compute and plot the co-occurrence matrix for the rare labels
make_cooccurence_matrix(rare_labels)

In [12]:
#Inspecting Images
# Append the '.jpg' extension to 'image_name' so the names can be used as file names.
# The endswith guard keeps the cell idempotent: re-running it no longer turns
# 'name.jpg' into 'name.jpg.jpg'.
train_df['image_name'] = train_df['image_name'].apply(
    lambda name: name if name.endswith('.jpg') else '{}.jpg'.format(name)
)
print(train_df)

In [13]:
import tarfile
def extract(tar_file, path):
    """Extract every member of the tar archive `tar_file` into `path`.

    Args:
        tar_file: path of the archive to extract.
        path: destination directory passed to `TarFile.extractall`.

    Returns:
        None. When `tar_file` is not a tar archive a message is printed and
        nothing is extracted.
    """
    # Validate before opening: tarfile.open raises ReadError on a non-archive,
    # so checking afterwards would never reach the message below.
    if not tarfile.is_tarfile(tar_file):
        print('The tar file you entered is not a tar file')
        return

    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tar_file) as opened_tar:
        # NOTE(review): extractall trusts member paths; only use on trusted archives.
        opened_tar.extractall(path)

In [14]:
# Load one sample image (train_9.jpg) and take its first three channels as r, g, b
image_number = 9
sample_path = '../input/train-jpg/train_{}.jpg'.format(image_number)
sample_img = io.imread(sample_path)
r, g, b = (sample_img[:, :, channel] for channel in range(3))
sample_img.shape

In [15]:
# Show the red, green and blue channels separately, side by side
fig = plt.figure()
fig.set_size_inches(12, 4)
channel_images = {'r': r, 'g': g, 'b': b}
for position, (channel, img) in enumerate(channel_images.items(), start=1):
    # NOTE(review): the grid is 1x4 although only three panels are drawn — confirm the empty slot is intended
    a = fig.add_subplot(1, 4, position)
    a.set_title(channel)
    plt.imshow(img)

In [16]:
# Display the sample image with all of its channels combined
plt.imshow(sample_img)

In [17]:
#Processing Images
# Every column after the first two is treated as a target column
y_col = train_df.columns[2:].tolist()

# Image generator with random augmentation: rotation, horizontal/vertical flips and zoom
image_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=45,
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.2,
)

# Stream the images listed in the dataframe in batches of 128, resized to 128x128,
# with the target columns passed through unchanged (class_mode='raw')
X = image_gen.flow_from_dataframe(
    dataframe=train_df,
    directory='../input/train-jpg',
    x_col='image_name',
    y_col=y_col,
    target_size=(128, 128),
    class_mode='raw',
    seed=1,
    batch_size=128,
)

In [18]:
# Inspect one arbitrary sample and summarise the batch layout.
# The first batch is fetched once and reused; presumably every X[i] access
# reloads (and re-augments) the whole batch, so repeated indexing is wasteful.
first_images, first_labels = X[0]
x109 = first_images[109]  # image at index 109 of the first batch
y109 = first_labels[109]  # label row at index 109 of the first batch
print("each image's shape is {}".format(x109.shape))
print("each label's shape is {}".format(y109.shape))
print('we have {} batches'.format(len(X)))
print('each batch has {} images/labels'.format(first_images.shape[0]))
# Report the real sample count and the real last batch instead of the
# hard-coded 901 images / batch index 7, which only fit a different dataset.
last_batch_size = X[len(X) - 1][0].shape[0]
print('{}/{} is {:.2F}, so the last batch will have {} images/labels'.format(
    X.samples, X.batch_size, X.samples / X.batch_size, last_batch_size))

In [19]:
# Scale by 1/255 before display — presumably pixel values are in 0..255 and imshow expects floats in [0, 1]; confirm
plt.imshow(x109/255)

In [20]:
#Importing useful deep learning libraries
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

In [21]:
#Defining a function to calculate fbeta score

def fbeta(ytrue, ypred, beta=2, threshold=0.2, epsilon=1e-7):
    """Per-row F-beta score for thresholded multi-label predictions.

    Args:
        ytrue: ground-truth labels; assumed to be 0/1 values shaped
            (batch, n_labels) — TODO confirm against the generator output.
        ypred: predicted scores, same shape as `ytrue`.
        beta: weight of recall relative to precision (2 gives the F2 score).
        threshold: scores strictly greater than this count as positive. The
            low default of 0.2 favours recall, which F2 rewards.
        epsilon: small constant added to every denominator so that rows with
            no predicted or no true positives give 0 instead of NaN.

    Returns:
        Float32 tensor with one F-beta value per row (reduction over axis 1).
    """
    beta_squared = float(beta) ** 2

    ytrue = tf.cast(ytrue, tf.float32)
    # Binarise the predictions: bool from the comparison, then back to float
    ypred = tf.cast(tf.greater(tf.cast(ypred, tf.float32), tf.constant(threshold)), tf.float32)

    # Encode each (true, pred) pair as 2*true + pred: 3 = TP, 1 = FP, 2 = FN
    coded = 2.0 * ytrue + ypred
    tp = tf.reduce_sum(tf.cast(tf.equal(coded, tf.constant(3.0)), tf.float32), axis=1)
    fp = tf.reduce_sum(tf.cast(tf.equal(coded, tf.constant(1.0)), tf.float32), axis=1)
    fn = tf.reduce_sum(tf.cast(tf.equal(coded, tf.constant(2.0)), tf.float32), axis=1)

    # Without epsilon here, tp + fp == 0 or tp + fn == 0 yields 0/0 = NaN,
    # which then poisons the batch mean that the callbacks monitor.
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    fb = (beta_squared + 1) * precision * recall / (precision * beta_squared + recall + epsilon)

    return fb

In [22]:
#Creating a function to calculate multi-label accuracy 

def multi_label_acc(ytrue, ypred, threshold=0.2, epsilon=1e-7):
    """Per-row accuracy over all labels for thresholded predictions.

    Args:
        ytrue: ground-truth labels; assumed to be 0/1 values shaped
            (batch, n_labels) — TODO confirm.
        ypred: predicted scores, same shape as `ytrue`.
        threshold: scores strictly greater than this count as positive.
        epsilon: small constant added to the denominator to avoid division by zero.

    Returns:
        Float32 tensor with one value per row: (TP + TN) / (TP + FP + FN + TN + epsilon).
    """
    truth = tf.cast(ytrue, tf.float32)
    predicted = tf.cast(
        tf.greater(tf.cast(ypred, tf.float32), tf.constant(threshold)),
        tf.float32,
    )

    # Encode each (true, pred) pair as 2*true + pred: 3 = TP, 1 = FP, 2 = FN, 0 = TN
    coded = 2.0 * truth + predicted

    def count(code):
        # Number of entries per row whose encoded value equals `code`
        return tf.reduce_sum(tf.cast(tf.equal(coded, tf.constant(code)), tf.float32), axis=1)

    tp, fp, fn, tn = count(3.0), count(1.0), count(2.0), count(0.0)

    return (tp + tn) / (tp + fp + fn + tn + epsilon)

In [23]:
# creating a function to build a sequential model

def build_model(num_classes=17, input_shape=(128, 128, 3), learning_rate=1e-4):
    """Build and compile the VGG19-based multi-label classifier.

    The network is: BatchNormalization on the raw input -> VGG19 convolutional
    base with ImageNet weights (no top) -> Flatten -> Dense sigmoid layer with
    one unit per label.

    Args:
        num_classes: number of output units (one per label).
        input_shape: shape of a single input image (height, width, channels).
        learning_rate: learning rate passed to the Adam optimizer.

    Returns:
        A compiled Sequential model using binary cross-entropy loss and the
        `multi_label_acc` and `fbeta` metrics.
    """
    base_model = VGG19(include_top=False, weights='imagenet', input_shape=input_shape)

    model = Sequential()
    model.add(BatchNormalization(input_shape=input_shape))
    model.add(base_model)
    model.add(Flatten())
    # Sigmoid (not softmax): each label is predicted independently
    model.add(Dense(num_classes, activation='sigmoid'))

    # `learning_rate` is the supported keyword; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    opt = Adam(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=[multi_label_acc, fbeta])

    return model

In [24]:
#Initializing callbacks
# All three callbacks monitor 'val_fbeta' and treat larger values as better (mode='max').
early_stopping = EarlyStopping(monitor='val_fbeta', patience=10, mode='max', verbose=1)
# The keyword is `cooldown` (no underscore); `cool_down` is not a
# ReduceLROnPlateau argument, so the intended 2-epoch cooldown was never applied.
reduced_lr = ReduceLROnPlateau(monitor='val_fbeta', patience=3, cooldown=2, mode='max')
# Keep only the weights of the best epoch seen so far
save_best_check_point = ModelCheckpoint(filepath='best_model.hdf5', monitor='val_fbeta',
                                        mode='max', save_best_only=True, save_weights_only=True)

In [25]:
# Two generators sharing the same 80:20 validation split.
# Only the training generator augments. Validation images are left untouched so
# that 'val_fbeta' (used by the callbacks) is not perturbed by random
# rotations and flips.
train_image_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=180, horizontal_flip=True, vertical_flip=True, validation_split=0.2)
# NOTE(review): this relies on the training/validation subsets being chosen by
# row position in the dataframe, so that two generators with the same
# validation_split partition it identically — confirm for the installed Keras version.
val_image_gen = tf.keras.preprocessing.image.ImageDataGenerator(validation_split=0.2)

# generating the 80% training image data
train_gen = train_image_gen.flow_from_dataframe(
    dataframe=train_df, directory='../input/train-jpg', x_col='image_name', y_col=y_col,
    target_size=(128, 128), class_mode='raw', seed=0, batch_size=128, subset='training')

# generating the 20% validation image data (no augmentation)
val_gen = val_image_gen.flow_from_dataframe(
    dataframe=train_df, directory='../input/train-jpg', x_col='image_name', y_col=y_col,
    target_size=(128, 128), class_mode='raw', seed=0, batch_size=128, subset='validation')

In [26]:
# Number of batches needed to cover each subset once (the last batch may be partial).
# Each count uses its own generator's batch size, so the two stay correct even
# if the training and validation batch sizes are changed independently.
step_train_size = int(np.ceil(train_gen.samples / train_gen.batch_size))
step_val_size = int(np.ceil(val_gen.samples / val_gen.batch_size))

In [27]:
import warnings
# Suppress every warning from here on, including deprecation notices raised by the libraries used below
warnings.filterwarnings('ignore')

In [28]:
# Run a garbage-collection pass before the model is built and trained
gc.collect()

In [29]:
train_model = build_model()

# Train for up to 30 epochs on the training generator, validating on the
# validation generator after each epoch; the callbacks handle early stopping,
# learning-rate reduction and checkpointing of the best weights.
fit_callbacks = [early_stopping, reduced_lr, save_best_check_point]
train_model.fit(
    x=train_gen,
    steps_per_epoch=step_train_size,
    validation_data=val_gen,
    validation_steps=step_val_size,
    epochs=30,
    callbacks=fit_callbacks,
)

In [32]:
#Saving model to bin 
import pickle

In [36]:
# Print the trained model object
print(train_model)

In [45]:
from keras.models import load_model

train_model.save('my_model.h5')  # writes the trained model to the HDF5 file 'my_model.h5'

In [47]:
import h5py
filename = "my_model.h5"

# Re-open the saved model file read-only to inspect its contents
with h5py.File(filename, "r") as f:
    # List all top-level groups
    print("Keys: %s" % f.keys())
    # Take the first top-level key
    a_group_key = list(f.keys())[0]

    # Materialise the entries found under that key as a list.
    # NOTE(review): presumably these are the names of the group's members, not numeric data or predictions — confirm
    data = list(f[a_group_key])

In [48]:
# Display the list read from the HDF5 file
data

In [51]:
# Check the container type of `data`
type(data)

In [53]:
# Wrap the list read from the HDF5 file in a DataFrame.
# NOTE(review): `data` comes from the saved model file, not from model predictions — confirm this is the intended content
df=pd.DataFrame(data) 
print(df) 

In [55]:
# Write the DataFrame to CSV without the index column.
# NOTE(review): `df` is built from the contents of the saved HDF5 model file; no predictions are computed anywhere in this notebook, so verify this file is a valid submission
df.to_csv('first_submission.csv', index=False)