In [1]:
%load_ext tensorboard

In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from pathlib import Path
import os
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense, GlobalAveragePooling2D, Input
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.applications import efficientnet_v2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import datetime

In [3]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [4]:
tf.__version__

'2.8.0'

In [5]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [6]:
IMG_SIZE = 256
BATCH_SIZE = 32
EPOCHS = 30
N_SPLITS = 5
BASE_PATH = Path('/Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/')
SAVED_DIR = BASE_PATH/'kFold_models'
LOG_DIR = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
DATA_PATH  = Path('/Users/rudra/Tech/ML_Work/Datasets/archive')

In [7]:
!rm -rf ./logs/

In [8]:
DATA_PATH

PosixPath('/Users/rudra/Tech/ML_Work/Datasets/archive')

In [9]:
train_df = pd.read_csv(DATA_PATH/'train_cultivar_mapping.csv')

In [10]:
train_df.head()

Unnamed: 0,image,cultivar,file_path,is_exist
0,2017-06-16__12-24-20-930.jpeg,PI_257599,../input/sorghum-id-fgvc-9/train_images/2017-0...,True
1,2017-06-02__16-48-57-866.jpeg,PI_154987,../input/sorghum-id-fgvc-9/train_images/2017-0...,True
2,2017-06-12__13-18-07-707.jpeg,PI_92270,../input/sorghum-id-fgvc-9/train_images/2017-0...,True
3,2017-06-22__13-18-06-841.jpeg,PI_152651,../input/sorghum-id-fgvc-9/train_images/2017-0...,True
4,2017-06-26__12-56-48-642.jpeg,PI_176766,../input/sorghum-id-fgvc-9/train_images/2017-0...,True


In [11]:
train_df.is_exist.unique()

array([ True])

In [12]:
le = LabelEncoder()
le.fit(train_df.cultivar)
train_df['target'] = le.transform(train_df.cultivar)

In [13]:
train_df.head()

Unnamed: 0,image,cultivar,file_path,is_exist,target
0,2017-06-16__12-24-20-930.jpeg,PI_257599,../input/sorghum-id-fgvc-9/train_images/2017-0...,True,73
1,2017-06-02__16-48-57-866.jpeg,PI_154987,../input/sorghum-id-fgvc-9/train_images/2017-0...,True,29
2,2017-06-12__13-18-07-707.jpeg,PI_92270,../input/sorghum-id-fgvc-9/train_images/2017-0...,True,99
3,2017-06-22__13-18-06-841.jpeg,PI_152651,../input/sorghum-id-fgvc-9/train_images/2017-0...,True,6
4,2017-06-26__12-56-48-642.jpeg,PI_176766,../input/sorghum-id-fgvc-9/train_images/2017-0...,True,50


In [14]:
test_filenames = os.listdir(DATA_PATH/'test')
test_df = pd.DataFrame({'filename' : test_filenames})

In [15]:
def create_train_datagen():
    train_datagen = ImageDataGenerator(
        rotation_range=30,
        width_shift_range=0.2,
        height_shift_range=0.2,
        zoom_range=0.15,
        horizontal_flip=True,
        fill_mode='nearest'
    )
    return train_datagen

In [16]:
def get_data(datagen, df, dset):
    if dset == 'train':
        train_generator = datagen.flow_from_dataframe(
            dataframe=df,
            directory=DATA_PATH/'train',
            x_col='image',
            y_col='cultivar',
            class_mode='sparse',
            target_size=(IMG_SIZE, IMG_SIZE),
            batch_size=BATCH_SIZE,
            shuffle=True
        )
        
        return train_generator

    elif dset == 'val':
        valid_generator = datagen.flow_from_dataframe(
            dataframe=df,
            directory=DATA_PATH/'train',
            x_col='image',
            y_col='cultivar',
            class_mode='sparse',
            target_size=(IMG_SIZE, IMG_SIZE),
            batch_size=BATCH_SIZE,
            shuffle=True
        )

        return valid_generator

    elif dset == 'test':
        test_generator = datagen.flow_from_dataframe(
            dataframe = df,
            directory = DATA_PATH/'test',
            x_col = 'filename',
            y_col = None,
            class_mode = None,
            target_size = (IMG_SIZE, IMG_SIZE),
            batch_size = BATCH_SIZE,
            shuffle = False
        )
        return test_generator

    else:
        print("Please pass the correct value for dset. It should be one of 'train', 'val' or 'test'.")
        return -1

In [17]:
def createCallbacks(i):
    early_stopping_callback = EarlyStopping(monitor='val_loss', verbose=1, patience=3)

    checkpoint_callback = ModelCheckpoint(filepath=f'{SAVED_DIR}/sorghum_keras_model_fold_{i}.h5', 
                                            monitor='val_accuracy', verbose=1, save_freq='epoch', 
                                                        save_best_only=True, save_weights_only=True, period=1)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, mode=min, patience=3, min_lr=1e-5)

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)

    return [early_stopping_callback, checkpoint_callback, reduce_lr, tensorboard_callback]

In [18]:
def create_model():
    base_model = efficientnet_v2.EfficientNetV2B3(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))
    base_model.trainable = True
    inputs = Input(shape=(IMG_SIZE, IMG_SIZE, 3))
    output = base_model(inputs)
    output = GlobalAveragePooling2D()(output)
    output = Dense(100, activation='softmax')(output)

    model = keras.Model(inputs, output)
    return model

In [19]:
pred_idx_lst = []
skf = StratifiedKFold(n_splits=N_SPLITS)

for fold, (train_index, val_index) in enumerate(skf.split(X=train_df.image, y=train_df.cultivar)):
    training_data = train_df.iloc[train_index]
    val_data  = train_df.iloc[val_index]
    train_datagen = create_train_datagen()
    train_generator = get_data(train_datagen, training_data, 'train')
    valid_generator = get_data(train_datagen, val_data, 'val')

    model = create_model()
    model.summary()
    model.compile(loss='sparse_categorical_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy'])
    history = model.fit(train_generator, validation_data=valid_generator, epochs=EPOCHS, callbacks=createCallbacks(fold))
    model.save(f'Sorghum_keras_best_model_fold_{fold}.h5')

    test_datagen = ImageDataGenerator()
    test_generator = get_data(test_datagen, test_df, 'test')
    preds = model.predict(test_generator)
    pred_idx = tf.math.argmax(preds, axis=1)
    pred_idx_lst.append(pred_idx)

Found 17754 validated image filenames belonging to 100 classes.
Found 4439 validated image filenames belonging to 100 classes.
Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2022-05-04 08:58:37.866861: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-05-04 08:58:37.866953: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 256, 256, 3)]     0         
                                                                 
 efficientnetv2-b3 (Function  (None, 8, 8, 1536)       12930622  
 al)                                                             
                                                                 
 global_average_pooling2d (G  (None, 1536)             0         
 lobalAveragePooling2D)                                          
                                                                 
 dense (Dense)               (None, 100)               153700    
                                                                 
Total params: 13,084,322
Trainable params: 12,975,106
Non-trainable params: 109,216
_________________________________________________________________
Epoch 1/30


2022-05-04 08:58:40.695250: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-05-04 08:58:46.076476: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-05-04 09:31:49.739287: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.



Epoch 1: val_accuracy improved from -inf to 0.56319, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_0.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.56319 to 0.75918, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_0.h5
Epoch 3/30
Epoch 3: val_accuracy improved from 0.75918 to 0.81730, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_0.h5
Epoch 4/30
Epoch 4: val_accuracy did not improve from 0.81730
Epoch 5/30
Epoch 5: val_accuracy improved from 0.81730 to 0.84524, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_0.h5
Epoch 6/30
Epoch 6: val_accuracy improved from 0.84524 to 0.86641, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_0.h5
Epoch 7/30
Epoch 7: val_accuracy improved from 0.86641 t

2022-05-04 17:57:07.794925: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Found 17754 validated image filenames belonging to 100 classes.
Found 4439 validated image filenames belonging to 100 classes.
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 256, 256, 3)]     0         
                                                                 
 efficientnetv2-b3 (Function  (None, 8, 8, 1536)       12930622  
 al)                                                             
                                                                 
 global_average_pooling2d_1   (None, 1536)             0         
 (GlobalAveragePooling2D)                                        
                                                                 
 dense_1 (Dense)             (None, 100)               153700    
                                                                 
Total params: 13,084,322
Trainable params: 12,975,106
Non-traina

2022-05-04 18:07:49.920706: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-05-04 18:43:23.335618: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.



Epoch 1: val_accuracy improved from -inf to 0.58054, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_1.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.58054 to 0.73215, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_1.h5
Epoch 3/30
Epoch 3: val_accuracy improved from 0.73215 to 0.82451, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_1.h5
Epoch 4/30
Epoch 4: val_accuracy did not improve from 0.82451
Epoch 5/30
Epoch 5: val_accuracy improved from 0.82451 to 0.86213, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_1.h5
Epoch 6/30
Epoch 6: val_accuracy improved from 0.86213 to 0.88466, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_1.h5
Epoch 7/30
Epoch 7: val_accuracy did not improve from 0.

2022-05-05 03:27:58.291036: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Found 17754 validated image filenames belonging to 100 classes.
Found 4439 validated image filenames belonging to 100 classes.
Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 256, 256, 3)]     0         
                                                                 
 efficientnetv2-b3 (Function  (None, 8, 8, 1536)       12930622  
 al)                                                             
                                                                 
 global_average_pooling2d_2   (None, 1536)             0         
 (GlobalAveragePooling2D)                                        
                                                                 
 dense_2 (Dense)             (None, 100)               153700    
                                                                 
Total params: 13,084,322
Trainable params: 12,975,106
Non-traina

2022-05-05 03:37:56.054966: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-05-05 04:07:08.535836: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.



Epoch 1: val_accuracy improved from -inf to 0.57017, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_2.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.57017 to 0.73192, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_2.h5
Epoch 3/30
Epoch 3: val_accuracy improved from 0.73192 to 0.82136, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_2.h5
Epoch 4/30
Epoch 4: val_accuracy improved from 0.82136 to 0.83848, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_2.h5
Epoch 5/30
Epoch 5: val_accuracy did not improve from 0.83848
Epoch 6/30
Epoch 6: val_accuracy improved from 0.83848 to 0.86664, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_2.h5
Epoch 7/30
Epoch 7: val_accuracy improved from 0.86664 t

2022-05-05 19:58:04.931541: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Found 17755 validated image filenames belonging to 100 classes.
Found 4438 validated image filenames belonging to 100 classes.
Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, 256, 256, 3)]     0         
                                                                 
 efficientnetv2-b3 (Function  (None, 8, 8, 1536)       12930622  
 al)                                                             
                                                                 
 global_average_pooling2d_3   (None, 1536)             0         
 (GlobalAveragePooling2D)                                        
                                                                 
 dense_3 (Dense)             (None, 100)               153700    
                                                                 
Total params: 13,084,322
Trainable params: 12,975,106
Non-traina

2022-05-05 20:06:22.846609: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-05-05 20:50:34.676983: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.



Epoch 1: val_accuracy improved from -inf to 0.59712, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_3.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.59712 to 0.71631, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_3.h5
Epoch 3/30
Epoch 3: val_accuracy improved from 0.71631 to 0.82717, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_3.h5
Epoch 4/30
Epoch 4: val_accuracy did not improve from 0.82717
Epoch 5/30
Epoch 5: val_accuracy improved from 0.82717 to 0.85579, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_3.h5
Epoch 6/30
Epoch 6: val_accuracy improved from 0.85579 to 0.87562, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_3.h5
Epoch 7/30
Epoch 7: val_accuracy improved from 0.87562 t

2022-05-06 06:26:53.997592: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Found 17755 validated image filenames belonging to 100 classes.
Found 4438 validated image filenames belonging to 100 classes.
Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 256, 256, 3)]     0         
                                                                 
 efficientnetv2-b3 (Function  (None, 8, 8, 1536)       12930622  
 al)                                                             
                                                                 
 global_average_pooling2d_4   (None, 1536)             0         
 (GlobalAveragePooling2D)                                        
                                                                 
 dense_4 (Dense)             (None, 100)               153700    
                                                                 
Total params: 13,084,322
Trainable params: 12,975,106
Non-traina

2022-05-06 06:36:57.881591: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-05-06 07:09:46.924969: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.



Epoch 1: val_accuracy improved from -inf to 0.56422, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_4.h5
Epoch 2/30
Epoch 2: val_accuracy improved from 0.56422 to 0.69198, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_4.h5
Epoch 3/30
Epoch 3: val_accuracy improved from 0.69198 to 0.79878, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_4.h5
Epoch 4/30
Epoch 4: val_accuracy improved from 0.79878 to 0.85940, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_4.h5
Epoch 5/30
Epoch 5: val_accuracy did not improve from 0.85940
Epoch 6/30
Epoch 6: val_accuracy improved from 0.85940 to 0.87472, saving model to /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_models/sorghum_keras_model_fold_4.h5
Epoch 7/30
Epoch 7: val_accuracy improved from 0.87472 t

2022-05-06 20:23:15.498565: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [20]:
%tensorboard --logdir logs/fit

In [30]:
predictions = tf.reduce_mean(tf.stack(pred_idx_lst), axis=0)
test_df['cultivar'] = le.inverse_transform(list(predictions))
test_df.filename = test_df.filename.map(lambda x: x.split('.jpeg')[0] + '.png')
test_df.to_csv(f'{BASE_PATH}/kFold_submission.csv', index=False)

In [31]:
predictions.shape

TensorShape([23639])

In [34]:
test_df.head()

Unnamed: 0,filename,cultivar
0,1838798748.png,PI_181083
1,42096263.png,PI_156463
2,316490365.png,PI_329299
3,1091940264.png,PI_218112
4,470001726.png,PI_329300


In [24]:
!kaggle competitions submit -c sorghum-id-fgvc-9 -f /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/kFold_submission.csv -m "K Fold Cross Validation - Keras + EfficientNetV2"

100%|████████████████████████████████████████| 565k/565k [00:05<00:00, 97.7kB/s]
Successfully submitted to Sorghum -100 Cultivar Identification - FGVC 9

Inference using the best model

Since Fold 2 model has best training as well validation accuracies, we will use it to make predictions here.

In [39]:
best_model = keras.models.load_model('/Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/sorghum_keras_best_model_fold_2.h5')
best_preds = best_model.predict(test_generator)
best_pred_idx = tf.math.argmax(best_preds, axis=1)

test_df['cultivar'] = le.inverse_transform(list(best_pred_idx))

2022-05-06 21:34:12.467380: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [42]:
test_df.filename = test_df.filename.map(lambda x: x.split('.')[0] + '.png')
test_df.to_csv(f'{BASE_PATH}/best_model_submission.csv', index=False)

In [43]:
!kaggle competitions submit -c sorghum-id-fgvc-9 -f /Users/rudra/Tech/ML_Work/Sorghum_Kaggle_Competition/best_model_submission.csv -m "Inference using Best Model: K Fold Cross Validation - Keras + EfficientNetV2"

100%|█████████████████████████████████████████| 564k/564k [00:05<00:00, 104kB/s]
Successfully submitted to Sorghum -100 Cultivar Identification - FGVC 9

In [51]:
best_pred_idx.shape, len(pred_idx_lst), pred_idx_lst[2].shape

(TensorShape([23639]), 5, TensorShape([23639]))

In [55]:
best_pred_idx == pred_idx_lst[2]

<tf.Tensor: shape=(23639,), dtype=bool, numpy=array([ True,  True,  True, ...,  True,  True,  True])>