## Create the augmented datasets

In [1]:
import glob
import os
import random
import shutil
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

SEED = 1234
def set_seed(seed=SEED):
    np.random.seed(seed) 
    tf.set_random_seed(seed) 
    random.seed(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = "1"
    os.environ['TF_CUDNN_DETERMINISM'] = "1"
    os.environ['PYTHONHASHSEED'] = str(seed)
set_seed()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
final_dir = './diseases_final/'
metadata_file = 'metadata.csv'
metadata = pd.read_csv(os.path.join(final_dir, metadata_file))

classes = metadata.label.value_counts().to_dict()
print(classes)
metadata.head()

{'normal': 2405, 'blast': 2351, 'hispa': 2151, 'tungro': 1951, 'white_stem_borer': 1273, 'brown_spot': 1257, 'leaf_roller': 1095, 'downy_mildew': 868, 'yellow_stem_borer': 765, 'bacterial_leaf_blight': 648, 'black_stem_borer': 506, 'bacterial_leaf_streak': 505, 'bacterial_panicle_blight': 450}


Unnamed: 0,image_id,label,variety,age
0,PDD00001.jpg,bacterial_leaf_blight,45,65
1,PDD00002.jpg,bacterial_leaf_blight,45,60
2,PDD00003.jpg,bacterial_leaf_blight,45,55
3,PDD00004.jpg,bacterial_leaf_blight,KarnatakaPonni,50
4,PDD00005.jpg,bacterial_leaf_blight,45,72


## Image augmentation #1 : 5 times of original size

In [3]:
final_aug_dir = './diseases_final_augmented_5x/'
target_img_size = (256,256)

datagen = ImageDataGenerator(
    rotation_range = 5,
    shear_range = 0.2,
    zoom_range = 0.2,
    width_shift_range = 0.0,
    height_shift_range = 0.0,
    fill_mode = 'nearest',
    horizontal_flip = True,
    vertical_flip = False    
)

for cls_name, count in sorted(classes.items()):
    
    if cls_name != 'bacterial_leaf_blight':
        next
    
    to_dir = os.path.join(final_aug_dir, cls_name)
    if not os.path.exists(to_dir):
        os.makedirs(to_dir)
        
    image = datagen.flow_from_directory(
        final_dir,
        classes = [cls_name],
        target_size = target_img_size,
        save_to_dir = to_dir,
        save_prefix = '',
        save_format = 'jpg',
        seed = SEED,
        batch_size = count)
    
    total = 5
    for ix in range(total):
        print(datetime.now(), 'Augmenting', cls_name, ' #iteration = ', ix)
        im = image.next()
    print('')

Found 648 images belonging to 1 classes.
2022-11-27 03:00:01.336538 Augmenting bacterial_leaf_blight  #iteration =  0
2022-11-27 03:00:32.053819 Augmenting bacterial_leaf_blight  #iteration =  1
2022-11-27 03:00:59.993830 Augmenting bacterial_leaf_blight  #iteration =  2
2022-11-27 03:01:29.868847 Augmenting bacterial_leaf_blight  #iteration =  3
2022-11-27 03:01:56.399863 Augmenting bacterial_leaf_blight  #iteration =  4

Found 505 images belonging to 1 classes.
2022-11-27 03:02:24.339877 Augmenting bacterial_leaf_streak  #iteration =  0
2022-11-27 03:02:45.126884 Augmenting bacterial_leaf_streak  #iteration =  1
2022-11-27 03:03:07.571455 Augmenting bacterial_leaf_streak  #iteration =  2
2022-11-27 03:03:30.884462 Augmenting bacterial_leaf_streak  #iteration =  3
2022-11-27 03:03:50.683472 Augmenting bacterial_leaf_streak  #iteration =  4

Found 450 images belonging to 1 classes.
2022-11-27 03:04:10.695480 Augmenting bacterial_panicle_blight  #iteration =  0
2022-11-27 03:04:29.27209

#### Rename augmented files and create new meta data file

In [4]:
aug_meta = []
for cls_name, count in sorted(classes.items()):
    meta1 = metadata[metadata.label == cls_name]
    meta1['file_seq'] = range(meta1.shape[0])
    
    cls_dir = os.path.join(final_aug_dir, cls_name)    
    
    all_files = [Path(filename).name for filename in glob.glob(cls_dir + '/*.jpg')]
    
    files_df = pd.DataFrame({'filename': all_files})
    files_df['file_seq'] = files_df.filename.apply(lambda x: int(x.split('_')[1]))
    
    meta2 = pd.merge(meta1, files_df, on="file_seq")
    meta2['seq'] = meta2.groupby(['image_id']).cumcount()
    meta2['new_filename'] = meta2['image_id'].str[:8] + '_' + meta2['seq'].apply(lambda x: str(x+1).zfill(3)) + '.jpg'
    aug_meta.append(meta2)
    print(cls_dir, meta1.shape[0], len(all_files), meta2.shape[0])    
    
    ## rename files
    for index, row in meta2.iterrows():
        cls_dir = os.path.join(final_aug_dir, cls_name)
        from_path = os.path.join(cls_dir, row['filename'])
        to_path = os.path.join(cls_dir, row['new_filename'])
        os.rename(from_path, to_path)

aug_meta = pd.concat(aug_meta)
aug_meta['image_id'] = aug_meta['new_filename']
aug_meta = aug_meta[['image_id', 'label', 'variety', 'age']]
aug_meta.to_csv(os.path.join(final_aug_dir, 'metadata.csv'), index=False)
print(aug_meta.shape)
aug_meta

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


./diseases_final_augmented_5x/bacterial_leaf_blight 648 3240 3240
./diseases_final_augmented_5x/bacterial_leaf_streak 505 2525 2525
./diseases_final_augmented_5x/bacterial_panicle_blight 450 2250 2250
./diseases_final_augmented_5x/black_stem_borer 506 2530 2530
./diseases_final_augmented_5x/blast 2351 11755 11755
./diseases_final_augmented_5x/brown_spot 1257 6285 6285
./diseases_final_augmented_5x/downy_mildew 868 4340 4340
./diseases_final_augmented_5x/hispa 2151 10755 10755
./diseases_final_augmented_5x/leaf_roller 1095 5475 5475
./diseases_final_augmented_5x/normal 2405 12025 12025
./diseases_final_augmented_5x/tungro 1951 9755 9755
./diseases_final_augmented_5x/white_stem_borer 1273 6365 6365
./diseases_final_augmented_5x/yellow_stem_borer 765 3825 3825
(81125, 4)


Unnamed: 0,image_id,label,variety,age
0,PDD00001_001.jpg,bacterial_leaf_blight,45,65
1,PDD00001_002.jpg,bacterial_leaf_blight,45,65
2,PDD00001_003.jpg,bacterial_leaf_blight,45,65
3,PDD00001_004.jpg,bacterial_leaf_blight,45,65
4,PDD00001_005.jpg,bacterial_leaf_blight,45,65
...,...,...,...,...
3820,PDD16225_001.jpg,yellow_stem_borer,Zonal,70
3821,PDD16225_002.jpg,yellow_stem_borer,Zonal,70
3822,PDD16225_003.jpg,yellow_stem_borer,Zonal,70
3823,PDD16225_004.jpg,yellow_stem_borer,Zonal,70


### Image augmentation #2 : 2k samples for each class

In [5]:
final_dir = './diseases_final/'
final_aug_dir = './diseases_final_augmented_2k/'
target_img_size = (256,256)

datagen = ImageDataGenerator(
    rotation_range = 5,
    shear_range = 0.2,
    zoom_range = 0.2,
    width_shift_range = 0.0,
    height_shift_range = 0.0,
    fill_mode = 'nearest',
    horizontal_flip = True,
    vertical_flip = False    
)

for cls_name, count in sorted(classes.items()):
    
    print(datetime.now(), 'Augmenting', cls_name)    
    to_dir = os.path.join(final_aug_dir, cls_name)
    if not os.path.exists(to_dir):
        os.makedirs(to_dir)
        
    image = datagen.flow_from_directory(
        final_dir,
        classes = [cls_name],
        target_size = target_img_size,
        save_to_dir = to_dir,
        save_prefix = '',
        save_format = 'jpg',
        seed = SEED,
        batch_size = 1)
    
    total = 2000
    for ix in range(total):
        #print(datetime.now(), 'Augmenting for', cls_name, ' #iteration = ', ix)
        im = image.next()
    print('')

2022-11-27 15:11:00.517477 Augmenting bacterial_leaf_blight
Found 648 images belonging to 1 classes.

2022-11-27 15:12:36.004678 Augmenting bacterial_leaf_streak
Found 505 images belonging to 1 classes.

2022-11-27 15:14:14.137707 Augmenting bacterial_panicle_blight
Found 450 images belonging to 1 classes.

2022-11-27 15:15:29.656681 Augmenting black_stem_borer
Found 506 images belonging to 1 classes.

2022-11-27 15:16:48.500678 Augmenting blast
Found 2351 images belonging to 1 classes.

2022-11-27 15:18:08.185679 Augmenting brown_spot
Found 1257 images belonging to 1 classes.

2022-11-27 15:19:49.675691 Augmenting downy_mildew
Found 868 images belonging to 1 classes.

2022-11-27 15:21:09.927304 Augmenting hispa
Found 2151 images belonging to 1 classes.

2022-11-27 15:22:30.006299 Augmenting leaf_roller
Found 1095 images belonging to 1 classes.

2022-11-27 15:23:51.601334 Augmenting normal
Found 2405 images belonging to 1 classes.

2022-11-27 15:25:12.016797 Augmenting tungro
Found 195

#### Rename augmented files and create new meta data file

In [6]:
aug_meta = []
for cls_name, count in sorted(classes.items()):
    meta1 = metadata[metadata.label == cls_name]
    meta1['file_seq'] = range(meta1.shape[0])
    
    cls_dir = os.path.join(final_aug_dir, cls_name)    
    
    all_files = [Path(filename).name for filename in glob.glob(cls_dir + '/*.jpg')]
    
    files_df = pd.DataFrame({'filename': all_files})
    files_df['file_seq'] = files_df.filename.apply(lambda x: int(x.split('_')[1]))
    
    meta2 = pd.merge(meta1, files_df, on="file_seq")
    meta2['seq'] = meta2.groupby(['image_id']).cumcount()
    meta2['new_filename'] = meta2['image_id'].str[:8] + '_' + meta2['seq'].apply(lambda x: str(x+1).zfill(3)) + '.jpg'
    aug_meta.append(meta2)
    print(cls_dir, meta1.shape[0], len(all_files), meta2.shape[0])    
    
    ## rename files
    for index, row in meta2.iterrows():
        cls_dir = os.path.join(final_aug_dir, cls_name)
        from_path = os.path.join(cls_dir, row['filename'])
        to_path = os.path.join(cls_dir, row['new_filename'])
        os.rename(from_path, to_path)

aug_meta = pd.concat(aug_meta)
aug_meta['image_id'] = aug_meta['new_filename']
aug_meta = aug_meta[['image_id', 'label', 'variety', 'age']]
aug_meta.to_csv(os.path.join(final_aug_dir, 'metadata.csv'), index=False)
print(aug_meta.shape)
aug_meta

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


./diseases_final_augmented_2k/bacterial_leaf_blight 648 2000 2000
./diseases_final_augmented_2k/bacterial_leaf_streak 505 2000 2000
./diseases_final_augmented_2k/bacterial_panicle_blight 450 2000 2000
./diseases_final_augmented_2k/black_stem_borer 506 2000 2000
./diseases_final_augmented_2k/blast 2351 2000 2000
./diseases_final_augmented_2k/brown_spot 1257 2000 2000
./diseases_final_augmented_2k/downy_mildew 868 2000 2000
./diseases_final_augmented_2k/hispa 2151 2000 2000
./diseases_final_augmented_2k/leaf_roller 1095 2000 2000
./diseases_final_augmented_2k/normal 2405 2000 2000
./diseases_final_augmented_2k/tungro 1951 2000 2000
./diseases_final_augmented_2k/white_stem_borer 1273 2000 2000
./diseases_final_augmented_2k/yellow_stem_borer 765 2000 2000
(26000, 4)


Unnamed: 0,image_id,label,variety,age
0,PDD00001_001.jpg,bacterial_leaf_blight,45,65
1,PDD00001_002.jpg,bacterial_leaf_blight,45,65
2,PDD00001_003.jpg,bacterial_leaf_blight,45,65
3,PDD00002_001.jpg,bacterial_leaf_blight,45,60
4,PDD00002_002.jpg,bacterial_leaf_blight,45,60
...,...,...,...,...
1995,PDD16224_002.jpg,yellow_stem_borer,45,70
1996,PDD16224_003.jpg,yellow_stem_borer,45,70
1997,PDD16225_001.jpg,yellow_stem_borer,Zonal,70
1998,PDD16225_002.jpg,yellow_stem_borer,Zonal,70


### Image augmentation #3 : 5k samples for each class

In [7]:
final_aug_dir = './diseases_final_augmented_5k/'
target_img_size = (256,256)

datagen = ImageDataGenerator(
    rotation_range = 5,
    shear_range = 0.2,
    zoom_range = 0.2,
    width_shift_range = 0.0,
    height_shift_range = 0.0,
    fill_mode = 'nearest',
    horizontal_flip = True,
    vertical_flip = False    
)

for cls_name, count in sorted(classes.items()):
    
    print(datetime.now(), 'Augmenting', cls_name)    
#     if cls_name != 'bacterial_leaf_blight':
#         continue
    
    to_dir = os.path.join(final_aug_dir, cls_name)
    if not os.path.exists(to_dir):
        os.makedirs(to_dir)
        
    image = datagen.flow_from_directory(
        final_dir,
        classes = [cls_name],
        target_size = target_img_size,
        save_to_dir = to_dir,
        save_prefix = '',
        save_format = 'jpg',
        seed = SEED,
        batch_size = 1)
    
    total = 5000
    for ix in range(total):
        #print(datetime.now(), 'Augmenting for', cls_name, ' #iteration = ', ix)
        im = image.next()
    print('')

2022-11-27 15:29:48.746813 Augmenting bacterial_leaf_blight
Found 648 images belonging to 1 classes.

2022-11-27 15:33:12.235795 Augmenting bacterial_leaf_streak
Found 505 images belonging to 1 classes.

2022-11-27 15:36:32.103908 Augmenting bacterial_panicle_blight
Found 450 images belonging to 1 classes.

2022-11-27 15:39:43.159912 Augmenting black_stem_borer
Found 506 images belonging to 1 classes.

2022-11-27 15:43:05.513637 Augmenting blast
Found 2351 images belonging to 1 classes.

2022-11-27 15:46:22.712846 Augmenting brown_spot
Found 1257 images belonging to 1 classes.

2022-11-27 15:50:03.407407 Augmenting downy_mildew
Found 868 images belonging to 1 classes.

2022-11-27 15:53:28.381217 Augmenting hispa
Found 2151 images belonging to 1 classes.

2022-11-27 15:56:56.525827 Augmenting leaf_roller
Found 1095 images belonging to 1 classes.

2022-11-27 15:59:38.464021 Augmenting normal
Found 2405 images belonging to 1 classes.

2022-11-27 16:02:27.310104 Augmenting tungro
Found 195

#### Rename augmented files and create new meta data file

In [8]:
aug_meta = []
for cls_name, count in sorted(classes.items()):
    meta1 = metadata[metadata.label == cls_name]
    meta1['file_seq'] = range(meta1.shape[0])
    
    cls_dir = os.path.join(final_aug_dir, cls_name)    
    
    all_files = [Path(filename).name for filename in glob.glob(cls_dir + '/*.jpg')]
    
    files_df = pd.DataFrame({'filename': all_files})
    files_df['file_seq'] = files_df.filename.apply(lambda x: int(x.split('_')[1]))
    
    meta2 = pd.merge(meta1, files_df, on="file_seq")
    meta2['seq'] = meta2.groupby(['image_id']).cumcount()
    meta2['new_filename'] = meta2['image_id'].str[:8] + '_' + meta2['seq'].apply(lambda x: str(x+1).zfill(3)) + '.jpg'
    aug_meta.append(meta2)
    print(cls_dir, meta1.shape[0], len(all_files), meta2.shape[0])    
    
    ## rename files
    for index, row in meta2.iterrows():
        cls_dir = os.path.join(final_aug_dir, cls_name)
        from_path = os.path.join(cls_dir, row['filename'])
        to_path = os.path.join(cls_dir, row['new_filename'])
        os.rename(from_path, to_path)

aug_meta = pd.concat(aug_meta)
aug_meta['image_id'] = aug_meta['new_filename']
aug_meta = aug_meta[['image_id', 'label', 'variety', 'age']]
aug_meta.to_csv(os.path.join(final_aug_dir, 'metadata.csv'), index=False)
print(aug_meta.shape)
aug_meta

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


./diseases_final_augmented_5k/bacterial_leaf_blight 648 5000 5000
./diseases_final_augmented_5k/bacterial_leaf_streak 505 5000 5000
./diseases_final_augmented_5k/bacterial_panicle_blight 450 5000 5000
./diseases_final_augmented_5k/black_stem_borer 506 5000 5000
./diseases_final_augmented_5k/blast 2351 5000 5000
./diseases_final_augmented_5k/brown_spot 1257 5000 5000
./diseases_final_augmented_5k/downy_mildew 868 5000 5000
./diseases_final_augmented_5k/hispa 2151 5000 5000
./diseases_final_augmented_5k/leaf_roller 1095 5000 5000
./diseases_final_augmented_5k/normal 2405 5000 5000
./diseases_final_augmented_5k/tungro 1951 5000 5000
./diseases_final_augmented_5k/white_stem_borer 1273 5000 5000
./diseases_final_augmented_5k/yellow_stem_borer 765 5000 5000
(65000, 4)


Unnamed: 0,image_id,label,variety,age
0,PDD00001_001.jpg,bacterial_leaf_blight,45,65
1,PDD00001_002.jpg,bacterial_leaf_blight,45,65
2,PDD00001_003.jpg,bacterial_leaf_blight,45,65
3,PDD00001_004.jpg,bacterial_leaf_blight,45,65
4,PDD00001_005.jpg,bacterial_leaf_blight,45,65
...,...,...,...,...
4995,PDD16225_003.jpg,yellow_stem_borer,Zonal,70
4996,PDD16225_004.jpg,yellow_stem_borer,Zonal,70
4997,PDD16225_005.jpg,yellow_stem_borer,Zonal,70
4998,PDD16225_006.jpg,yellow_stem_borer,Zonal,70


In [9]:
# aug_meta = pd.concat(aug_meta)
# aug_meta.to_csv(os.path.join(final_aug_dir, 'metadata.csv'), index=False)

In [10]:
# meta2 = pd.merge(meta1, df1, on="file_seq")
# meta2['seq'] = meta2.groupby(['image_id']).cumcount()
# meta2['new_filename'] = meta2['image_id'].str[:8] + '_' + meta2['seq'].apply(lambda x: str(x+1).zfill(3)) + '.jpg'
# meta2.to_csv('test.csv')
# meta2