# Classification of Cassava Leaves

## 1. Creating Input Data

### Imports

In [16]:
import os  # for path operations
import pickle  # for saving and loading created data

import cv2  # for reading images
import pandas as pd  # for reading and using image and label data
import numpy as np  # for the matrix operations
from sklearn.model_selection import train_test_split  # for splitting data to train and test sets
from tensorflow.keras.layers.experimental import preprocessing  # for augmentation operations of images
from tensorflow.keras.models import Sequential  # for augmentation layer

### Definition of augmentation parameters

In [17]:
# --- split / sampling settings ---
test_split = 0.2  # fraction of the selected samples held out as the test set
sample_count = 4000  # number of samples that will be included for each class

# --- output image geometry: 200x200 pixels, 3 color channels ---
IMG_W = 200
IMG_H = 200
CHAN = 3

# --- paths: both must be edited before running ---
# Directory that must contain train.csv and the train_images directory,
# e.g. "../input/cassava-leaf-disease-classification" when running on kaggle.
input_data_path = ""
# Directory the generated data is written to; it will be used for the training phase.
output_data_path = ""

### Creation of augmentation model

In [18]:
# Pipeline for the samples flagged for augmentation:
# resize to IMG_H x IMG_W, apply a random rotation, then scale pixel values by 1/255.
aug_rotate = Sequential()
aug_rotate.add(preprocessing.Resizing(height=IMG_H, width=IMG_W))
aug_rotate.add(preprocessing.RandomRotation(factor=1.0))
aug_rotate.add(preprocessing.Rescaling(scale=1./255))

# Pipeline for original images: only the resize and the 1/255 scaling, no rotation.
aug_norm = Sequential()
aug_norm.add(preprocessing.Resizing(height=IMG_H, width=IMG_W))
aug_norm.add(preprocessing.Rescaling(scale=1./255))

### Reading the label and image data

In [19]:
df = pd.read_csv(os.path.join(input_data_path, "train.csv"))  # read raw data
counts = [np.count_nonzero(df['label'] == i) for i in range(5)]  # number of rows per label 0..4
print(f"original label distribution: {counts}\n")

# Build a class-balanced selection with exactly `sample_count` rows per label.
# The pieces are collected in a list and concatenated once: this avoids calling
# a method on an uninitialised `None` when the first class already has enough
# rows, and it does not rely on DataFrame.append (removed in pandas 2.0).
parts = []
for i in range(5):
    class_rows = df[df['label'] == i]
    if sample_count <= counts[i]:
        # enough originals: draw `sample_count` distinct rows
        parts.append(class_rows.sample(n=sample_count))
        continue
    # too few originals: keep all of them and top up by re-drawing rows with
    # replacement; the re-drawn rows are the ones that get augmented later
    parts.append(class_rows)
    parts.append(class_rows.sample(n=sample_count - counts[i], replace=True))

sample = pd.concat(parts)  # original index labels are kept, as before

# A row that repeats an earlier (image_id, label) pair is a re-drawn copy, so
# it is flagged for augmentation; the first occurrence stays False.
sample['augment'] = sample.duplicated()
print(sample)

original label distribution: [1087, 2189, 2386, 13158, 2577]

             image_id  label  augment
0      1000015157.jpg      0    False
7      1001320321.jpg      0    False
18     1003888281.jpg      0    False
73     1012426959.jpg      0    False
109    1018973237.jpg      0    False
...               ...    ...      ...
12504  3241388791.jpg      4     True
6677   2195289519.jpg      4     True
4439   1792460758.jpg      4     True
16239  3924666081.jpg      4     True
1973   1347999958.jpg      4     True

[20000 rows x 3 columns]


### Splitting dataframe to train and test entries

In [20]:
# Hold out `test_split` of the balanced sample for testing.
# - stratify keeps the per-label proportions identical in both subsets
# - the fixed random_state makes the split repeatable for a given `sample`
# NOTE(review): `sample` contains repeated rows of the same image (those flagged
# in the 'augment' column), and the split is done per row, so copies of one
# image can land in both subsets — confirm this is acceptable for evaluation.
df_train, df_test = train_test_split(
    sample,
    test_size=test_split,
    stratify=sample['label'],
    random_state=42,
)
print(df_train, df_test, sep='\n')

             image_id  label  augment
18890    52883488.jpg      1     True
21098   939773003.jpg      4    False
2570   1447330096.jpg      2    False
1634   1286409959.jpg      1    False
3249   1563395006.jpg      2    False
...               ...    ...      ...
13046  3337851974.jpg      1     True
19629   664996949.jpg      3    False
3243    156209433.jpg      1    False
3664   1649683387.jpg      1     True
7794   2386015135.jpg      2    False

[16000 rows x 3 columns]
             image_id  label  augment
15043  3702249794.jpg      1     True
7570    234859800.jpg      3    False
5595   1994597996.jpg      1    False
11400  3035716224.jpg      4    False
2785   1484094890.jpg      0    False
...               ...    ...      ...
19457   632445353.jpg      3    False
4091    173019618.jpg      1    False
3126   1541989559.jpg      0     True
12818  3295550498.jpg      1     True
20443   813217011.jpg      4    False

[4000 rows x 3 columns]


### Creating training data

In [22]:
train_output = np.zeros([len(df_train), IMG_H, IMG_W, CHAN], dtype=np.float32)  # image data container
train_labels = np.zeros([len(df_train)], dtype=np.uint8)  # label data container

# Columns are selected by name so the loop does not depend on their order in the dataframe.
train_rows = df_train[['image_id', 'label', 'augment']].itertuples(index=False, name=None)

for idx, (image_id, label, aug) in enumerate(train_rows):  # iteration over training dataframe
    train_labels[idx] = label  # save label

    image_path = os.path.join(input_data_path, "train_images", image_id)
    # NOTE(review): cv2.imread yields channels in BGR order and no conversion is
    # done here, so the stored arrays are BGR — confirm the consumer expects that.
    im = cv2.imread(image_path)
    if im is None:
        # cv2.imread signals a missing/unreadable file by returning None, not by raising
        raise OSError(f"cv2.imread failed to load image: {image_path}")

    batch = im[np.newaxis, ...]  # the Keras pipelines expect a leading batch axis
    if aug:
        # Repeated sample: resize, rotate randomly, rescale. `training=True` is
        # passed explicitly because whether random layers are active when the
        # argument is omitted depends on the Keras version.
        train_output[idx] = aug_rotate(batch, training=True)[0]
    else:
        # Original image: resizing and rescaling are enough.
        train_output[idx] = aug_norm(batch)[0]

    if (idx + 1) % 500 == 0:
        print(f"processing...{idx + 1}/{len(df_train)}")  # number of images done so far

processing...499/16000
processing...999/16000
processing...1499/16000
processing...1999/16000
processing...2499/16000
processing...2999/16000
processing...3499/16000
processing...3999/16000
processing...4499/16000
processing...4999/16000
processing...5499/16000
processing...5999/16000
processing...6499/16000
processing...6999/16000
processing...7499/16000
processing...7999/16000
processing...8499/16000
processing...8999/16000
processing...9499/16000
processing...9999/16000
processing...10499/16000
processing...10999/16000
processing...11499/16000
processing...11999/16000
processing...12499/16000
processing...12999/16000
processing...13499/16000
processing...13999/16000
processing...14499/16000
processing...14999/16000
processing...15499/16000
processing...15999/16000


### Storing training data

In [30]:
# directory that receives the pickled training arrays
train_data_path = os.path.join(output_data_path, f"train_data/aug_{IMG_W}x{IMG_H}x{CHAN}")
os.makedirs(train_data_path, exist_ok=True)  # does nothing when the directory is already there

# image array -> train.pickle (written with pickle protocol 4)
train_image_file = os.path.join(train_data_path, "train.pickle")
with open(train_image_file, "wb") as f:
    pickle.dump(train_output, f, protocol=4)
    print(f"train image data is saved as: {train_image_file}")

# label array -> label.pickle (default pickle protocol)
train_label_file = os.path.join(train_data_path, "label.pickle")
with open(train_label_file, "wb") as f:
    pickle.dump(train_labels, f)
    print(f"train label data is saved as: {train_label_file}")

train image data is saved as: C:/users/omer/desktop/demo_data\train_data/aug_200x200x3\train.pickle
train label data is saved as: C:/users/omer/desktop/demo_data\train_data/aug_200x200x3\label.pickle


### Creating testing data

In [32]:
test_output = np.zeros([len(df_test), IMG_H, IMG_W, CHAN], dtype=np.float32)  # image data container
test_labels = np.zeros([len(df_test)], dtype=np.uint8)  # label data container

# Columns are selected by name so the loop does not depend on their order in the dataframe.
test_rows = df_test[['image_id', 'label', 'augment']].itertuples(index=False, name=None)

for idx, (image_id, label, aug) in enumerate(test_rows):  # iteration over test dataframe
    test_labels[idx] = label  # save label

    image_path = os.path.join(input_data_path, "train_images", image_id)
    # NOTE(review): cv2.imread yields channels in BGR order and no conversion is
    # done here, so the stored arrays are BGR — confirm the consumer expects that.
    im = cv2.imread(image_path)
    if im is None:
        # cv2.imread signals a missing/unreadable file by returning None, not by raising
        raise OSError(f"cv2.imread failed to load image: {image_path}")

    batch = im[np.newaxis, ...]  # the Keras pipelines expect a leading batch axis
    if aug:
        # Repeated sample: resize, rotate randomly, rescale. `training=True` is
        # passed explicitly because whether random layers are active when the
        # argument is omitted depends on the Keras version.
        test_output[idx] = aug_rotate(batch, training=True)[0]
    else:
        # Original image: resizing and rescaling are enough.
        test_output[idx] = aug_norm(batch)[0]

    if (idx + 1) % 500 == 0:
        print(f"processing...{idx + 1}/{len(df_test)}")  # number of images done so far

processing...499/4000
processing...999/4000
processing...1499/4000
processing...1999/4000
processing...2499/4000
processing...2999/4000
processing...3499/4000
processing...3999/4000


### Storing testing data

In [34]:
# directory that receives the pickled test arrays
test_data_path = os.path.join(output_data_path, f"test_data/aug_{IMG_W}x{IMG_H}x{CHAN}")
os.makedirs(test_data_path, exist_ok=True)  # does nothing when the directory is already there

# image array -> test.pickle (written with pickle protocol 4)
test_image_file = os.path.join(test_data_path, "test.pickle")
with open(test_image_file, "wb") as f:
    pickle.dump(test_output, f, protocol=4)
    print(f"test image data is saved as: {test_image_file}")

# label array -> label.pickle (default pickle protocol)
test_label_file = os.path.join(test_data_path, "label.pickle")
with open(test_label_file, "wb") as f:
    pickle.dump(test_labels, f)
    print(f"test label data is saved as: {test_label_file}")

test image data is saved as: C:/users/omer/desktop/demo_data\test_data/aug_200x200x3\test.pickle
test label data is saved as: C:/users/omer/desktop/demo_data\test_data/aug_200x200x3\label.pickle
