### **Library Imports**

In [1]:
import os
import cv2
import random as r
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from time import time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

### **Helpers**

In [2]:
def breaker(num: int = 50, char: str = "*") -> None:
    """Print a visual separator: `char` repeated `num` times, framed by blank lines.

    Args:
        num: How many times `char` is repeated.
        char: String that is repeated to build the separator.
    """
    separator = char * num
    # A leading newline before the separator and a blank line after it.
    print("\n" + separator, end="\n\n")


def get_image(path: str, size: int=224) -> np.ndarray:
    """Read an image file and return it as an RGB array resized to a square.

    Args:
        path: Path of the image file to read.
        size: Side length, in pixels, of the square output image.

    Returns:
        The image read in colour mode, converted from BGR to RGB channel
        order and resized to ``size`` x ``size`` using area interpolation.

    Raises:
        FileNotFoundError: If OpenCV could not read an image from ``path``.
    """
    bgr = cv2.imread(path, cv2.IMREAD_COLOR)
    # cv2.imread reports failure by returning None rather than raising; fail
    # here with the offending path instead of an opaque cvtColor assertion.
    if bgr is None:
        raise FileNotFoundError(f"Could not read image: {path}")

    rgb = cv2.cvtColor(src=bgr, code=cv2.COLOR_BGR2RGB)
    return cv2.resize(src=rgb, dsize=(size, size), interpolation=cv2.INTER_AREA)


def get_images(path: str, label_names: np.ndarray, filenames: np.ndarray, size: int) -> np.ndarray:
    """Load every listed image into one uint8 array.

    Each image is read from ``path/<label_names[i]>/<filenames[i]>`` via
    ``get_image`` and stored at position ``i`` of the result.

    Args:
        path: Base directory that contains one sub-directory per label name.
        label_names: Label name for each file, aligned with ``filenames``.
        filenames: Image file names to load.
        size: Side length, in pixels, each image is resized to.

    Returns:
        Array of shape ``(len(filenames), size, size, 3)`` and dtype uint8.
    """
    batch = np.zeros((len(filenames), size, size, 3), dtype=np.uint8)

    for idx, filename in enumerate(filenames):
        image_path = os.path.join(path, label_names[idx], filename)
        batch[idx] = get_image(image_path, size)

    return batch


def get_statistics(path: str, label_names: np.ndarray, filenames: np.ndarray, size: int=224) -> tuple:
    """Accumulate per-channel mean and standard deviation over a set of images.

    Each image is read from ``path/<label_names[i]>/<filenames[i]>`` via
    ``get_image``. For every image the mean and standard deviation of each of
    its three channels are computed and added to running totals.

    Note that the returned values are SUMS of the per-image statistics, not
    averages: divide by the number of images to obtain the average per-image
    mean / standard deviation.

    Args:
        path: Base directory that contains one sub-directory per label name.
        label_names: Label name for each file, aligned with ``filenames``.
        filenames: Image file names to process.
        size: Side length, in pixels, each image is resized to before measuring.

    Returns:
        ``((r_mean, g_mean, b_mean), (r_std, g_std, b_std))`` where every
        entry is the sum over all images of that per-image statistic.
    """
    mean_sums = [0.0, 0.0, 0.0]
    std_sums = [0.0, 0.0, 0.0]

    for idx, filename in enumerate(filenames):
        image = get_image(os.path.join(path, label_names[idx], filename), size)

        # Channel order follows get_image's output: 0 = red, 1 = green, 2 = blue.
        for channel in range(3):
            plane = image[:, :, channel]
            mean_sums[channel] += plane.mean()
            std_sums[channel] += plane.std()

    return tuple(mean_sums), tuple(std_sums)


# Root directory of the training images; the loaders join it with a label
# name and a file name to locate each image.
TRAIN_IMAGE_BASE_PATH = "../input/paddy-disease-classification/train_images"
# Side length, in pixels, that images are resized to when loaded.
SIZE = 512
# Encoder used to map the string label names to integer labels.
le = LabelEncoder()

In [3]:
class CFG:
    """Settings for the cross-validation split.

    Attributes:
        seed: Random seed for the fold shuffling.
        n_splits: Number of folds.
    """

    def __init__(self, seed: int = 42, n_splits: int = 5) -> None:
        self.seed, self.n_splits = seed, n_splits


# Shared configuration instance with the default settings.
cfg = CFG()

### **Get Images (Fold 1)**

In [4]:
# Training metadata: one row per image with its file name and label name.
df = pd.read_csv("../input/paddy-disease-classification/train.csv")

# Pull the two columns out as standalone NumPy arrays (copies of the frame data).
filenames = df["image_id"].copy().values
label_names = df["label"].copy().values

# Fit the encoder on the label names and encode them in one step.
labels = le.fit_transform(label_names)

In [5]:
start_time = time()
fold = 1  # 1-based number of the split to extract

splitter = StratifiedKFold(n_splits=cfg.n_splits, random_state=cfg.seed, shuffle=True)

# Stop on the split whose number equals `fold`. The comparison uses the number
# of the split that was just unpacked: a `for` loop binds its targets before
# the body runs, so comparing against a counter that is incremented in the
# body would leave the *next* split's indices in `tr_idx` / `va_idx`.
for fold_number, (tr_idx, va_idx) in enumerate(splitter.split(filenames, labels), start=1):
    if fold_number == fold:
        break

tr_filenames, va_filenames = filenames[tr_idx], filenames[va_idx]
tr_label_names, va_label_names = label_names[tr_idx], label_names[va_idx]
tr_labels, va_labels = labels[tr_idx], labels[va_idx]

tr_images = get_images(TRAIN_IMAGE_BASE_PATH, tr_label_names, tr_filenames, SIZE)
va_images = get_images(TRAIN_IMAGE_BASE_PATH, va_label_names, va_filenames, SIZE)

# File names encode the image size and fold number, e.g. "tr-images-512-f1.npy"
# for SIZE = 512 and fold = 1.
np.save(f"tr-images-{SIZE}-f{fold}.npy", tr_images)
np.save(f"va-images-{SIZE}-f{fold}.npy", va_images)
np.save(f"tr-labels-{SIZE}-f{fold}.npy", tr_labels)
np.save(f"va-labels-{SIZE}-f{fold}.npy", va_labels)

breaker()
print(f"Time Taken : {time()-start_time:.2f} seconds")
breaker()


**************************************************

Time Taken : 187.34 seconds

**************************************************



### **Statistics**

In [6]:
breaker()

start_time = time()

# get_statistics returns per-channel SUMS of the per-image mean / std, so the
# values are averaged over the images here and scaled from [0, 255] to [0, 1].
means, stds = get_statistics(TRAIN_IMAGE_BASE_PATH, label_names, filenames, SIZE)
channels = ["Red  ", "Green", "Blue "]
n_images = len(filenames)

for i, (channel, mean_sum, std_sum) in enumerate(zip(channels, means, stds)):
    print(f"{channel} Channel Mean : {mean_sum / n_images / 255:.5f}")
    print(f"{channel} Channel Std  : {std_sum / n_images / 255:.5f}")
    # Blank line between channels, but not after the last one.
    if i < 2:
        print()

breaker()
print(f"Time Taken : {time()-start_time:.2f} seconds")
breaker()


**************************************************

Red   Channel Mean : 0.49650
Red   Channel Std  : 0.22750

Green Channel Mean : 0.58774
Green Channel Std  : 0.22771

Blue  Channel Mean : 0.23004
Blue  Channel Std  : 0.18430

**************************************************

Time Taken : 173.49 seconds

**************************************************

