# Prepare Dataset by Keras ( - Keras VS Pytorch - )
moriitkys

# <font color="OrangeRed">データセットのバックアップを取ってから実行してください</font>
## Make a backup of your dataset before running this program

KerasやPyTorchで学習をする際のパラメータ設定ＵＩとデータセット準備ができます。   
File Directory: クラスごとにフォルダを作り、その中に該当クラスの画像を全て入れてください(<font color="OrangeRed">各クラスの画像が10枚以下の場合、エラーが出る可能性があります</font>)   
MyOwnNN/dataset/1/img0001.png, img0002.png, ...   
MyOwnNN/dataset/2/img0001.png, img0002.png, ...   
- 1や2はクラス名で、HookWrenchやSpannerWrenchという名前でもOK   
- クラスのラベルとデータパスの対応を示したtxtやcsvのようなファイルは必要なく、データの入ったフォルダから自動でクラス名を取得してcategoriesに保持し対応付けます。

In [1]:
#Settings and prepare your dataset
import glob
import os
import sys

import keras
from keras import layers, models, optimizers
from keras.utils import np_utils
import keras.backend as K
import keras.layers as KL
import tensorflow as tf
from keras.preprocessing.image import load_img, img_to_array, array_to_img
from keras.preprocessing.image import random_rotation, random_shift, random_zoom

#numpy==1.18.4 ->  numpy-1.16.4 (管理者権限でAnaconda Prompt)
#h5py==  -> 2.8.0rc1 (pip install h5py==2.8.0rc1)
import numpy as np
import random
import matplotlib.pyplot as plt

import PIL
from PIL import Image
import cv2

from pathlib import Path
import shutil

from sklearn.model_selection import train_test_split

import mylib.makedataset_rgb as mkdataset
import mylib.create_panel as create_panel
import mylib.utils as myutils

# ------ Setting panels ------
import tkinter
from tkinter import messagebox
# Input size used for the "MyNet" backbone. It can be changed, but pay
# attention to the network shape when doing so.
img_size_mynet = [28, 28]
setting_panel = create_panel.CreatePanel(img_size_mynet=img_size_mynet)
# Per the original note: execution leaves this line once "start" is pushed.
setting_panel.create_buttons()

# ------ Read the parameters chosen in the panel ------
flag_train = setting_panel.flag_train
flag_aug = setting_panel.flag_aug
flag_split = setting_panel.flag_split
ratio_train = float(setting_panel.var_sp.get())  # 0.0 ~ 1.0
total_epochs = int(setting_panel.var_sp_epochs.get())

type_backbone = setting_panel.type_backbone  # ex) ResNet, Mobilenet, MyNet
layer_name_gradcam = setting_panel.layer_name_gradcam  # marked "don't use" by the author
img_size = setting_panel.img_size  # ex) ResNet:[224,224], Mobilenet:[192,192], MyNet:[28,28]
print(type_backbone)
print("img_size=" + str(img_size))

# ------ Discover the classes from the entries of the "dataset" folder ------
categories = list(os.listdir(os.getcwd().replace("/mylib", "") + "/dataset"))
# Class name -> integer label, ex) HookWrench:0, SpannerWrench:1
categories_idx = {class_name: index for index, class_name in enumerate(categories)}
nb_classes = len(categories)  # ex) nb_classes=2

# ------ Folder names and the (initially empty) sample containers ------
dirname_dataset = "dataset"  # training dataset folder
dirname_dataset_val = dirname_dataset + "_val"  # validation dataset folder
output_folder = "outputs_keras/" + type_backbone
x_train, y_train, x_val, y_val = [], [], [], []

def aug_dataset(dirname_dataset_1, dirname_dataset_val_1):
    '''
    Augment the training and validation folders and return the "_aug" names.

    Uses MakeDataSetRGB.do_augmentation() (mylib/makedataset_rgb.py) on the two
    folders that are passed in. If either "<name>_aug" folder already exists,
    a yes/no dialog asks whether it may be erased:
      - yes: the existing "_aug" folders are removed and augmentation is run;
      - no : nothing on disk is changed.
    When neither "_aug" folder exists, augmentation is run without asking.

    Argument1: training dataset folder name (String)
    Argument2: validation dataset folder name (String)
    Returns: (dirname_dataset_1 + "_aug", dirname_dataset_val_1 + "_aug").
             The names are returned in every case, including when the user
             answers "no" in the dialog.
    NOTE(review): assumes do_augmentation() writes into "<folder>_aug" --
    confirm against mylib/makedataset_rgb.py.

    Usage:
    dirname_dataset, dirname_dataset_val = aug_dataset(dirname_dataset, dirname_dataset_val)
    '''
    dirname_dataset_aug = dirname_dataset_1 + "_aug"
    dirname_dataset_val_aug = dirname_dataset_val_1 + "_aug"
    make_dataset = mkdataset.MakeDataSetRGB()

    def _run_augmentation():
        # Augment the folders given by the caller rather than hard-coded names,
        # so the function honours its own arguments.
        make_dataset.do_augmentation(dataset_folder_name=dirname_dataset_1)
        make_dataset.do_augmentation(dataset_folder_name=dirname_dataset_val_1)

    if os.path.exists(dirname_dataset_aug) or os.path.exists(dirname_dataset_val_aug):
        # https://pythonbasics.org/tkinter-messagebox/
        tki2 = tkinter.Tk()
        tki2.withdraw()  # only the message box should be shown, not the root
        try:
            ret = messagebox.askyesno('確認', '_augフォルダがあります。_augフォルダ内を消去してよろしいですか？')
            if ret:
                for stale_dir in (dirname_dataset_aug, dirname_dataset_val_aug):
                    if os.path.exists(stale_dir):
                        shutil.rmtree(stale_dir)
                _run_augmentation()
        finally:
            # Always release the hidden root window, even if something above fails.
            tki2.destroy()
        # NOTE(review): kept from the original, which also called mainloop()
        # after destroy() -- confirm whether it is still needed.
        tki2.mainloop()
    else:
        _run_augmentation()

    return dirname_dataset_aug, dirname_dataset_val_aug

def prepare_dataset_val():
    '''
    Split off a validation set ("dataset" -> "dataset" & "dataset_val")

    For every class in `categories` whose validation folder does not exist yet,
    the folder is created and each file of the class's training folder is moved
    into it when myutils.train_or_val(ratio_train) returns "val".
    Classes whose validation folder already exists are left untouched.

    Reads the module-level names categories, dirname_dataset,
    dirname_dataset_val and ratio_train. Returns nothing.
    '''
    for j in categories:
        # os.path.join keeps the paths valid on every OS (the separator used
        # to be a hard-coded backslash).
        val_dir = os.path.join(dirname_dataset_val, str(j))
        if os.path.exists(val_dir):
            continue  # this class already has a validation folder
        os.makedirs(val_dir)
        for imgfile in glob.glob(os.path.join(dirname_dataset, str(j), "*")):
            # move some data from "dataset" to "dataset_val"
            if myutils.train_or_val(ratio_train) == "val":
                shutil.move(imgfile, val_dir)

def revert_dataset_val():
    '''
    Revert Dataset ("dataset" & "dataset_val" -> "dataset")

    Moves every file found in each existing per-class validation folder back
    into the matching class folder of the training dataset, then deletes the
    whole validation folder if it exists.

    Reads the module-level names categories, dirname_dataset and
    dirname_dataset_val. Returns nothing.
    '''
    for j in categories:
        # os.path.join keeps the paths valid on every OS. With a hard-coded
        # backslash no class folder is found on non-Windows systems, and the
        # rmtree below would then delete the images instead of restoring them.
        val_dir = os.path.join(dirname_dataset_val, str(j))
        if not os.path.exists(val_dir):
            continue
        train_dir = os.path.join(dirname_dataset, str(j))
        for imgfile in glob.glob(os.path.join(val_dir, "*")):
            # Move all images in "dataset_val" to "dataset"
            shutil.move(imgfile, train_dir)
    if os.path.exists(dirname_dataset_val):
        shutil.rmtree(dirname_dataset_val)  # Delete "dataset_val" folder

def _append_images(root_dir, images, labels):
    '''
    Append every image below root_dir/<class>/ to `images` and its label to `labels`.

    The label of a class is its index in the module-level `categories` list.
    Each image is loaded at the module-level `img_size`, converted to an array
    and divided by 255.
    '''
    target_size = (img_size[0], img_size[1])
    for label, category in enumerate(categories):
        # os.path.join keeps the pattern valid on every OS (the separator used
        # to be a hard-coded backslash).
        for imgfile in glob.glob(os.path.join(root_dir, str(category), "*")):
            img = load_img(imgfile, target_size=target_size)
            images.append(img_to_array(img) / 255)
            labels.append(label)


def prepare_dataset(dirname_dataset, dirname_dataset_val):
    '''
    Load the training and validation images into the module-level lists.

    Argument1: training dataset folder name (String)
    Argument2: validation dataset folder name (String)

    Images from dirname_dataset are appended to x_train / y_train and images
    from dirname_dataset_val to x_val / y_val. The lists are extended in
    place, not reset. Returns nothing.
    '''
    _append_images(dirname_dataset, x_train, y_train)  # Prepare Training Dataset
    _append_images(dirname_dataset_val, x_val, y_val)  # Prepare Validation Dataset
            
# ------ Build the dataset folders (train mode only), load them, save as .npy ------
if flag_train == True:
    print("train mode")
    print("total epochs = " + str(total_epochs))
    if flag_split == True:
        # Re-split from scratch: merge any earlier validation split back into
        # the training folder first, then draw a new split.
        revert_dataset_val()
        prepare_dataset_val()
        print("splitting complete")
    if flag_split == False and os.path.exists(dirname_dataset_val) == False:
        # Splitting was not requested, but there is no validation folder yet,
        # so one is created anyway.
        prepare_dataset_val()
        print("You have not splitted dataset, so splitteing automatically done")
    if flag_aug == True:
        # From here on the "_aug" folder names returned by aug_dataset are used.
        dirname_dataset, dirname_dataset_val = aug_dataset(dirname_dataset, dirname_dataset_val)
        print("dataset source is " + dirname_dataset + "&" + dirname_dataset_val)
    elif flag_aug == False:
        # Augmentation was not requested: the "_aug" folders are still used as
        # the data source, but only when BOTH of them already exist.
        dirname_dataset_aug = dirname_dataset + "_aug"
        dirname_dataset_val_aug = dirname_dataset_val + "_aug"
        # NOTE(review): make_dataset is not used in this branch -- confirm
        # whether constructing it is needed.
        make_dataset = mkdataset.MakeDataSetRGB()
        if os.path.exists(dirname_dataset_aug ) == True \
        and os.path.exists(dirname_dataset_val_aug ) == True:
            dirname_dataset = dirname_dataset_aug
            dirname_dataset_val = dirname_dataset_val_aug
    # Fills x_train / y_train / x_val / y_val from the selected folders.
    prepare_dataset(dirname_dataset, dirname_dataset_val)
    # make directory (weights_folder, outputs)
    # NOTE(review): these folders are named "*_pytorch" while output_folder
    # below is "outputs_keras/..." -- confirm this is intended.
    if os.path.exists("weights_pytorch/"+type_backbone) == False:
        os.makedirs("weights_pytorch/"+type_backbone)
    if os.path.exists("outputs_pytorch/"+type_backbone) == False:
        os.makedirs("outputs_pytorch/"+type_backbone)

# The Keras output folder is created regardless of flag_train.
if os.path.exists(output_folder) == False:
    os.makedirs(output_folder)

# In Keras, use numpy array for NN model
if os.path.exists("tmp_npy") == False:
    os.makedirs("tmp_npy")
# If train mode was not selected, the four lists are still the empty lists
# created earlier in this cell, so empty arrays are written below.
x_train, y_train, x_val, y_val = np.array(x_train), np.array(y_train), np.array(x_val), np.array(y_val)
np.save("tmp_npy/x_train.npy", x_train)
np.save("tmp_npy/y_train.npy", y_train)
# The validation arrays are stored under the "*_test.npy" file names.
np.save("tmp_npy/x_test.npy", x_val)
np.save("tmp_npy/y_test.npy", y_val)
# Rebind the names to empty lists so this cell no longer references the arrays.
x_train, y_train, x_val, y_val = [],[],[],[]
print("Complete")

Using TensorFlow backend.


Mobilenet
img_size=[192, 192]
train mode
total epochs = 55
splitting complete
Now executing augmentation :dataset/HookWrench
Now executing augmentation :dataset/SpannerWrench
Now executing augmentation :dataset_val/HookWrench
Now executing augmentation :dataset_val/SpannerWrench
dataset source is dataset_aug&dataset_val_aug
Complete


以下はdataset_valのデータをdatasetに統合するプログラムなので、必要な時以外に実行しないでください。   
Do not run the next cell unless you want to merge dataset_val back into dataset.

In [3]:
# Revert Dataset (dataset & dataset_val -> dataset)
import os
import glob
import shutil
# Class names are the entry names found directly under "<current dir>/dataset".
# (An earlier variant stripped "/mylib" from the current directory first.)
categories = list(os.listdir(os.getcwd() + "/dataset"))
print(categories)
dirname_dataset = "dataset"  # dataset folder
dirname_dataset_val = dirname_dataset + "_val"  # validation dataset folder

def revert_dataset_val():
    '''
    Revert Dataset ("dataset" & "dataset_val" -> "dataset")

    Moves every file found in each existing per-class validation folder back
    into the matching class folder of the dataset folder, then deletes the
    whole validation folder if it exists.

    Reads the module-level names categories, dirname_dataset and
    dirname_dataset_val. Returns nothing.
    '''
    for j in categories:
        # os.path.join keeps the paths valid on every OS. With a hard-coded
        # backslash no class folder is found on non-Windows systems, and the
        # rmtree below would then delete the images instead of restoring them.
        val_dir = os.path.join(dirname_dataset_val, str(j))
        if not os.path.exists(val_dir):
            continue
        train_dir = os.path.join(dirname_dataset, str(j))
        for imgfile in glob.glob(os.path.join(val_dir, "*")):
            shutil.move(imgfile, train_dir)
    if os.path.exists(dirname_dataset_val):
        shutil.rmtree(dirname_dataset_val)
# Runs the merge immediately when this cell is executed: files are moved out
# of the validation folder and that folder is then deleted.
revert_dataset_val()

['HookWrench', 'SpannerWrench']


### Environment 実行環境
- Windows10
- CPU:Core i7-7700HQ
- Memory: 16GB
- Graphic board: GTX1060 6GB
- Storage: NVMe M.2 SSD 1TB
- CUDA 9.0.176   
- cuDNN 7.0.5   


- Keras==2.1.5
- tensorflow-gpu==1.11.0
- torch==1.1.0
- scikit-learn==0.19.1
- scipy==1.4.1