In [None]:
from glob import glob
import random
import math
import os
import cv2
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plot
import gc
from tqdm import tqdm
from random import sample 
import multiprocess as mp
from functools import partial
from sklearn.utils import shuffle
from datetime import datetime
import natsort

In [None]:
# Share (in percent) of the requested sample that selecting_images_preprocessing
# draws from each region of its ranking (images sorted by std ascending, then
# mean descending): "top" is the head of that ranking, "mid" its centre and
# "bottom" its tail.
PERCENTAGE_COMPOSITION_DATASET = {"top": 50, "mid": 50, "bottom": 0}
def get_number_by_percentage(percentage, whole):
    """Return ``percentage`` percent of ``whole``, rounded up to an integer.

    Both arguments are converted with ``float()`` first, so numeric strings
    are accepted as well.

    The product is formed *before* dividing by 100 so that results which are
    exact integers stay exact. Dividing first (``percentage / 100 * whole``)
    turns e.g. 7 % of 100 into 7.000000000000001, which ``math.ceil`` then
    rounds up to 8.
    """
    return math.ceil(float(percentage) * float(whole) / 100)

In [None]:
"""
input: array [[path_of_file <string>, label <int>]]
output: array of path [path_of_file <string>] & array of label [label <int>]
"""
def selecting_images_preprocessing(images_path_array, limit_image_to_train = "MAX", composition=None):
    """Select image paths and labels according to per-image pixel statistics.

    Every image is loaded with ``cv2.imread`` in a pool of 5 worker processes
    and summarised by the mean and standard deviation of all its pixel values.
    The records are sorted by ``std`` ascending, then ``mean`` descending, and
    rows are picked from that ranking.

    Parameters
    ----------
    images_path_array : sequence of (path, label) pairs
        One entry per image; index 0 is the file path, index 1 the label.
    limit_image_to_train : int or "MAX"
        Number of images the percentages refer to. ``"MAX"`` means
        ``len(images_path_array)``.
    composition : dict, optional
        Percentages stored under the keys ``"top"`` (head of the ranking),
        ``"mid"`` (window centred on the ranking) and ``"bottom"`` (tail of
        the ranking). Missing keys count as 0. When empty or ``None`` the
        ranking is shuffled and its first ``limit_image_to_train`` rows are
        returned instead.

    Returns
    -------
    tuple(list, list)
        The selected file paths and their labels, aligned by position.
        Images that OpenCV cannot read are skipped.
    """
    def processing_image(img_data):
        # Runs inside a worker process: one (path, label) pair in, one record out.
        img_path = img_data[0]
        label = img_data[1]
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # instead of raising; let the parent process filter it out.
            return None
        return {
            "image_path": img_path,
            "mean": image.mean(),
            "std": image.std(),
            "class": label
        }

    print("processed number of data: ", len(images_path_array))
    if len(images_path_array) == 0:
        return [], []
    if limit_image_to_train == "MAX":
        limit_image_to_train = len(images_path_array)

    # Compute the statistics in parallel. The context manager shuts the
    # workers down once the (blocking) map call has returned.
    with mp.Pool(5) as pool:
        data_rows = pool.map(processing_image, images_path_array)

    readable_rows = [row for row in data_rows if row is not None]
    skipped = len(data_rows) - len(readable_rows)
    if skipped:
        print("skipped unreadable images: ", skipped)
    if not readable_rows:
        return [], []

    # Build the frame in one go (DataFrame.append no longer exists in pandas 2).
    df_analysis = pd.DataFrame(readable_rows, columns=['image_path', 'mean', 'std', 'class'])
    final_df = df_analysis.sort_values(['std', 'mean'], ascending = [True, False])

    final_image_path = []
    final_label = []

    if not composition:
        final_df = shuffle(final_df)
        final_image_path = final_df['image_path'].head(limit_image_to_train).tolist()
        final_label = final_df['class'].head(limit_image_to_train).tolist()
    else:
        total_rows = len(final_df.index)
        # Rows that may still be handed out; keeps the sum of all parts from
        # exceeding limit_image_to_train when the percentages round up.
        counter_available_no_data = limit_image_to_train

        # NOTE(review): the three windows are chosen independently, so "top"
        # and "mid" (or "mid" and "bottom") can overlap and return duplicate
        # rows when the quotas are large relative to the number of images.
        for part in ("top", "mid", "bottom"):
            percentage = composition.get(part, 0)
            if not percentage:
                continue

            num_rows = get_number_by_percentage(percentage, limit_image_to_train)
            if counter_available_no_data <= num_rows:
                num_rows = counter_available_no_data
            counter_available_no_data = counter_available_no_data - num_rows

            print(percentage, num_rows, counter_available_no_data)

            if part == "top":
                chosen = final_df.head(num_rows)
            elif part == "mid":
                # Window of exactly num_rows rows centred on the ranking.
                # Clamping at 0 prevents a negative start from wrapping
                # around to the tail of the frame.
                start = max(0, (total_rows - num_rows) // 2)
                chosen = final_df.iloc[start:start + num_rows]
            else:
                chosen = final_df.tail(num_rows)

            final_image_path = final_image_path + chosen['image_path'].tolist()
            final_label = final_label + chosen['class'].tolist()

    # Release the frames before returning.
    del final_df, df_analysis
    gc.collect()

    return final_image_path, final_label

In [None]:
def read_data_with_labels(filepath, class_names, training=True, limit=100):
    """Collect image paths and integer labels for every class directory.

    For each name in ``class_names`` the directory ``filepath/<name>`` is
    listed in natural sort order. Only the first chunk of that listing is
    used: the listing is split into ``len // 1000`` near-equal parts (at
    least one) and the first part is kept. ``.DS_Store`` entries are ignored.

    Parameters
    ----------
    filepath : str
        Directory that contains one sub-directory per class.
    class_names : sequence of str
        Sub-directory names; the position of a name is its integer label.
    training : bool
        ``True``: choose images with ``selecting_images_preprocessing`` using
        ``PERCENTAGE_COMPOSITION_DATASET``. ``False``: choose images at random.
    limit : int or "MAX"
        Maximum number of images per class; ``"MAX"`` takes every image of the
        chunk. Values larger than the number of available images are capped.

    Returns
    -------
    tuple(list, list)
        File paths and their labels, aligned by position.
    """
    # Target size of the slice taken from each class directory listing.
    CHUNK_SIZE = 1000

    image_list = []
    label_list = []
    for class_num, class_n in enumerate(class_names):
        path = os.path.join(filepath, class_n)  # directory of this class
        path_list = []
        class_list = []

        list_path = natsort.natsorted(os.listdir(path))
        print("total number of dataset", len(list_path))

        # np.array_split needs an integer section count of at least 1; a
        # float below 1 (fewer than CHUNK_SIZE files) truncates to 0 and raises.
        n_sections = max(1, len(list_path) // CHUNK_SIZE)
        list_path = np.array_split(list_path, n_sections)[0].tolist()
        print("data taken from dataset", len(list_path))

        for img in tqdm(list_path):
            if ".DS_Store" != img:
                path_list.append(os.path.join(path, img))
                class_list.append(class_num)

        if not path_list:
            # Nothing usable in this class directory.
            continue

        # Never ask for more samples than exist: sklearn's shuffle raises
        # when n_samples exceeds the number of available items.
        available = len(path_list)
        n_samples = available if limit == "MAX" else min(limit, available)

        if training:
            # Select by image statistics. Pairs are kept as tuples so the
            # labels stay integers (a NumPy array would turn them into strings).
            combined = list(zip(path_list, class_list))
            path_list, class_list = selecting_images_preprocessing(combined, limit_image_to_train=n_samples, composition=PERCENTAGE_COMPOSITION_DATASET)
        else:
            # Random selection with a freshly drawn seed.
            path_list, class_list = shuffle(path_list, class_list, n_samples=n_samples ,random_state=random.randint(123, 10000))

        image_list.extend(path_list)
        label_list.extend(class_list)

    return image_list, label_list

In [None]:
# Build the training selection for the single "normal" class.
class_names = ["normal"]
data_path = "data/mura_april/train_data"
filenames, labels = read_data_with_labels(data_path, class_names, training=True, limit=1000)

# Every selected path must come with exactly one label.
assert len(filenames) == len(labels), "paths and labels are misaligned"

# Show a short preview rather than dumping up to 1000 paths into the output.
print(filenames[:5], labels[:5])
print(len(filenames))