In [1]:
!sudo apt install tree



Reading package lists... Done
Building dependency tree       
Reading state information... Done
tree is already the newest version (1.7.0-5).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.


In [2]:
!tree --dirsfirst --filelimit 5 "./cell_images"

./cell_images
├── Parasitized [13780 entries exceeds filelimit, not opening dir]
└── Uninfected [13780 entries exceeds filelimit, not opening dir]

2 directories, 0 files


In [3]:
import os
import glob

base_dir = os.path.join('./cell_images')
infected_dir = os.path.join(base_dir, 'Parasitized')
healthy_dir = os.path.join(base_dir, 'Uninfected')

infected_files = glob.glob(infected_dir+'/*.png')
healthy_files = glob.glob(healthy_dir+'/*.png')
len(infected_files), len(healthy_files)

(13779, 13779)

In [4]:
import numpy as np
import pandas as pd

np.random.seed(42)

files_df = pd.DataFrame({
    'filename': infected_files + healthy_files,
    'label': ['malaria']*len(infected_files) + ['healthy']*len(healthy_files)
}).sample(frac=1, random_state=42).reset_index(drop=True)

In [5]:
files_df.head()

Unnamed: 0,filename,label
0,./cell_images/Parasitized/C124P85ThinF_IMG_201...,malaria
1,./cell_images/Parasitized/C59P20thinF_IMG_2015...,malaria
2,./cell_images/Uninfected/C167P128ReThinF_IMG_2...,healthy
3,./cell_images/Uninfected/C68P29N_ThinF_IMG_201...,healthy
4,./cell_images/Uninfected/C169P130ThinF_IMG_201...,healthy


In [6]:
from sklearn.model_selection import train_test_split
from collections import Counter

train_files, test_files, train_labels, test_labels = train_test_split(files_df['filename'].values,
                                                                      files_df['label'].values,
                                                                      test_size=0.3, random_state=42)
train_files, val_files, train_labels, val_labels = train_test_split(train_files,
                                                                    train_labels,
                                                                    test_size=0.1, random_state=42)
print(train_files.shape, val_files.shape, test_files.shape)
print('Train:', Counter(train_labels), '\nValidate:', Counter(val_labels), '\nTest:', Counter(val_labels))

(17361,) (1929,) (8268,)
Train: Counter({'healthy': 8734, 'malaria': 8627}) 
Validate: Counter({'healthy': 970, 'malaria': 959}) 
Test: Counter({'healthy': 970, 'malaria': 959})


In [17]:
import cv2
from concurrent import futures
import threading
import time

def get_image_shape_parallel(idx, img, total_images):
    if idx % 5000 == 0 or idx == (total_images - 1):
        print('{}: working on image number: {}'.format(threading.current_thread().name,
                                                       idx))
        return cv2.imread(img).shape
    
ex = futures.ThreadPoolExecutor(max_workers=None)

data_input = [(idx, img, len(train_files)) for idx, img in enumerate(train_files)]
print('Starting Img shapecomputation:')
train_img_dims_map = ex.map(get_image_shape_parallel,
                           (record[0] for record in data_input),
                           (record[1] for record in data_input),
                           (record[2] for record in data_input))

train_img_dims = list(train_img_dims_map)

print('Min Dimensions:', np.min(train_img_dims, axis=0)) 
print('Avg Dimensions:', np.mean(train_img_dims, axis=0))
print('Median Dimensions:', np.median(train_img_dims, axis=0))
print('Max Dimensions:', np.max(train_img_dims, axis=0))


Starting Img shapecomputation:
Thread-205: working on image number: 0
Thread-222: working on image number: 5000
Thread-212: working on image number: 10000
Thread-217: working on image number: 15000
Thread-211: working on image number: 17360


TypeError: unorderable types: tuple() <= NoneType()