In [4]:
import os
import shutil
import itertools

import numpy as np
import pandas as pd

from collections import Counter

### Load df

In [5]:
ds = pd.read_csv("../data/Data_Entry_2017_v2020.csv")
ds.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168


### Cleaning the labels

There are 100+ different labels because we have multiclasses. To fix this I have gotten the first 11 most common ones (frequency > 1000). Effusion was skipped and replaced with pneumonia, since the current files I have downloaded don't have effusion cases.

In [6]:
def find_frequent_labels(df, label, MIN_Cases, return_all = False):
    # Function that gives the most frequent labels. Can return only
    # the most frequent, or every occurance by setting return_all = True

    # Calculate the frequency of each label
    counts = Counter(df[label])
    
    # Filter labels with minimum occurrences
    frequent_labels = [(label, count) for label, count in counts.items() if count > MIN_Cases]
    
    # If true return every element
    if return_all:
        return frequent_labels, counts
    # Else just return the MF
    else:
        return frequent_labels

In [7]:
MIN_CASES = 1000

# Call the function and show variable
c_labels = find_frequent_labels(df = ds, label = "Finding Labels", MIN_Cases=1000)
c_labels

[('Cardiomegaly', 1093),
 ('No Finding', 60361),
 ('Infiltration', 9547),
 ('Effusion|Infiltration', 1603),
 ('Nodule', 2705),
 ('Effusion', 3955),
 ('Atelectasis', 4215),
 ('Mass', 2139),
 ('Pneumothorax', 2194),
 ('Pleural_Thickening', 1126),
 ('Consolidation', 1310),
 ('Atelectasis|Infiltration', 1350),
 ('Atelectasis|Effusion', 1165)]

Here I create directories which later will be used by `tf.keras.utils.image_dataset_from_directory()`. This way it will be easier to get the class labels afterwards

In [18]:
# Get the first 11 most common class names
name_dirs = [name[0] for name in c_labels][:11]

# Create the dirs
# ---------------
# For every dir (class name) in name_dirs:
for dir in name_dirs:
    # Try creating a dir and if dir is already created, pass:
    try:
        # Rename problematic class
        if dir == 'Effusion|Infiltration':
            dir = 'Effusion_Infiltration'

            # Create a dir with the calss' name
            os.mkdir(path = "../data/crx8/" + dir + "/")
        else:
            os.mkdir(path = "../data/crx8/" + dir + "/")
    except:
        pass

Here I take every set of a specific class and move it to the matching dir:

In [28]:
# For every label in name_dirs (list with 11 most frequent labels):
for label in name_dirs:
    # Get a set with those labels
    label_ds = ds[ds["Finding Labels"].str.contains(label)]
    # Get the indexes
    label_indexes = label_ds["Image Index"]

    # Rename label for this case, since label will be used as defining a dir
    if label == 'Effusion|Infiltration':
            label = 'Effusion_Infiltration'

    # Source dir, where every picture is
    SOURCE_DIR = "../data/crx8/other/"

    # Output dir, where we will put the data
    OUTPUT_DIR = "../data/crx8/" + label + "/"
    
    # Get all image files in the source directory
    image_files = os.listdir(SOURCE_DIR)
    
    # Access every img from image_files
    for img in image_files:

        # If an image from the image folder is in the current set, move to appropriate location
        if img in list(label_indexes):
            shutil.move(os.path.join(SOURCE_DIR, img), os.path.join(OUTPUT_DIR, img))
           

### First attempt at cleaning and transfering files - pneumonia cases

In [43]:
pneumonia_ds = ds[ds["Finding Labels"].str.contains("Pneumonia")]
pneumonia_indexes = pneumonia_ds["Image Index"]
pneumonia_ds

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
73,00000013_010.png,Effusion|Pneumonia|Pneumothorax,35,13,60,M,AP,3056,2544,0.139,0.139
126,00000032_012.png,Atelectasis|Consolidation|Edema|Pneumonia,12,32,54,F,AP,2500,2048,0.168,0.168
253,00000056_000.png,Nodule|Pneumonia,0,56,76,M,PA,2500,2048,0.168,0.168
276,00000061_012.png,Edema|Effusion|Infiltration|Pleural_Thickening...,12,61,77,M,AP,3056,2544,0.139,0.139
279,00000061_015.png,Pneumonia,15,61,77,M,AP,3056,2544,0.139,0.139
...,...,...,...,...,...,...,...,...,...,...,...
111557,00030536_007.png,Atelectasis|Consolidation|Pneumonia,7,30536,56,F,AP,3056,2544,0.139,0.139
111627,00030570_001.png,Edema|Infiltration|Pneumonia,1,30570,29,F,AP,3056,2544,0.139,0.139
111770,00030621_002.png,Pneumonia,5,30621,22,F,AP,3056,2544,0.139,0.139
111845,00030637_016.png,Consolidation|Pneumonia,16,30637,48,M,AP,3056,2544,0.139,0.139


In [4]:
# SOURCE_DIR = "../data/crx8/NORMAL/"
# TRAIN_DIR = "../data/train/NORMAL"
# TEST_DIR = "../data/test/NORMAL"


# # Get all image files in the source directory
# image_files = os.listdir(SOURCE_DIR)

# # Shuffle the image files
# np.random.shuffle(image_files)

# # Divide the image files into train, val, and test sets
# train_images = image_files[:int(0.8 * len(image_files))]
# test_images = image_files[int(0.9 * len(image_files)):]

# # Copy train images to the train directory
# for image_file in train_images:
#     shutil.copyfile(os.path.join(SOURCE_DIR, image_file), os.path.join(TRAIN_DIR, image_file))

# # Copy test images to the test directory
# for image_file in test_images:
#     shutil.copyfile(os.path.join(SOURCE_DIR, image_file), os.path.join(TEST_DIR, image_file))


In [41]:
SOURCE_DIR = "../data/crx8/other/"

PNEUMONIA_DIR = "../data/crx8/PNEUMONIA/" 

# Get all image files in the source directory
image_files = os.listdir(SOURCE_DIR)
pneu_list = []
# Access every img in /PNEUMONIA
for img in image_files:
    # If img is not in the list of pneumonia cases move it to /other

    # if img not in pneumonia_cases:
    #     shutil.move(os.path.join(SOURCE_DIR, img), os.path.join(OTHER_DIR, img))

    if img in list(pneumonia_indexes):
        shutil.move(os.path.join(SOURCE_DIR, img), os.path.join(PNEUMONIA_DIR, img))

In [42]:
len(os.listdir("../data/crx8/PNEUMONIA/")), len(os.listdir("../data/crx8/other/"))

(288, 10114)

In [40]:
len(os.listdir("../data/crx8/PNEUMONIA/")) + len(os.listdir("../data/crx8/other/"))

10402