### Import and load data entry

In [1]:
import pandas as pd
import numpy as np
from os.path import join, exists
import os
from tqdm import tqdm
import json

In [2]:
# Per-image metadata table; the file uses ';' as its field delimiter.
data_entry_csv = "Data_Entry_2017.csv"
data_entry = pd.read_csv(data_entry_csv, sep=";")

In [3]:
# Preview the first five rows to check that the columns parsed as expected.
data_entry.head(n=5)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


### Process folder
Count how many images there are in each folder of the dataset.

In [6]:
# Root of the extracted dataset archive (local Windows path).
root_path = "G:\\Code\\Dataset\\archive"

# The archive is split into twelve sub-folders: images_001 ... images_012.
img_folder = [f"images_{n:03d}" for n in range(1, 13)]
# Maps each sub-folder name to its file count; starts out empty.
folder_size = {}

# Sanity check that the dataset root is reachable from this machine.
print(exists(root_path))

True


In [None]:
# Record how many entries each folder's "images" sub-directory contains.
folder_size.update(
    (folder, len(os.listdir(join(root_path, folder, "images"))))
    for folder in img_folder
)

print(folder_size)

### Get labels
Collect all labels, create a class-to-index mapping, and save it to a JSON file.

In [24]:
# Collect every distinct finding label, in order of first appearance.
# Values are placeholders here; the class indices are assigned afterwards.
label_map = {}

# Iterate over *all* rows: the column values never include the CSV header, so
# slicing off the first element would silently ignore the first image's labels.
for labels in data_entry["Finding Labels"]:
    # A row can carry several findings separated by '|'.
    for label in labels.split("|"):
        label_map[label] = None

In [None]:
# Give each label a consecutive integer index, following the dict's insertion order.
for class_index, label in enumerate(label_map):
    label_map[label] = class_index

# Persist the mapping so later steps can reload it from disk.
with open("label_map.json", 'w') as map_file:
    json.dump(label_map, map_file)

label_map

### Re-label image 
Assign each class an index, then label each image with a multi-hot vector (a one-hot-style vector in which every finding present in the image is set to 1).

In [None]:
# Keep only the two columns needed for labelling: file name and findings.
label_columns = ["Image Index", "Finding Labels"]
img_labels_df = data_entry.loc[:, label_columns]
img_labels_df.head()

In [None]:
# `data_entry` is deliberately kept alive: the age/gender/view-position section
# and the class-balancing section further down still read from it, so deleting
# it here makes those cells fail with NameError on a top-to-bottom run.
# Only unreferenced garbage is reclaimed.
import gc

gc.collect()

In [None]:
# Number of labelled images (rows) and of selected columns (cols).
rows = img_labels_df.shape[0]
cols = img_labels_df.shape[1]

In [None]:
# Save one multi-hot label vector per training image: every row except the
# last `folder_size["images_012"]` rows of the table.
train_label_path = join(root_path, "train\\class_label")
# One slot per known class instead of a hard-coded length.
n_classes = len(label_map)
for i in tqdm(range(rows - folder_size["images_012"])):
    # Resolve this row's file name *before* the existence check. Checking the
    # previous iteration's name raises NameError on the first pass and, with
    # `break`, stops the loop right after the first file has been written.
    img_name = img_labels_df.iloc[i, 0]  # row, col
    # np.save appends ".npy" to the name, so that is what exists on disk.
    if exists(join(train_label_path, img_name + ".npy")):
        continue  # already written by an earlier run; move on to the next row

    one_label = np.zeros(n_classes)
    for class_ in img_labels_df.iloc[i, 1].split('|'):
        one_label[label_map[class_]] = 1

    np.save(join(train_label_path, img_name), one_label)

In [None]:
# Save one multi-hot label vector per test image: the last
# `folder_size["images_012"]` rows of the table.
test_label_path = join(root_path, "test\\class_label")
# One slot per known class instead of a hard-coded length.
n_classes = len(label_map)
for i in tqdm(range(rows - folder_size["images_012"], rows)):
    # Resolve this row's file name *before* the existence check. Checking a
    # stale name from a previous iteration (or a previous cell) makes the loop
    # stop right after the first file has been written.
    img_name = img_labels_df.iloc[i, 0]  # row, col
    # np.save appends ".npy" to the name, so that is what exists on disk.
    if exists(join(test_label_path, img_name + ".npy")):
        continue  # already written by an earlier run; move on to the next row

    one_label = np.zeros(n_classes)
    for class_ in img_labels_df.iloc[i, 1].split('|'):
        one_label[label_map[class_]] = 1

    np.save(join(test_label_path, img_name), one_label)

### Process age, gender, view position
Normalize the patient age and encode the categorical variables (gender and view position) as numbers.

In [None]:
# Age / gender / view-position columns. Take an explicit copy because the
# following cells overwrite these columns in place; writing into a bare slice
# of `data_entry` triggers pandas' chained-assignment warning and leaves it
# ambiguous which frame is modified.
agv = data_entry[["Patient Age", "Patient Gender", "View Position"]].copy()
agv.head()

In [17]:
# Z-score normalise the age column: (age - mean) / population standard deviation.
age_values = agv["Patient Age"].to_numpy()
mean_ = np.mean(age_values)
std_ = np.std(age_values)
# Vectorised arithmetic keeps the frame's own index. Assigning a freshly built
# pd.Series would align on a new 0..n-1 index and yield NaN for any row whose
# label is not in that range.
agv["Patient Age"] = (agv["Patient Age"] - mean_) / std_

In [5]:
# Replace the string categories with integer codes, one value at a time.
category_codes = {
    "Patient Gender": {"M": 0, "F": 1},
    "View Position": {"PA": 0, "AP": 1},
}

for column, codes in category_codes.items():
    for raw_value, code in codes.items():
        agv.loc[agv[column] == raw_value, column] = code

In [None]:
# Inspect the first 20 processed rows.
agv.head(n=20)

In [None]:
 # Add age, gender and view position to the 384 features extracted from dinov2.
FEATURE_DIM = 387  # 384 image features + the 3 metadata columns of `agv`

train_data_path = "G:\\Code\\Dataset\\archive\\train\\img_feature"
test_data_path = "G:\\Code\\Dataset\\archive\\test\\img_feature"


def _append_metadata(folder, feature_dir, start_row):
    """Append the `agv` row of each image in `folder` to its saved feature file.

    Images are matched to `agv` rows by position, starting at `start_row`.
    Returns the position of the first row that was not consumed.
    """
    row_idx = start_row
    # os.listdir gives no ordering guarantee, so sort explicitly.
    # NOTE(review): this assumes the rows of `agv` are in file-name order, the
    # same assumption the positional counter has always relied on; confirm.
    for img_name in tqdm(sorted(os.listdir(join(root_path, folder, "images")))):
        feature_file = join(feature_dir, img_name + ".npy")
        features = np.load(feature_file)
        # A file that already has FEATURE_DIM values was augmented by an earlier
        # run; appending again would make the reshape below fail.
        if features.size != FEATURE_DIM:
            # Force a numeric dtype: the row mixes float and object columns, so
            # without it the appended array can end up object-dtype (pickled).
            row = agv.iloc[row_idx, :].to_numpy(dtype=np.float32)
            np.save(feature_file, np.append(features, row).reshape((1, FEATURE_DIM)))
        row_idx += 1
    return row_idx


count = 0

# All folders except the last feed the train split; the last one is the test split.
for folder in img_folder[:-1]:
    count = _append_metadata(folder, train_data_path, count)

count = _append_metadata(img_folder[-1], test_data_path, count)

print(count)

### Remove files 
Some classes have so many files that they make the dataset unbalanced.
We remove part of their samples so that the class sizes become roughly comparable.

In [38]:
# Start from the saved class names and give every class an empty index list.
with open("label_map.json", 'r') as label_file:
    class_sample_count = {class_: [] for class_ in json.load(label_file)}

In [39]:
# Only the first 104999 rows are scanned.
scanned_findings = data_entry["Finding Labels"][:104999]

# For every class, remember the positions of the rows that mention it.
for idx, labels in enumerate(tqdm(scanned_findings)):
    for label in labels.split('|'):
        class_sample_count[label].append(idx)

100%|██████████| 104999/104999 [00:00<00:00, 1904142.64it/s]


In [40]:
# Report how many of the scanned rows mention each class.
for class_, sample_indices in class_sample_count.items():
    print(f"{class_}: {len(sample_indices)}")

# Classes singled out for reduction: Effusion, Infiltration, No Finding, Atelectasis

Cardiomegaly: 2555
Emphysema: 2272
Effusion: 12340
No Finding: 56720
Hernia: 210
Infiltration: 18538
Mass: 5424
Nodule: 5932
Atelectasis: 10811
Pneumothorax: 4776
Pleural_Thickening: 3163
Pneumonia: 1359
Fibrosis: 1647
Edema: 2209
Consolidation: 4365


In [31]:
import random
random.seed(2710)

# Classes that get thinned out; each starts with an empty removal list.
remove_files = {
    class_: []
    for class_ in ("No Finding", "Effusion", "Infiltration", "Atelectasis")
}

for class_ in remove_files:
    candidates = class_sample_count[class_]
    # Draw how many samples of this class to keep (4500-5000), then sample
    # everything beyond that for removal. The draw happens before the sampling.
    keep_count = random.randint(4500, 5000)
    remove_files[class_] = random.sample(candidates, len(candidates) - keep_count)

In [32]:
# Show how many row indices were selected for removal per class.
for class_, removed_indices in remove_files.items():
    print(f"{class_} : {len(removed_indices)}")

No Finding : 51961
Effusion : 7422
Infiltration : 13910
Atelectasis : 5893


In [34]:
# Union of all per-class removal lists; a row picked for several classes counts once.
idx_to_rm = set().union(*remove_files.values())

len(idx_to_rm)

75556

In [37]:
# Start every class at zero removed samples.
with open("label_map.json", 'r') as label_file:
    class_rm_count = dict.fromkeys(json.load(label_file), 0)

# A removed row lowers the count of *every* class listed on it, not only the
# class it was sampled for.
for idx in idx_to_rm:
    for label in data_entry.iloc[idx, 1].split('|'):
        class_rm_count[label] += 1

class_rm_count

{'Cardiomegaly': 950,
 'Emphysema': 612,
 'Effusion': 9000,
 'No Finding': 51961,
 'Hernia': 46,
 'Infiltration': 14789,
 'Mass': 1653,
 'Nodule': 1715,
 'Atelectasis': 7515,
 'Pneumothorax': 1347,
 'Pleural_Thickening': 1030,
 'Pneumonia': 624,
 'Fibrosis': 434,
 'Edema': 974,
 'Consolidation': 1791}

In [41]:
# Samples left per class once the selected rows are removed.
for class_, sample_indices in class_sample_count.items():
    remaining = len(sample_indices) - class_rm_count[class_]
    print(f"{class_}: {remaining}")

Cardiomegaly: 1605
Emphysema: 1660
Effusion: 3340
No Finding: 4759
Hernia: 164
Infiltration: 3749
Mass: 3771
Nodule: 4217
Atelectasis: 3296
Pneumothorax: 3429
Pleural_Thickening: 2133
Pneumonia: 735
Fibrosis: 1213
Edema: 1235
Consolidation: 2574


In [45]:
import shutil

old_train_path = "G:/Code/Dataset/archive/train/class_label"
new_train_path = "G:/Code/Dataset/archive/train_reduced/class_label"

# shutil.copyfile does not create missing parent directories, so make sure the
# destination exists before copying.
os.makedirs(new_train_path, exist_ok=True)

# Copy the label file of every scanned row that was not selected for removal.
for idx, file_name in enumerate(tqdm(data_entry["Image Index"][:104999])):
    if idx in idx_to_rm:
        continue

    # Label files are stored under "<image name>.npy".
    file_name += ".npy"
    src = join(old_train_path, file_name)
    dst = join(new_train_path, file_name)

    shutil.copyfile(src, dst)

100%|██████████| 104999/104999 [01:16<00:00, 1370.85it/s]


In [43]:
old_train_path = "G:/Code/Dataset/archive/train/class_label"
new_train_path = "G:/Code/Dataset/archive/train_reduced/class_label"
# Number of entries now present in the reduced label folder.
reduced_entries = os.listdir(new_train_path)
print(len(reduced_entries))

29443


### Evaluate

In [20]:
batch_n = 3281
# The arrays are reshaped to (32 * batch_n) rows with 15 values each.
eval_shape = (32 * batch_n, 15)

omg_pred = np.load(r"model_eval/omg_pred.npy").reshape(eval_shape)

omg_truth = np.load(r'model_eval/omg_truth.npy').reshape(eval_shape)

In [38]:
# A score strictly above the threshold counts as a positive prediction.
threshold = 0.002
predicted_labels = omg_pred > threshold

In [39]:
# Convert the boolean mask into 0.0 / 1.0 values.
predicted_labels = predicted_labels.astype(np.float64)

In [40]:
# Print the first ten prediction rows for a quick visual check.
for row_idx in range(10):
    print(str(predicted_labels[row_idx]))

[0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1.]
[0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1.]
[0. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]


In [41]:
from eval import IoU_accuracy, Hamming_Loss, Recall, Precision, F1Measure

# Print each metric for the thresholded predictions, one value per line.
for metric in (IoU_accuracy, Hamming_Loss, Recall, Precision, F1Measure):
    print(metric(omg_truth, predicted_labels))

0.13595337385533537
0.21275462257441838
0.34604389359370497
0.15460501970834434
0.19191144863930346


In [None]:
from sklearn.metrics import classification_report

# Class names in label_map order are used as the report's row names.
class_names = list(label_map)
report = classification_report(
    omg_truth,
    predicted_labels,
    output_dict=False,
    target_names=class_names,
)
print(report)