### Import and load data entry

In [21]:
import pandas as pd
import numpy as np
from os.path import join, exists
import os
from tqdm import tqdm
import json

In [22]:
# Load the image metadata table; this copy of the CSV is semicolon-delimited.
csv_path = "Data_Entry_2017.csv"
data_entry = pd.read_csv(csv_path, sep=";")

In [23]:
# Preview the first rows to check that the CSV parsed into the expected columns.
data_entry.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


### Process folder
Count how many images there are in each folder of the dataset.

In [12]:
# Root directory of the extracted dataset archive.
root_path = "G:\\Code\\Dataset\\archive"

# Image folder names: images_001 ... images_012.
img_folder = [f"images_{index:03d}" for index in range(1, 13)]
# Filled by the next cell with the number of entries found in each folder.
folder_size = dict()

# Sanity check that the dataset root is reachable from this machine.
print(exists(root_path))

True


In [None]:
# Record how many entries each "<folder>/images" directory contains.
for folder in img_folder:
    image_dir = join(root_path, folder, "images")
    folder_size[folder] = len(os.listdir(image_dir))

print(folder_size)

### Get labels
Collect all labels and create a class-to-index mapping, then save it to a JSON file.

In [24]:
# Collect every distinct finding label, keyed in order of first appearance.
# Values are placeholders here; the class indices are assigned in the next cell.
label_map = {}

# Scan every row. The previous `[1:]` slice skipped the first data row, so a
# label that occurred only there would have been missing from the mapping.
for labels in data_entry["Finding Labels"]:
    # One image can carry several findings, separated by "|".
    for label in labels.split("|"):
        label_map.setdefault(label, None)

In [None]:
# Number the classes 0, 1, 2, ... in the order they were collected.
for class_index, label in enumerate(label_map):
    label_map[label] = class_index

# Persist the mapping so the same class indices can be reused elsewhere.
with open("label_map.json", 'w') as json_file:
    json.dump(label_map, json_file)

label_map

### Re-label image 
Assign each class an index, then label each image with a multi-hot vector (one entry per class; an image can have several findings).

In [None]:
# Keep only the two columns needed for labelling: file name and findings.
label_columns = ["Image Index", "Finding Labels"]
img_labels_df = data_entry.loc[:, label_columns]
img_labels_df.head()

In [None]:
# Drop the full metadata table since it's quite large; the labelling cells
# below only read img_labels_df.
# pop() is used instead of `del` so this cell is safe to re-run, or to run when
# the table was never loaded, without raising NameError.
import gc

globals().pop("data_entry", None)
gc.collect()

In [None]:
# Row and column counts of the label table; `rows` bounds the export loops below.
rows = len(img_labels_df)
cols = len(img_labels_df.columns)

In [None]:
# save label to train folder
# Every row except the last folder_size["images_012"] rows is written to the
# train label directory as a multi-hot vector, one .npy file per image.
train_label_path = join(root_path, "train\\class_label")
n_classes = len(label_map)  # vector length follows the class-to-index mapping
for i in tqdm(range(rows - folder_size["images_012"])):
    # Read the file name first: the existence check below needs the CURRENT
    # image. Previously it used the name from the preceding iteration, which
    # was undefined on the first pass and always existed on the second.
    img_name = img_labels_df.iloc[i, 0]  # row, col

    # np.save appends ".npy" to the name. Skip labels that are already on disk
    # so an interrupted run can be resumed; `continue` (not `break`) keeps the
    # remaining images from being dropped.
    if exists(join(train_label_path, img_name + ".npy")):
        continue

    one_label = np.zeros(n_classes)
    labels = img_labels_df.iloc[i, 1]
    for class_ in labels.split('|'):
        one_label[label_map[class_]] = 1

    np.save(join(train_label_path, img_name), one_label)

In [None]:
# save label to test folder
# The last folder_size["images_012"] rows are written to the test label
# directory as multi-hot vectors, one .npy file per image.
test_label_path = join(root_path, "test\\class_label")
n_classes = len(label_map)  # vector length follows the class-to-index mapping
for i in tqdm(range(rows - folder_size["images_012"], rows)):
    # Read the file name first: the existence check below needs the CURRENT
    # image. Previously it used the name from the preceding iteration, which
    # was stale or undefined on the first pass and always existed afterwards.
    img_name = img_labels_df.iloc[i, 0]  # row, col

    # np.save appends ".npy" to the name. Skip labels that are already on disk
    # so an interrupted run can be resumed; `continue` (not `break`) keeps the
    # remaining images from being dropped.
    if exists(join(test_label_path, img_name + ".npy")):
        continue

    one_label = np.zeros(n_classes)
    labels = img_labels_df.iloc[i, 1]
    for class_ in labels.split('|'):
        one_label[label_map[class_]] = 1

    np.save(join(test_label_path, img_name), one_label)

### Process age, gender, VA
Normalize the age and encode the categorical variables (gender and view position) as numbers.

In [4]:
# Read the three metadata columns straight from the CSV instead of slicing
# `data_entry`: an earlier cell deletes `data_entry`, so slicing it fails when
# the notebook is run top to bottom. A freshly read frame can also be modified
# by the following cells without a SettingWithCopyWarning.
agv = pd.read_csv(
    "Data_Entry_2017.csv",
    sep=";",
    usecols=["Patient Age", "Patient Gender", "View Position"],
)
agv.head()

Unnamed: 0,Patient Age,Patient Gender,View Position
0,58,M,PA
1,58,M,PA
2,58,M,PA
3,81,M,PA
4,81,F,PA


In [17]:
#normalize age: z-score using the population standard deviation (np.std default)
age_values = agv["Patient Age"].to_numpy(dtype=float)
mean_ = np.mean(age_values)
std_ = np.std(age_values)
# Vectorized instead of a Python-level loop. The array is assigned positionally;
# wrapping the values in a new pd.Series would align them on index labels and
# yield NaN for any row whose label is not in the default 0..n-1 range.
agv["Patient Age"] = (age_values - mean_) / std_

In [5]:
# Replace the category strings with integer codes, one column at a time.
# Values not listed in a table are left unchanged.
gender_codes = {"M": 0, "F": 1}
view_codes = {"PA": 0, "AP": 1}

for column, codes in (("Patient Gender", gender_codes), ("View Position", view_codes)):
    for raw_value, code in codes.items():
        agv.loc[agv[column] == raw_value, column] = code

In [48]:
# Show the first 20 rows of agv to check the transformed columns.
agv.head(20)

Unnamed: 0,Patient Age,Patient Gender,View Position
0,0.659064,0,0
1,0.659064,0,0
2,0.659064,0,0
3,2.024872,0,0
4,2.024872,1,0
5,1.609191,1,0
6,1.668574,1,0
7,1.727957,1,0
8,1.78734,1,0
9,1.846723,1,0


In [None]:
 #add age, gender, VA to 384 features extracted from dinov2
count = 0

train_data_path = "G:\\Code\\Dataset\\archive\\train\\img_feature"
test_data_path = "G:\\Code\\Dataset\\archive\\test\\img_feature"

# Length of a finished vector: 384 image features + the 3 agv columns.
n_total_features = 387


def _append_metadata(folder, feature_dir, first_row):
    """Append one agv row to each saved feature vector of an image folder.

    Args:
        folder: name of an image folder under root_path.
        feature_dir: directory holding one "<image name>.npy" file per image.
        first_row: positional index in agv of the folder's first image.

    Returns:
        The positional agv index that follows the folder's last image.
    """
    row_idx = first_row
    # os.listdir returns names in arbitrary order; sorting makes the pairing
    # of images with agv rows deterministic.
    # NOTE(review): assumes the CSV rows follow sorted file-name order within
    # each folder - confirm.
    for img_name in tqdm(sorted(os.listdir(join(root_path, folder, "images")))):
        feature_file = join(feature_dir, img_name + ".npy")
        features = np.load(feature_file)
        # A vector that already has the full length was extended by an earlier
        # run; leave it alone so re-running the cell does not fail on reshape.
        if features.size != n_total_features:
            # float32 for both splits. Without an explicit dtype a row with
            # mixed column types may come back as an object array, which
            # np.append would propagate to the saved vector.
            row = agv.iloc[row_idx, :].to_numpy(dtype=np.float32)
            extended = np.append(features, row).reshape((1, n_total_features))
            # feature_file already ends in ".npy", so the original file is overwritten.
            np.save(feature_file, extended)
        row_idx += 1
    return row_idx


# All folders except the last one feed the train features ...
for folder in img_folder[:-1]:
    count = _append_metadata(folder, train_data_path, count)

# ... and the last folder feeds the test features.
count = _append_metadata(img_folder[-1], test_data_path, count)

print(count)

### Evaluate

In [35]:
# Saved predictions and ground truth are flattened to one row per sample:
# batch_n batches of 32 samples, 15 class scores each.
batch_n = 3281
flat_shape = (32 * batch_n, 15)

omg_pred = np.load(r"model_eval/omg_pred.npy").reshape(flat_shape)
omg_truth = np.load(r'model_eval/omg_truth.npy').reshape(flat_shape)

In [36]:
# Binarize the scores: a class counts as predicted when its score exceeds the threshold.
threshold = 0.25
predicted_labels = omg_pred > threshold

In [37]:
# Convert the boolean mask to 0.0 / 1.0 floats.
predicted_labels = predicted_labels.astype(np.float64)

In [40]:
# Print the first ten prediction vectors for a quick visual check.
for row_idx in range(10):
    prediction_row = predicted_labels[row_idx]
    print(prediction_row)

[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [38]:
from eval import IoU_accuracy, Hamming_Loss, Recall, Precision, F1Measure

# Print each metric from the local eval module on the thresholded predictions,
# one value per line, in this order.
for metric in (IoU_accuracy, Hamming_Loss, Recall, Precision, F1Measure):
    print(metric(omg_truth, predicted_labels))

0.5349603242635719
0.08298410037590166
0.5778030064573367
0.543530508598097
0.5529522135239471


In [39]:
from sklearn.metrics import classification_report

# Order the class names by their index so each report row lines up with its
# label column, independent of the dict's insertion order.
class_names = sorted(label_map, key=label_map.get)

# zero_division=0 still reports 0.0 for classes that were never predicted, but
# without emitting an undefined-metric warning for each of them.
print(
    classification_report(
        omg_truth,
        predicted_labels,
        output_dict=False,
        target_names=class_names,
        zero_division=0,
    )
)

                    precision    recall  f1-score   support

      Cardiomegaly       0.00      0.00      0.00      2555
         Emphysema       0.00      0.00      0.00      2272
          Effusion       0.32      0.22      0.26     12339
        No Finding       0.55      0.98      0.71     56716
            Hernia       0.00      0.00      0.00       210
      Infiltration       0.34      0.31      0.32     18538
              Mass       0.00      0.00      0.00      5423
            Nodule       0.00      0.00      0.00      5932
       Atelectasis       0.18      0.01      0.03     10811
      Pneumothorax       0.00      0.00      0.00      4775
Pleural_Thickening       0.00      0.00      0.00      3162
         Pneumonia       0.00      0.00      0.00      1359
          Fibrosis       0.00      0.00      0.00      1647
             Edema       0.00      0.00      0.00      2208
     Consolidation       0.10      0.00      0.00      4365

         micro avg       0.51      0.4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
