In [1]:
# Mount Google Drive at /content/drive (Colab only); every dataset path used
# further down in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd /content/drive/.shortcut-targets-by-id/1wKSfO3aS4r8zRFUMxOvzXeHIfFXN1QGU/Classification2D/chest-X-rays

/content/drive/.shortcut-targets-by-id/1wKSfO3aS4r8zRFUMxOvzXeHIfFXN1QGU/Classification2D/chest-X-rays


In [16]:
import numpy as np 
import pandas as pd 
import random 

import os 
import glob 
import itertools
from itertools import chain  

from classifier.utils.file_utils import read_text_file

**NIH Dataset**

In [4]:
# Read the NIH metadata table (one row per image, labels in "Finding Labels").
nih_csv_path = "/content/drive/MyDrive/Classification2D/NIH-Chest-X-ray-Dataset/Data_Entry_2017.csv"
nih_data = pd.read_csv(nih_csv_path)
nih_data

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,
...,...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,0.168,
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,0.168,
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,0.168,
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,0.168,


In [6]:
# Rename the label and file-name columns to `class_name` / `image_id`,
# the same column names the VinBigData train.csv uses.
nih_column_aliases = {"Finding Labels": "class_name", "Image Index": "image_id"}
nih_data = nih_data.rename(columns=nih_column_aliases)
nih_data.head()

Unnamed: 0,image_id,class_name,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [7]:
# Each row stores its labels as a "|"-separated string; split them, pool the
# labels of all rows, and keep the sorted distinct names.
label_lists = nih_data["class_name"].str.split("|").values
all_labels = np.unique(np.concatenate(label_lists))
# "No Finding" marks the absence of pathology, so it is left out of the class list.
nih_classes = [label for label in all_labels if label != "No Finding"]
nih_classes

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax']

Save the NIH class names

In [8]:
# Persist the NIH class list (the 14 names shown above) as a .npy file.
nih_classes_file = "/content/drive/MyDrive/Classification2D/chest-X-rays/data/nih/nih_classes_14.npy"
np.save(nih_classes_file, nih_classes)

Add the `img_path` column and the one-hot label columns

In [None]:
# The images are spread over folders images_001 … images_012; build one glob
# pattern per folder, expand them in folder order, and derive each image id
# from the file's base name.
root = "/content/drive/MyDrive/Classification2D/NIH-Chest-X-ray-Dataset"
folder_path = [f"{root}/images_{folder_no:03d}/images/*" for folder_no in range(1, 13)]
img_path = list(chain.from_iterable(map(glob.glob, folder_path)))
img_id = list(map(os.path.basename, img_path))

In [None]:
# Lookup table pairing every discovered file name with its full path.
nih_path_columns = {"image_id": img_id, "img_path": img_path}
nih_data_path = pd.DataFrame(nih_path_columns)
nih_data_path.head()

In [None]:
# Attach the on-disk path to every metadata row, matching on image_id.
# how="outer" keeps rows present on only one side (metadata without a file,
# or a file without metadata); those rows carry NaN in the missing columns.
# Any img_path column left over from a previous run is dropped first so the
# cell can be re-executed: without this, a second run would produce
# img_path_x / img_path_y instead of a single img_path column.
nih_data = nih_data.drop(columns="img_path", errors="ignore").merge(
    nih_data_path, how="outer", on="image_id"
)

In [None]:
# Add one indicator column per pathology in nih_classes (1.0 = label present).
# str.get_dummies splits every "|"-separated class_name in a single vectorised
# pass, replacing the per-cell .loc loop (rows x classes scalar writes). It
# also matches whole labels rather than substrings, and rows whose class_name
# is NaN (possible after the outer merge) get all zeros instead of raising.
label_indicators = (
    nih_data["class_name"]
    .str.get_dummies(sep="|")
    # keep only the known classes, in their canonical order; "No Finding" is dropped
    .reindex(columns=nih_classes, fill_value=0)
    # float dtype matches what the previous element-wise assignment produced
    .astype(float)
)
for col in nih_classes:
    nih_data[col] = label_indicators[col]

In [None]:
# Keep only the identifier, raw label string, file path and indicator columns.
nih_kept_columns = ["image_id", "class_name", "img_path"] + list(nih_classes)
nih_data = nih_data[nih_kept_columns]
nih_data

Unnamed: 0,image_id,class_name,img_path,Atelectasis,Cardiomegaly,Consolidation,Edema,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
0,00000001_000.png,Cardiomegaly,/content/drive/MyDrive/Classification2D/NIH-Ch...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000001_001.png,Cardiomegaly|Emphysema,/content/drive/MyDrive/Classification2D/NIH-Ch...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00000001_002.png,Cardiomegaly|Effusion,/content/drive/MyDrive/Classification2D/NIH-Ch...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00000002_000.png,No Finding,/content/drive/MyDrive/Classification2D/NIH-Ch...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00000003_000.png,Hernia,/content/drive/MyDrive/Classification2D/NIH-Ch...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,/content/drive/MyDrive/Classification2D/NIH-Ch...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
112116,00030802_000.png,No Finding,/content/drive/MyDrive/Classification2D/NIH-Ch...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112117,00030803_000.png,No Finding,/content/drive/MyDrive/Classification2D/NIH-Ch...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112118,00030804_000.png,No Finding,/content/drive/MyDrive/Classification2D/NIH-Ch...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Write the labelled NIH table to Drive. The row index is written as well
# (to_csv default), so it comes back as an unnamed first column when re-read.
nih_output_csv = "/content/drive/MyDrive/Classification2D/chest-X-rays/data/nih/nih_train_val.csv"
nih_data.to_csv(nih_output_csv)

**Vinbigdata Dataset**

In [36]:
# Read the VinBigData annotation table; an image can appear on several rows
# (67914 rows vs. 15000 distinct image ids in the outputs below).
vin_csv_path = "/content/drive/MyDrive/Classification2D/vinbigdata/data/train.csv"
vin_data = pd.read_csv(vin_csv_path)
vin_data

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,,2332,2580
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,,2954,3159
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,2080,2336
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,2304,2880
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,,2540,3072
...,...,...,...,...,...,...,...,...,...,...
67909,936fd5cff1c058d39817a08f58b72cae,No finding,14,R1,,,,,2444,3200
67910,ca7e72954550eeb610fe22bf0244b7fa,No finding,14,R1,,,,,1994,2430
67911,aa17d5312a0fb4a2939436abca7f9579,No finding,14,R8,,,,,2048,2500
67912,4b56bc6d22b192f075f13231419dfcc8,Cardiomegaly,3,R8,771.0,979.0,1680.0,1311.0,1968,2040


In [37]:
# Sorted distinct label names found in the annotation table.
# NOTE(review): unlike the NIH class list, 'No finding' is kept here, so the
# array has 15 entries — confirm this asymmetry is intended.
vin_label_values = vin_data["class_name"].values
vin_classes = np.unique(vin_label_values)
vin_classes

array(['Aortic enlargement', 'Atelectasis', 'Calcification',
       'Cardiomegaly', 'Consolidation', 'ILD', 'Infiltration',
       'Lung Opacity', 'No finding', 'Nodule/Mass', 'Other lesion',
       'Pleural effusion', 'Pleural thickening', 'Pneumothorax',
       'Pulmonary fibrosis'], dtype=object)

In [38]:
# Persist the VinBigData class list as a .npy file.
# NOTE(review): the file name says "_14" but vin_classes holds 15 names
# ('No finding' included) — confirm which is intended.
vin_classes_file = "/content/drive/MyDrive/Classification2D/chest-X-rays/data/vinbigdata/vin_classes_14.npy"
np.save(vin_classes_file, vin_classes)

In [39]:
# Distinct image ids in order of first appearance, and how many there are.
image_id_lst = vin_data["image_id"].unique()
len(image_id_lst)

15000

In [40]:
# Collapse the per-annotation rows into one row per image whose class_name is
# the image's sorted, de-duplicated labels joined with "|" (same format as the
# NIH table).
# A single groupby replaces the previous loop, which re-filtered the whole
# annotation table once per image and started from a float placeholder column
# that was then overwritten with strings.
# sort=False keeps images in order of first appearance in vin_data.
vin_data_merge_classes = (
    vin_data.groupby("image_id", sort=False)["class_name"]
    .agg(lambda names: "|".join(np.unique(names)))
    .reset_index()
)
vin_data_merge_classes

Unnamed: 0,image_id,class_name
0,50a418190bc3fb1ef1633bf9678929b3,No finding
1,21a10246a5ec7af151081d0cd6d65dc9,No finding
2,9a5094b2563a1ef3ff50dc5c7ff71345,Aortic enlargement|Cardiomegaly|Pleural effusi...
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement|Cardiomegaly|Pleural thicke...
4,063319de25ce7edb9b1c6b8881290140,No finding
...,...,...
14995,7c0ddf911bdb68fca14d7172486827cb,No finding
14996,209e3a5c73c1195d805dd25d086b3c6d,No finding
14997,1bea772246421c37929f8cbc43dba729,No finding
14998,ae86eabab95525b41b8e79883ff1cef9,No finding


In [41]:
# Preview the first rows of the one-row-per-image label table.
vin_data_merge_classes.head()

Unnamed: 0,image_id,class_name
0,50a418190bc3fb1ef1633bf9678929b3,No finding
1,21a10246a5ec7af151081d0cd6d65dc9,No finding
2,9a5094b2563a1ef3ff50dc5c7ff71345,Aortic enlargement|Cardiomegaly|Pleural effusi...
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement|Cardiomegaly|Pleural thicke...
4,063319de25ce7edb9b1c6b8881290140,No finding


Add the `img_path` column and the one-hot label columns

In [42]:
# List every file in the VinBigData train folder; the image id is the file
# name up to its first "." (i.e. without the extension).
folder_path = "/content/drive/MyDrive/Classification2D/vinbigdata/data/train/*"
img_path = glob.glob(folder_path)
img_id = [os.path.basename(file_path).partition(".")[0] for file_path in img_path]
img_id[:5]

['ee0038c59a1b52cd027e0108418e0500',
 'ee04fa64f7e9a47abee0be3cfc22f5e7',
 'ee0b9e8fd3ad717bfd391f0adc3f459e',
 'ee0d65c2402ce79f2d7a66502b3ca321',
 'ee1abdde3a0e65f9e0b832dfdbbea5de']

In [43]:
# Add one indicator column per entry of vin_classes (1.0 = label present).
# str.get_dummies splits every "|"-separated class_name in a single vectorised
# pass, replacing the per-cell .loc loop (rows x classes scalar writes), and
# matches whole labels rather than substrings.
vin_label_indicators = (
    vin_data_merge_classes["class_name"]
    .str.get_dummies(sep="|")
    # fixed column set and order, even for classes absent from the data
    .reindex(columns=vin_classes, fill_value=0)
    # float dtype matches what the previous element-wise assignment produced
    .astype(float)
)
for c in vin_classes:
    vin_data_merge_classes[c] = vin_label_indicators[c]

In [44]:
# Fix the column order: identifier, raw label string, then the indicator columns.
vin_kept_columns = ["image_id", "class_name"] + list(vin_classes)
vin_data_merge_classes = vin_data_merge_classes[vin_kept_columns]

In [45]:
# Display the per-image table with its indicator columns.
vin_data_merge_classes


Unnamed: 0,image_id,class_name,Aortic enlargement,Atelectasis,Calcification,Cardiomegaly,Consolidation,ILD,Infiltration,Lung Opacity,No finding,Nodule/Mass,Other lesion,Pleural effusion,Pleural thickening,Pneumothorax,Pulmonary fibrosis
0,50a418190bc3fb1ef1633bf9678929b3,No finding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9a5094b2563a1ef3ff50dc5c7ff71345,Aortic enlargement|Cardiomegaly|Pleural effusi...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement|Cardiomegaly|Pleural thicke...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,063319de25ce7edb9b1c6b8881290140,No finding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,7c0ddf911bdb68fca14d7172486827cb,No finding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
14996,209e3a5c73c1195d805dd25d086b3c6d,No finding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
14997,1bea772246421c37929f8cbc43dba729,No finding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
14998,ae86eabab95525b41b8e79883ff1cef9,No finding,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Spot-check a multi-label row: the indicator columns should agree with class_name.
vin_data_merge_classes.loc[100]

image_id                               2d063af5457785f5c76ae1e6c06c0037
class_name            Pleural effusion|Pleural thickening|Pulmonary ...
Aortic enlargement                                                  0.0
Atelectasis                                                         0.0
Calcification                                                       0.0
Cardiomegaly                                                        0.0
Consolidation                                                       0.0
ILD                                                                 0.0
Infiltration                                                        0.0
Lung Opacity                                                        0.0
No finding                                                          0.0
Nodule/Mass                                                         0.0
Other lesion                                                        0.0
Pleural effusion                                                

In [47]:
# Lookup table pairing every discovered image id with its full path.
vin_path_columns = {"image_id": img_id, "img_path": img_path}
vin_data_path = pd.DataFrame(vin_path_columns)
vin_data_path.head()

Unnamed: 0,image_id,img_path
0,ee0038c59a1b52cd027e0108418e0500,/content/drive/MyDrive/Classification2D/vinbig...
1,ee04fa64f7e9a47abee0be3cfc22f5e7,/content/drive/MyDrive/Classification2D/vinbig...
2,ee0b9e8fd3ad717bfd391f0adc3f459e,/content/drive/MyDrive/Classification2D/vinbig...
3,ee0d65c2402ce79f2d7a66502b3ca321,/content/drive/MyDrive/Classification2D/vinbig...
4,ee1abdde3a0e65f9e0b832dfdbbea5de,/content/drive/MyDrive/Classification2D/vinbig...


In [48]:
# Attach the file path to every per-image row (outer join on image_id), then
# put img_path right after class_name, ahead of the indicator columns.
# Note that this rebinds `vin_data`, which until now held the raw annotation
# table read from train.csv.
vin_with_paths = pd.merge(vin_data_merge_classes, vin_data_path, how="outer", on="image_id")
vin_data = vin_with_paths[["image_id", "class_name", "img_path", *vin_classes]]
vin_data.head(5)

Unnamed: 0,image_id,class_name,img_path,Aortic enlargement,Atelectasis,Calcification,Cardiomegaly,Consolidation,ILD,Infiltration,Lung Opacity,No finding,Nodule/Mass,Other lesion,Pleural effusion,Pleural thickening,Pneumothorax,Pulmonary fibrosis
0,50a418190bc3fb1ef1633bf9678929b3,No finding,/content/drive/MyDrive/Classification2D/vinbig...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,/content/drive/MyDrive/Classification2D/vinbig...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,9a5094b2563a1ef3ff50dc5c7ff71345,Aortic enlargement|Cardiomegaly|Pleural effusi...,/content/drive/MyDrive/Classification2D/vinbig...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement|Cardiomegaly|Pleural thicke...,/content/drive/MyDrive/Classification2D/vinbig...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,063319de25ce7edb9b1c6b8881290140,No finding,/content/drive/MyDrive/Classification2D/vinbig...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Write the labelled VinBigData table to Drive. The row index is written as
# well (to_csv default), so it comes back as an unnamed first column when re-read.
vin_output_csv = "/content/drive/MyDrive/Classification2D/chest-X-rays/data/vinbigdata/vin_train_val.csv"
vin_data.to_csv(vin_output_csv)