This kernel does a quick analysis of the Stage 1 and Stage 2 labels and saves all patient IDs in an *.npz archive for use in later stages. You can grab the *.npz file from the "V3" output tab. Sample code to read the *.npz file is at the bottom of the kernel.

Comments welcome!

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# ---------------------------------------------------------------------------
# Notebook-wide configuration
# ---------------------------------------------------------------------------

# Detailed class info CSVs (Stage 1 archive and the current Stage 2 file).
STAGE1_DETAILED_CLASSES_CSV_FILE = "../input/rsna-stage1-archived-inputs/stage_1_detailed_class_info.csv"
DETAILED_CLASSES_CSV_FILE = "../input/rsna-pneumonia-detection-challenge/stage_2_detailed_class_info.csv"
DETAILED_CLASSES_CSV_COLUMN_NAMES = ['patientId', 'class']

# Lookup table from the textual class names to integer codes.
CLASSES_DICT = {
    'Normal': 0,
    'Lung Opacity': 1,
    'No Lung Opacity / Not Normal': 2,
}

# Training label CSVs (Stage 1 archive and the current Stage 2 file).
STAGE1_TRAIN_LABELS_CSV_FILE = "../input/rsna-stage1-archived-inputs/stage_1_train_labels.csv"
TRAIN_LABELS_CSV_FILE = "../input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv"
# Column names applied on load; the final column is called 'label' here
# rather than 'Target'.
TRAIN_LABELS_CSV_COLUMN_NAMES = ['patientId', 'x1', 'y1', 'bw', 'bh', 'label']

# Stage 1 test ids, loaded from the archived .npy file and kept sorted.
STAGE1_TEST_IDS_FILE = "../input/rsna-stage1-archived-inputs/stage1_test_ids.npy"
stage1testkeys = sorted(np.load(STAGE1_TEST_IDS_FILE))

# Sorted directory listing of the Stage 2 test images.
TEST_DIR = "../input/rsna-pneumonia-detection-challenge/stage_2_test_images"
TEST_LIST = sorted(os.listdir(TEST_DIR))

# Name of the archive this notebook writes all key lists to.
SAVED_KEYS_FILE = "rsna-stage1-and-stage2-keys.npz"

# Presumably the DICOM image edge length in pixels; not referenced by the
# cells shown here -- TODO confirm against later stages.
DICOM_IMAGE_SIZE = 1024

In [None]:
# Load the Stage 1 detailed class info into a dataframe indexed by patientId.
# header=0 skips the CSV's own header line; names= supplies the column names
# from DETAILED_CLASSES_CSV_COLUMN_NAMES instead.
classesdf = pd.read_csv(STAGE1_DETAILED_CLASSES_CSV_FILE,
                        names=DETAILED_CLASSES_CSV_COLUMN_NAMES,
                        header=0,
                        index_col='patientId')
#print (classesdf.shape)
#print (classesdf.head(n=10))

# Remember how many rows were read before duplicates are removed, so the log
# line below reports the real "lines read" figure.
stage1_rows_read = len(classesdf)

# Remove duplicates: keep one class value per patientId.
classesdf = classesdf.groupby(['patientId'])['class'].first()
# Encode the class names as integers with CLASSES_DICT. Series.map is used
# rather than Series.replace so the result does not rely on replace()
# downcasting an object column; a class name missing from CLASSES_DICT becomes
# NaN and therefore matches none of the numeric class codes.
classesdf = pd.DataFrame(classesdf.map(CLASSES_DICT), columns=['class'])
print("Stage 1:: {} lines read from {} ({} unique patientIds)".format(
    stage1_rows_read, STAGE1_DETAILED_CLASSES_CSV_FILE, len(classesdf)))

In [None]:
# Split the Stage 1 patientIds by numeric class code
# (0 = Normal, 1 = Lung Opacity, 2 = No Lung Opacity / Not Normal).
stage1allkeys = classesdf.index.tolist()
stage1normalkeys = classesdf[classesdf['class'] == 0].index.tolist()
stage1lungopacitykeys = classesdf[classesdf['class'] == 1].index.tolist()
stage1otherabnormalkeys = classesdf[classesdf['class'] == 2].index.tolist()

# Per-class counts for Stage 1.
print("################STAGE 1 SUMMARY################")
print(f"Total Training Samples: {len(stage1allkeys)}")
print(f">>Lung Opacity Samples: {len(stage1lungopacitykeys)}")
print(f">>Normal Samples: {len(stage1normalkeys)}")
print(f">>Not Normal / No Lung Opacity Samples: {len(stage1otherabnormalkeys)}")
print("##############################################")

In [None]:
# Load the Stage 2 detailed class info into a dataframe indexed by patientId.
# header=0 skips the CSV's own header line; names= supplies the column names
# from DETAILED_CLASSES_CSV_COLUMN_NAMES instead.
classesdf = pd.read_csv(DETAILED_CLASSES_CSV_FILE,
                        names=DETAILED_CLASSES_CSV_COLUMN_NAMES,
                        header=0,
                        index_col='patientId')
#print (classesdf.shape)
#print (classesdf.head(n=10))

# Remember how many rows were read before duplicates are removed, so the log
# line below reports the real "lines read" figure.
stage2_rows_read = len(classesdf)

# Remove duplicates: keep one class value per patientId.
classesdf = classesdf.groupby(['patientId'])['class'].first()
# Encode the class names as integers with CLASSES_DICT. Series.map is used
# rather than Series.replace so the result does not rely on replace()
# downcasting an object column; a class name missing from CLASSES_DICT becomes
# NaN and therefore matches none of the numeric class codes.
classesdf = pd.DataFrame(classesdf.map(CLASSES_DICT), columns=['class'])
print("Stage 2:: {} lines read from {} ({} unique patientIds)".format(
    stage2_rows_read, DETAILED_CLASSES_CSV_FILE, len(classesdf)))

In [None]:
# Split the Stage 2 patientIds by numeric class code
# (0 = Normal, 1 = Lung Opacity, 2 = No Lung Opacity / Not Normal).
allkeys = classesdf.index.tolist()
normalkeys = classesdf[classesdf['class'] == 0].index.tolist()
lungopacitykeys = classesdf[classesdf['class'] == 1].index.tolist()
otherabnormalkeys = classesdf[classesdf['class'] == 2].index.tolist()

# Per-class counts for Stage 2.
print("################STAGE 2 SUMMARY################")
print(f"Total Training Samples: {len(allkeys)}")
print(f">>Lung Opacity Samples: {len(lungopacitykeys)}")
print(f">>Normal Samples: {len(normalkeys)}")
print(f">>Not Normal / No Lung Opacity Samples: {len(otherabnormalkeys)}")
print("##############################################")

In [None]:
# Report how the per-class training counts grew from Stage 1 to Stage 2,
# alongside the number of Stage 1 test ids.
print(f"{len(stage1testkeys)} test samples from Stage 1 were distributed into Stage 2 as:")
print(f">>{len(lungopacitykeys) - len(stage1lungopacitykeys)} additional Lung Opacity Samples")
print(f">>{len(normalkeys) - len(stage1normalkeys)} additional Normal Samples")
print(f">>{len(otherabnormalkeys) - len(stage1otherabnormalkeys)} additional Not Normal / No Lung Opacity Samples")

In [None]:
# Sanity check: the Stage 2 training ids must be exactly the Stage 1 training
# ids of all three classes plus the Stage 1 test ids (compared order-free).
assert sorted(allkeys) == sorted(
    stage1normalkeys
    + stage1lungopacitykeys
    + stage1otherabnormalkeys
    + stage1testkeys
), "Keys Mismatch"

In [None]:
# Derive the test keys from the test image filenames: each key is the part of
# the filename before its first ".".
testkeys = [name.split(".")[0] for name in TEST_LIST]

print("##############################################")
print(f"Test Samples: {len(testkeys)}")
print("##############################################")

In [None]:
# Persist every key list to SAVED_KEYS_FILE for use in later stages. The
# arrays are passed positionally, so they are stored as arr_0 .. arr_7 in
# exactly this order:
#   arr_0 normalkeys          arr_4 stage1normalkeys
#   arr_1 lungopacitykeys     arr_5 stage1lungopacitykeys
#   arr_2 otherabnormalkeys   arr_6 stage1otherabnormalkeys
#   arr_3 testkeys            arr_7 stage1testkeys
np.savez(
    SAVED_KEYS_FILE,
    *(np.array(keys) for keys in (
        normalkeys,
        lungopacitykeys,
        otherabnormalkeys,
        testkeys,
        stage1normalkeys,
        stage1lungopacitykeys,
        stage1otherabnormalkeys,
        stage1testkeys,
    )),
)

In [None]:
# Sample reader / self-check: reload SAVED_KEYS_FILE and confirm that every
# stored array matches the key list it was written from.
#
# Each entry is (array name inside the archive, expected key list, message
# raised on mismatch).
saved_key_checks = (
    ('arr_0', normalkeys, "Normal Keys Mismatch"),
    ('arr_1', lungopacitykeys, "Lung Opacity Keys Mismatch"),
    ('arr_2', otherabnormalkeys, "Not Normal / No Lung Opacity Keys Mismatch"),
    ('arr_3', testkeys, "Test Keys Mismatch"),
    ('arr_4', stage1normalkeys, "Stage1 Normal Keys Mismatch"),
    ('arr_5', stage1lungopacitykeys, "Stage1 Lung Opacity Keys Mismatch"),
    ('arr_6', stage1otherabnormalkeys, "Stage1 Not Normal / No Lung Opacity Keys Mismatch"),
    ('arr_7', stage1testkeys, "Stage1 Test Keys Mismatch"),
)

# np.load on an .npz returns a file-backed object; the with-block closes it
# once the checks are done instead of leaking the handle.
with np.load(SAVED_KEYS_FILE) as npzfile:
    for arrname, expectedkeys, message in saved_key_checks:
        # Compare in stored order: sorting only the loaded side would make the
        # check fail for any key list that is not itself already sorted.
        assert npzfile[arrname].tolist() == expectedkeys, message
