In [None]:
import numpy as np
import pandas as pd
import os
import random 
from shutil import copyfile

In [None]:
# set parameters here
seed = 0
# Seed both the stdlib and the NumPy generators so all runs are the same.
random.seed(seed)
np.random.seed(seed)
MAXVAL = 255  # Range [0 255]

# covid-19 dataset, from https://github.com/ieee8023/covid-chestxray-dataset
imgpath = 'covid-chestxray-dataset/images'
csvpath = 'covid-chestxray-dataset/metadata.csv'

# kaggle chest xray data, from https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia
data_path = 'chest_xray'

# parameters for COVIDx dataset
# `train` / `test` collect the samples of each subset; the two count dicts
# track how many samples of every label were added to each subset.
train, test = [], []
split = 0.1  # train/test split
test_count = {label: 0 for label in ('normal', 'viral', 'bacteria', 'COVID-19')}
train_count = {label: 0 for label in ('normal', 'viral', 'bacteria', 'COVID-19')}

In [None]:
def save():
    """Append the collected train/test samples to the split text files.

    Every sample of the module-level ``train`` list is written to
    ``train_split.txt`` and every sample of ``test`` to ``test_split.txt``,
    one line per sample formatted as ``patientid filename label`` (separated
    by single spaces).

    The files are opened in append mode: existing lines are kept, so calling
    this twice with the same lists writes the samples twice.
    """
    for filepath, samples in (("train_split.txt", train), ("test_split.txt", test)):
        # `with` guarantees the handle is closed even if a write fails.
        with open(filepath, "a") as split_file:
            for sample in samples:
                # export format: patientid, filename, label, separated by a space
                split_file.write(
                    str(sample[0]) + ' ' + str(sample[1]) + ' ' + str(sample[2]) + '\n'
                )

In [None]:
# adapted from https://github.com/mlmed/torchxrayvision/blob/master/torchxrayvision/datasets.py#L814
# Read the metadata table and drop every row whose view is not PA.
csv = pd.read_csv(csvpath, nrows=None)
idx_pa = csv["view"] == "PA"
csv = csv.loc[idx_pa]

pneumonias = ["COVID-19", "SARS", "MERS", "ARDS", "Streptococcus"]
pathologies = sorted(
    ["Pneumonia", "Viral Pneumonia", "Bacterial Pneumonia", "No Finding", *pneumonias]
)

# Finding (as written in the metadata) -> COVIDx label.
# Findings without an entry here are ignored when the dataset is assembled.
mapping = {
    'COVID-19': 'COVID-19',
    'SARS': 'viral',
    'MERS': 'viral',
    'Streptococcus': 'bacteria',
}

In [None]:
# get non-COVID19 viral, bacteria, and COVID-19 infections from covid-chestxray-dataset
# each entry is stored as [patient id, image filename, label]
filename_label = {'normal': [], 'viral': [], 'bacteria': [], 'COVID-19': []}
count = {'normal': 0, 'viral': 0, 'bacteria': 0, 'COVID-19': 0}
for _, row in csv.iterrows():
    label = mapping.get(row['finding'])
    if label is None:
        # finding has no COVIDx label -> not part of the dataset
        continue
    count[label] += 1
    filename_label[label].append(
        [int(row['Patientid']), os.path.join(imgpath, row['filename']), label]
    )

print('Data distribution from covid-chestxray-dataset:')
print(count)

In [None]:
# add covid-chestxray-dataset into COVIDx dataset
# since covid-chestxray-dataset doesn't have test dataset
# split into train/test by patientid
# for COVIDx:
# patient 8 is used as non-COVID19 viral test
# patient 31 is used as bacterial test
# patients 19, 20, 36, 42, 86 are used as COVID-19 viral test
# NOTE(review): the ids listed above were recorded for an earlier run; the
# actual selection is printed below and depends on the seed and the metadata.

for key in filename_label.keys():
    # rows are [patientid, filename, label]; the mixed int/str entries are
    # all converted to strings by np.array
    arr = np.array(filename_label[key])
    if arr.size == 0:
        continue
    # split by patients
    patient_ids = np.unique(arr[:, 0])
    num_diff_patients = len(patient_ids)
    num_test = max(1, round(split * num_diff_patients))
    # select num_test random patients; sample from the *distinct* ids so a
    # patient with several images is neither favoured nor drawn twice
    test_patients = random.sample(list(patient_ids), num_test)
    print('Key: ', key)
    print('Test patients: ', test_patients)
    # go through all the images and route each one by its patient
    for patient in arr:
        if patient[0] in test_patients:
            test.append(patient)
            test_count[patient[2]] += 1
        else:
            train.append(patient)
            train_count[patient[2]] += 1

print('test count: ', test_count)
print('train count: ', train_count)

In [None]:
# add kaggle chest xray data into the COVIDx train/test lists
folders = ['train', 'val', 'test']


def _without_prefix(name, prefix):
    """Return `name` with a leading `prefix` removed (unchanged if absent)."""
    # str.strip(chars) removes a *set of characters* from both ends, which can
    # eat leading digits of the patient id; cut the exact prefix instead.
    if name.startswith(prefix):
        return name[len(prefix):]
    return name


def _kaggle_sample(folder, subdir, img):
    """Build the [patientid, filepath, label] entry for one kaggle image.

    `subdir` is 'NORMAL' or 'PNEUMONIA'. NORMAL names look like
    'IM-<id>-...' or 'NORMAL2-IM-<id>-...', PNEUMONIA names like
    'person<id>_<bacteria|virus>_...'.
    """
    if subdir == 'NORMAL':
        stem = _without_prefix(_without_prefix(img, 'NORMAL2-IM-'), 'IM-')
        kaggle_id = stem.split('-')[0]
        label = 'normal'
    else:
        kaggle_id = _without_prefix(img, 'person').split('_')[0]
        label = 'bacteria' if 'bacteria' in img else 'viral'
    # add 1000 in front of kaggle patient ids
    return ['1000' + kaggle_id, os.path.join(data_path, folder, subdir, img), label]


# train, val, test data: kaggle train and val go to the train list,
# kaggle test goes to the test list
for folder in folders:
    if folder in ('train', 'val'):
        target, target_count = train, train_count
    else:
        target, target_count = test, test_count
    # normal images first, then pneumonia images
    for subdir in ('NORMAL', 'PNEUMONIA'):
        # os.listdir returns names in arbitrary order; sort for reproducible output
        for img in sorted(os.listdir(os.path.join(data_path, folder, subdir))):
            if '.jp' not in img:
                continue
            sample = _kaggle_sample(folder, subdir, img)
            target.append(sample)
            target_count[sample[2]] += 1
                

In [None]:
# final stats: per-label counts and total number of samples of each subset
print('Final stats')
for caption, stat in (
    ('Train count: ', train_count),
    ('Test count: ', test_count),
    ('Total length of train: ', len(train)),
    ('Total length of test: ', len(test)),
):
    print(caption, stat)

In [None]:
# run this cell when adding in new covid data from covid-chestxray-dataset

# load in current train/test information
def new_data(savepath=None):
    """Add entries of `filename_label` that are not yet in the split files.

    Reads ``train_split.txt`` and ``test_split.txt`` (lines formatted as
    ``patientid filename label``), then for every entry of the module-level
    ``filename_label`` whose filename is in neither file:

    * 'normal', 'bacteria' and 'viral' entries are added to ``train``;
    * other entries follow their patient if that patient id is already listed
      in the train or the test file;
    * entries of patients not seen before are split by patient: a fraction
      ``split`` of the distinct new patients (at least one) is drawn at random
      for ``test``, the rest goes to ``train``.

    ``train``, ``test``, ``train_count`` and ``test_count`` are updated in
    place and ``save()`` is called at the end.

    Args:
        savepath: optional directory holding 'train' and 'test' sub-folders.
            When given, every added image is copied into the matching
            sub-folder. If omitted, a module-level ``savepath`` is used when
            one exists; otherwise no files are copied.

    Raises:
        FileNotFoundError: if one of the two split files does not exist.
    """
    if savepath is None:
        savepath = globals().get('savepath')

    def _read_split(filepath):
        """Return (patient ids, filenames) listed in one split file."""
        patient_ids, filenames = set(), set()
        with open(filepath, 'r') as split_file:
            for line in split_file:
                line = line.strip()
                if not line:
                    continue
                # first field is the patient id, last field the label,
                # everything in between the filename
                patient_id, _, rest = line.partition(' ')
                filename, _, _label = rest.rpartition(' ')
                patient_ids.add(patient_id)
                filenames.add(filename)
        return patient_ids, filenames

    def _assign(entry, subset):
        """Record `entry` in the 'train' or 'test' subset (and copy its image)."""
        if savepath is not None:
            # entry[1] already is the full path of the source image
            source = str(entry[1])
            copyfile(source, os.path.join(savepath, subset, os.path.basename(source)))
        if subset == 'test':
            test.append(entry)
            test_count[entry[2]] += 1
        else:
            train.append(entry)
            train_count[entry[2]] += 1

    # load in current train/test information
    train_ids, train_names = _read_split('train_split.txt')
    test_ids, test_names = _read_split('test_split.txt')
    known_names = train_names | test_names

    # find the new entries in csv
    new_entries = []
    for key in filename_label.keys():
        arr = np.array(filename_label[key])
        if arr.size == 0:
            continue
        for patient in arr:
            if str(patient[1]) in known_names:
                continue
            # if key is normal, bacteria or viral add to train
            if key in ['normal', 'bacteria', 'viral']:
                _assign(patient, 'train')
            else:
                new_entries.append(patient)

    if new_entries:
        # insert patients who are already in dataset into the respective train/test
        unassigned = []
        for patient in new_entries:
            patient_id = str(patient[0])
            if patient_id in train_ids:
                _assign(patient, 'train')
            elif patient_id in test_ids:
                _assign(patient, 'test')
            else:
                unassigned.append(patient)

        # 10% of the new patients should go into test; nothing to draw when
        # every new entry belonged to an already known patient
        if unassigned:
            candidate_ids = sorted({str(patient[0]) for patient in unassigned})
            num_test = max(1, round(split * len(candidate_ids)))
            # sample distinct patient ids so no patient is drawn twice
            test_patients = random.sample(candidate_ids, num_test)
            print('test patients: ', test_patients)
            for patient in unassigned:
                if str(patient[0]) in test_patients:
                    _assign(patient, 'test')
                else:
                    _assign(patient, 'train')

        print('added test count: ', test_count)
        print('added train count: ', train_count)
    save()
