In [1]:
import argparse

# Command-line style configuration for the input datasets and the output directory.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset_path', type=str, default='dataset_partition',
                    help='Directory where the generated train/test pickle files are written.')
parser.add_argument('--covid_chestxray_path', type=str, default='data/covid-chestxray-dataset',
                    help='Root directory containing metadata.csv and the images/ folder.')
parser.add_argument('--chest_xray_pneumonia', type=str, default='data/chest-xray-pneumonia',
                    help='Root directory containing chest_xray/train/NORMAL and chest_xray/test/NORMAL.')

# An explicit empty argument list makes argparse ignore sys.argv and fall back
# to the defaults declared above.
args = parser.parse_args([])

In [2]:
import pdb, os, random
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

# makedir
def make_dir(dirname, rm=False):
    """Create ``dirname`` (including parents) if it does not exist.

    Args:
        dirname: Path of the directory to create.
        rm: When True and ``dirname`` already exists, delete it together with
            its contents and recreate it empty. When False an existing
            directory is left untouched.
    """
    # Imported locally because the notebook's import cell does not bring in shutil.
    import shutil

    if not os.path.exists(dirname):
        os.makedirs(dirname)
    elif rm:
        print('rm and mkdir ', dirname)
        shutil.rmtree(dirname)
        os.makedirs(dirname)
        
# Ensure the output directory for the generated dataset files exists.
make_dir(args.dataset_path)

## Covid-19 and Normal Chest X-Ray combined

In [3]:
def read_covid_dataset():
    """Load the COVID-19 X-ray rows from the covid-chestxray metadata file.

    Reads ``metadata.csv`` under ``args.covid_chestxray_path``, keeps rows whose
    ``finding`` contains ``'COVID-19'`` and whose ``modality`` contains
    ``'X-ray'``, rewrites ``filename`` to a path inside the ``images``
    directory, and drops rows with a duplicate ``filename``.

    Returns:
        list[dict]: One dict per remaining metadata row (all CSV columns),
        with ``filename`` holding the joined image path.
    """
    metadata_path = os.path.join(args.covid_chestxray_path, 'metadata.csv')
    images_dir = os.path.join(args.covid_chestxray_path, 'images')
    metadata = pd.read_csv(metadata_path)

    # na=False: rows with a missing finding/modality are treated as non-matches
    # instead of producing NaN in the mask, which boolean indexing rejects.
    is_covid = metadata['finding'].str.contains('COVID-19', na=False)
    is_xray = metadata['modality'].str.contains('X-ray', na=False)

    # Work on an explicit copy so the column assignment below does not write
    # into a view of the original frame (SettingWithCopyWarning).
    selected = metadata[is_covid & is_xray].copy()
    selected['filename'] = selected['filename'].apply(lambda name: os.path.join(images_dir, name))
    selected = selected.drop_duplicates(subset=['filename'])
    return selected.to_dict(orient='records')

In [4]:
def read_chest_xray_pneumonia():
    """List the NORMAL images of the chest-xray-pneumonia train and test splits.

    Returns:
        tuple[list[str], list[str]]: ``(train_files, test_files)``, each a
        sorted list of file paths found directly inside
        ``chest_xray/train/NORMAL`` and ``chest_xray/test/NORMAL`` under
        ``args.chest_xray_pneumonia``. Sub-directories are ignored.
    """
    def get_files(dirpath):
        # os.listdir returns entries in arbitrary order; sorting makes the
        # result stable so downstream sampling can be reproduced.
        candidates = (os.path.join(dirpath, name) for name in sorted(os.listdir(dirpath)))
        return [path for path in candidates if os.path.isfile(path)]

    images_dir_train = os.path.join(args.chest_xray_pneumonia, 'chest_xray/train/NORMAL')
    images_dir_test = os.path.join(args.chest_xray_pneumonia, 'chest_xray/test/NORMAL')
    train_files = get_files(images_dir_train)
    test_files = get_files(images_dir_test)
    return train_files, test_files

In [5]:
def create_dataset(seed=None):
    """Build class-balanced train/test record lists.

    COVID-19 records (label 1) are split 80/20 into train and test. For each
    split an equal number of NORMAL images (label 0) is sampled without
    replacement from the corresponding chest-xray-pneumonia split.

    Args:
        seed: Optional integer. When given, both the train/test split and the
            negative sampling are seeded so the result is reproducible. When
            None (default) the split is unseeded and sampling uses the
            module-level ``random`` generator, as before.

    Returns:
        tuple[list[dict], list[dict]]: ``(train, test)``; every record is
        ``{'filename': <path>, 'label': 0 or 1}``, positives first.

    Raises:
        ValueError: If a split has fewer NORMAL images than COVID-19 images.
    """
    positive_all = read_covid_dataset()
    negative_train, negative_test = read_chest_xray_pneumonia()
    positive_train, positive_test = train_test_split(
        positive_all, train_size=0.8, shuffle=True, random_state=seed)

    # Without a seed keep drawing from the global generator so an earlier
    # random.seed(...) call in the notebook still takes effect.
    rng = random if seed is None else random.Random(seed)

    def _balanced_records(positives, negatives, split_name):
        # One positive record per COVID row plus the same number of sampled negatives.
        if len(negatives) < len(positives):
            raise ValueError(
                'Not enough negative samples for the %s split: need %d, have %d'
                % (split_name, len(positives), len(negatives)))
        sampled = rng.sample(negatives, len(positives))
        records = [{'filename': item['filename'], 'label': 1} for item in positives]
        records.extend({'filename': path, 'label': 0} for path in sampled)
        return records

    train = _balanced_records(positive_train, negative_train, 'train')
    test = _balanced_records(positive_test, negative_test, 'test')
    return train, test

def test_pkl(trainfname='train_new.pkl', testfname='test_new.pkl'):
    """Load the train and test pickles and preview their contents.

    For each file under ``args.dataset_path`` (train first, then test) the
    unpickled list is shuffled in place, its length is printed and the first
    ten entries are displayed.

    Args:
        trainfname: File name of the training pickle.
        testfname: File name of the test pickle.
    """
    # Resolve both paths up front, then process them in order.
    pkl_paths = [os.path.join(args.dataset_path, name) for name in (trainfname, testfname)]
    for pkl_path in pkl_paths:
        with open(pkl_path, 'rb') as handle:
            records = pickle.load(handle)
            random.shuffle(records)
            print(len(records))
            display(records[:10])

def generate_dataset_files():
    """Create the dataset and write it to ``train_new.pkl`` / ``test_new.pkl``.

    The files are written inside ``args.dataset_path`` and overwrite any
    existing files with the same names.
    """
    train, test = create_dataset()
    outputs = (('train_new.pkl', train), ('test_new.pkl', test))
    for fname, records in outputs:
        # Context manager guarantees the handle is flushed and closed.
        with open(os.path.join(args.dataset_path, fname), 'wb') as f:
            pickle.dump(records, f)
    

In [6]:

# Regenerate the train/test pickle files, then load them back and preview a
# shuffled sample of each. Comment out generate_dataset_files() to keep the
# existing files: the split is sampled randomly, so each run can differ.
generate_dataset_files()
test_pkl()

806




[{'filename': 'data/covid-chestxray-dataset/images/covid-19-pneumonia-evolution-over-a-week-1-day0-PA.jpg',
  'label': 1},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/train/NORMAL/NORMAL2-IM-1333-0001.jpeg',
  'label': 0},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/train/NORMAL/NORMAL2-IM-1027-0001.jpeg',
  'label': 0},
 {'filename': 'data/covid-chestxray-dataset/images/article_river_de7471906e0011eabe5f9363acaf45c4-covid-cxr-2.png',
  'label': 1},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/train/NORMAL/NORMAL2-IM-0530-0001.jpeg',
  'label': 0},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/train/NORMAL/IM-0411-0001.jpeg',
  'label': 0},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/train/NORMAL/NORMAL2-IM-0397-0001.jpeg',
  'label': 0},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/train/NORMAL/NORMAL2-IM-0481-0001.jpeg',
  'label': 0},
 {'filename': 'data/covid-chestxray-dataset/images/A7E260CE-8A00-4C5F-A7F5-27336527A981.jpeg',
  'label': 

202


[{'filename': 'data/covid-chestxray-dataset/images/covid-19-pneumonia-progression-and-regression-day13.jpg',
  'label': 1},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/test/NORMAL/NORMAL2-IM-0303-0001.jpeg',
  'label': 0},
 {'filename': 'data/covid-chestxray-dataset/images/all14238-fig-0001-m-c.jpg',
  'label': 1},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/test/NORMAL/NORMAL2-IM-0098-0001.jpeg',
  'label': 0},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/test/NORMAL/IM-0075-0001.jpeg',
  'label': 0},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/test/NORMAL/NORMAL2-IM-0323-0001.jpeg',
  'label': 0},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/test/NORMAL/NORMAL2-IM-0343-0001.jpeg',
  'label': 0},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/test/NORMAL/NORMAL2-IM-0302-0001.jpeg',
  'label': 0},
 {'filename': 'data/chest-xray-pneumonia/chest_xray/test/NORMAL/IM-0028-0001.jpeg',
  'label': 0},
 {'filename': 'data/chest-xray-pneumonia/chest_xr