In [1]:
import sys, os, time

import numpy as np
import pandas as pd

In [2]:
# Root directory of the downloaded data; edit this placeholder before running.
# Later cells look for images under BASE_DIR + "covidx/train/" and
# BASE_DIR + "covidx/test/". Keep the trailing "/": some cells below build
# paths by plain string concatenation rather than os.path.join.
BASE_DIR = "PATH/TO/WHERE/YOU/DOWNLOADED/THE/DATA/"

In [3]:
def get_split(fn):
    """Parse a split listing file into a DataFrame.

    Every non-blank line of the file must hold exactly four fields separated
    by single spaces, in the order ``patientid filename label dataset``.
    Blank (or whitespace-only) lines are skipped.

    Parameters
    ----------
    fn : str or path-like
        Path of the listing file to read.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, with the string columns ``patientid``,
        ``filename``, ``label`` and ``dataset`` (in that order) and a default
        integer index. An empty file yields an empty frame that still carries
        those four columns.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly four fields.
    """
    column_names = ["patientid", "filename", "label", "dataset"]

    records = []
    with open(fn, "r") as f:
        for line_number, raw_line in enumerate(f, start=1):
            stripped = raw_line.strip()
            if not stripped:
                # Tolerate blank lines (and a completely empty file) instead of
                # failing later with an opaque tuple-unpacking error.
                continue
            fields = stripped.split(" ")
            if len(fields) != len(column_names):
                raise ValueError(
                    "%s:%d: expected %d space-separated fields, got %d: %r"
                    % (fn, line_number, len(column_names), len(fields), stripped)
                )
            records.append(fields)

    # Passing `columns` explicitly keeps the schema stable even with zero rows.
    return pd.DataFrame(records, columns=column_names)

In [4]:
# Parse the train and test listings; both are read relative to the current
# working directory.
split_files = {"train": "train_split.txt", "test": "test_split.txt"}
train_split = get_split(split_files["train"])
test_split = get_split(split_files["test"])

In [5]:
# Number of training rows for each (label, dataset) combination.
train_by_label = train_split.groupby("label")
train_by_label["dataset"].value_counts()

label      dataset
COVID-19   cohen       429
           sirm         29
           actmed       25
           fig1         24
normal     rsna       7966
pneumonia  rsna       5423
           cohen        46
Name: dataset, dtype: int64

In [6]:
# Number of test rows for each (label, dataset) combination.
test_by_label = test_split.groupby("label")
test_by_label["dataset"].value_counts()

label      dataset
COVID-19   cohen       39
           actmed      33
           sirm        17
           fig1        11
normal     rsna       885
pneumonia  rsna       589
           cohen        5
Name: dataset, dtype: int64

In [7]:
# Verify that every file named in the training listing exists under
# BASE_DIR/covidx/train/; a failing assert reports the missing path.
train_dir = os.path.join(BASE_DIR, "covidx/train/")
for image_name in train_split["filename"].values:
    candidate = os.path.join(train_dir, image_name)
    assert os.path.exists(candidate), candidate

In [8]:
# Verify that every file named in the test listing exists under
# BASE_DIR/covidx/test/; a failing assert reports the missing path.
test_dir = os.path.join(BASE_DIR, "covidx/test/")
for image_name in test_split["filename"].values:
    candidate = os.path.join(test_dir, image_name)
    assert os.path.exists(candidate), candidate

In [9]:
# Record the full on-disk path of every training image. os.path.join is used
# instead of string concatenation so the path is correct whether or not
# BASE_DIR ends with a path separator.
train_image_dir = os.path.join(BASE_DIR, "covidx/train/")
train_split["original_image_path"] = [
    os.path.join(train_image_dir, fn) for fn in train_split["filename"]
]

# Sanity check: each recorded path must exist; the offending path is reported
# when the assertion fails.
for image_path in train_split["original_image_path"].values:
    assert os.path.exists(image_path), image_path

In [10]:
# Record the full on-disk path of every test image. os.path.join is used
# instead of string concatenation so the path is correct whether or not
# BASE_DIR ends with a path separator.
test_image_dir = os.path.join(BASE_DIR, "covidx/test/")
test_split["original_image_path"] = [
    os.path.join(test_image_dir, fn) for fn in test_split["filename"]
]

# Sanity check: each recorded path must exist; the offending path is reported
# when the assertion fails.
for image_path in test_split["original_image_path"].values:
    assert os.path.exists(image_path), image_path

In [11]:
# Stack the train rows on top of the test rows into a single frame
# (each frame's original row labels are carried over unchanged).
split_frames = [train_split, test_split]
merged = pd.concat(split_frames)

In [12]:
# Replace the row labels with a fresh 0..n-1 range, discarding the old ones.
merged = merged.reset_index(drop=True)

In [13]:
# Number of rows for each (label, dataset) combination across both splits.
merged_by_label = merged.groupby("label")
merged_by_label["dataset"].value_counts()

label      dataset
COVID-19   cohen       468
           actmed       58
           sirm         46
           fig1         35
normal     rsna       8851
pneumonia  rsna       6012
           cohen        51
Name: dataset, dtype: int64

In [14]:
# Keep only the first row encountered for each patient (rows stay in their
# existing order), then renumber the rows 0..n-1. drop_duplicates selects rows
# by position, so the result does not depend on the frame's current row labels.
merged = (
    merged
    .drop_duplicates(subset="patientid", keep="first")
    .reset_index(drop=True)
)

# Invariant after de-duplication: one row per patient.
assert merged["patientid"].is_unique

In [15]:
# Number of rows for each (label, dataset) combination after keeping one row
# per patient.
deduped_by_label = merged.groupby("label")
deduped_by_label["dataset"].value_counts()

label      dataset
COVID-19   cohen       289
           actmed       51
           sirm         46
           fig1         35
normal     rsna       8851
pneumonia  rsna       6012
           cohen        28
Name: dataset, dtype: int64

In [16]:
# Write the merged metadata table as CSV, without the row-label column.
output_csv = "../data/metadata_covidx.csv"
merged.to_csv(output_csv, index=False)

In [17]:
# Delete the two split listings from the working directory.
# NOTE: these are the files parsed by get_split earlier in this notebook, so
# they must be restored before the notebook can be run again from the top.
for listing in ("train_split.txt", "test_split.txt"):
    os.remove(listing)