In [1]:
import re
import pandas as pd
import ntpath

In [2]:
image_names = {"non-infection":list(), "covid-19 pneumonia":list(),"non-covid-19 pneumonia":list()}

### COVID-19 RADIOGRAPHY DATABASE

In [3]:
normal = pd.read_excel("NORMAL.metadata.xlsx").assign(label="non-infection")
covid19 = pd.read_excel("COVID-19.metadata.xlsx").assign(label="covid-19 pneumonia")
pneumonia = pd.read_excel("Viral Pneumonia.matadata.xlsx").assign(label="non-covid-19 pneumonia")

radiography = pd.concat([normal, covid19, pneumonia]).rename(columns={"URL":"url"})
def process_filename(filename):
    return re.sub(r"-(\d+)$", r" (\1)", filename)+'.png'
df = radiography.assign(dataset="Radiography",
                        view="AP",
                        filename=radiography["FILE NAME"].apply(process_filename))[["filename","url", "label","dataset", "view"]]


### covid-chestxray-dataset
X-ray and CT images of patients which are positive or suspected of COVID-19 or other viral and bacterial pneumonias (MERS, SARS, and ARDS.)

In [4]:
chest = pd.read_csv("covid_chestxray_metadata.csv")
chest.columns

Index(['patientid', 'offset', 'sex', 'age', 'finding', 'RT_PCR_positive',
       'survival', 'intubated', 'intubation_present', 'went_icu', 'in_icu',
       'needed_supplemental_O2', 'extubated', 'temperature', 'pO2_saturation',
       'leukocyte_count', 'neutrophil_count', 'lymphocyte_count', 'view',
       'modality', 'date', 'location', 'folder', 'filename', 'doi', 'url',
       'license', 'clinical_notes', 'other_notes'],
      dtype='object')

In [5]:
def convert_finding_to_label(finding):
    if "COVID-19" in finding:
        return "covid-19 pneumonia"
    return "non-covid-19 pneumonia"

chest= chest.assign(label=chest.finding.apply(convert_finding_to_label), dataset="covid_chestxray")

In [6]:
df = pd.concat([df, chest[["filename","url", "label", "dataset", "view"]]])
df

Unnamed: 0,filename,url,label,dataset,view
0,NORMAL (1).png,https://www.kaggle.com/paultimothymooney/chest...,non-infection,Radiography,AP
1,NORMAL (2).png,https://www.kaggle.com/paultimothymooney/chest...,non-infection,Radiography,AP
2,NORMAL (3).png,https://www.kaggle.com/paultimothymooney/chest...,non-infection,Radiography,AP
3,NORMAL (4).png,https://www.kaggle.com/paultimothymooney/chest...,non-infection,Radiography,AP
4,NORMAL (5).png,https://www.kaggle.com/paultimothymooney/chest...,non-infection,Radiography,AP
...,...,...,...,...,...
925,e0e3a6526a3fecadfca2be13242798_jumbo.jpg,https://radiopaedia.org/cases/left-lower-lobe-...,non-covid-19 pneumonia,covid_chestxray,L
926,2264f643b18b1010ec10a850f17550_jumbo.jpeg,https://radiopaedia.org/cases/silhouette-sign-...,non-covid-19 pneumonia,covid_chestxray,PA
927,800f798a58d0cbcc72eb234f192461_jumbo.jpeg,https://radiopaedia.org/cases/silhouette-sign-...,non-covid-19 pneumonia,covid_chestxray,L
928,55f5189d2c23688ac8dc1d58eb65cf_jumbo.jpg,https://radiopaedia.org/cases/left-lower-lobe-...,non-covid-19 pneumonia,covid_chestxray,PA


### Figure1-COVID-chestxray-dataset

In [7]:
def convert_figure1_finding(finding):
    if finding == "No finding":
        return "non-infection"
    elif finding == "COVID-19":
        return "covid-19 pneumonia"
    elif finding == "Pneumonia":
        return "non-covid-19 pneumonia"

In [8]:
import os
def find_extension(filename):
    path = os.path.join("..", "datasets","figure1","images", filename)
    png = path+'.png'
    jpg = path+'.jpg'
    if os.path.isfile(png):
        return png
    elif os.path.isfile(jpg):
        return jpg


In [9]:
figure1 = pd.read_csv("figure1_covid_chestxray_metadata.csv", encoding='iso-8859-1')
figure1 = figure1.assign(url='', dataset="figure1",
                         filename=figure1.patientid.apply(find_extension),
                         label=figure1.finding.apply(convert_figure1_finding)).dropna(subset=["label"])

# figure1

In [10]:
df = pd.concat([df, figure1[["filename", "url","label", "dataset", "view"]]])
df

Unnamed: 0,filename,url,label,dataset,view
0,NORMAL (1).png,https://www.kaggle.com/paultimothymooney/chest...,non-infection,Radiography,AP
1,NORMAL (2).png,https://www.kaggle.com/paultimothymooney/chest...,non-infection,Radiography,AP
2,NORMAL (3).png,https://www.kaggle.com/paultimothymooney/chest...,non-infection,Radiography,AP
3,NORMAL (4).png,https://www.kaggle.com/paultimothymooney/chest...,non-infection,Radiography,AP
4,NORMAL (5).png,https://www.kaggle.com/paultimothymooney/chest...,non-infection,Radiography,AP
...,...,...,...,...,...
48,..\datasets\figure1\images\COVID-00043a.jpg,,covid-19 pneumonia,figure1,
49,..\datasets\figure1\images\COVID-00043b.jpg,,covid-19 pneumonia,figure1,
51,..\datasets\figure1\images\COVID-00045.png,,covid-19 pneumonia,figure1,
52,..\datasets\figure1\images\COVID-00046.png,,covid-19 pneumonia,figure1,


###  Actualmed-COVID-chestxray-dataset

In [11]:
actual_med = pd.read_csv("actualmed_covid_chestxray_metadata.csv")


In [12]:
actual_med = actual_med.rename(columns={"imagename":"filename"}).assign(label=actual_med.finding.apply(convert_figure1_finding), url='', dataset='actualmed')
actual_med = actual_med[["filename", "label", "url", "dataset", "view"]].dropna(subset=["label"]).drop_duplicates(subset=["filename"])
df =pd.concat([actual_med, df])

In [13]:
def process_view(x):
    if type(x) == str:
        return x.upper()
    return None
df = df.assign(view=df.view.apply(process_view))

In [14]:
kaggle = df[df.url=="https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia"]
non_url = df[df.url=='']
non_kaggle = df[df.url!='https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia']

## merge the datasets

In [27]:
merged = pd.concat([non_kaggle.drop_duplicates(subset=['url']), kaggle, non_url])
merged = merged.assign(imagename=merged.filename.apply(ntpath.basename)).drop("filename", axis=1).drop_duplicates()
merged.index = pd.RangeIndex(len(merged))
merged

Unnamed: 0,label,url,dataset,view,imagename
0,non-infection,,actualmed,PA,CR.1.2.840.113564.1722810170.20200317090830828...
1,covid-19 pneumonia,https://www.sciencedirect.com/science/article/...,Radiography,AP,COVID-19(1).png
2,covid-19 pneumonia,https://www.sciencedirect.com/science/article/...,Radiography,AP,COVID-19(2).png
3,covid-19 pneumonia,https://www.sciencedirect.com/science/article/...,Radiography,AP,COVID-19(6).png
4,covid-19 pneumonia,https://www.sciencedirect.com/science/article/...,Radiography,AP,COVID-19(8).png
...,...,...,...,...,...
3281,covid-19 pneumonia,,figure1,,COVID-00043a.jpg
3282,covid-19 pneumonia,,figure1,,COVID-00043b.jpg
3283,covid-19 pneumonia,,figure1,,COVID-00045.png
3284,covid-19 pneumonia,,figure1,,COVID-00046.png


In [28]:
merged = merged.assign(filename=merged.dataset.astype(str)+'/'+merged.imagename.astype(str))
merged

Unnamed: 0,label,url,dataset,view,imagename,filename
0,non-infection,,actualmed,PA,CR.1.2.840.113564.1722810170.20200317090830828...,actualmed/CR.1.2.840.113564.1722810170.2020031...
1,covid-19 pneumonia,https://www.sciencedirect.com/science/article/...,Radiography,AP,COVID-19(1).png,Radiography/COVID-19(1).png
2,covid-19 pneumonia,https://www.sciencedirect.com/science/article/...,Radiography,AP,COVID-19(2).png,Radiography/COVID-19(2).png
3,covid-19 pneumonia,https://www.sciencedirect.com/science/article/...,Radiography,AP,COVID-19(6).png,Radiography/COVID-19(6).png
4,covid-19 pneumonia,https://www.sciencedirect.com/science/article/...,Radiography,AP,COVID-19(8).png,Radiography/COVID-19(8).png
...,...,...,...,...,...,...
3281,covid-19 pneumonia,,figure1,,COVID-00043a.jpg,figure1/COVID-00043a.jpg
3282,covid-19 pneumonia,,figure1,,COVID-00043b.jpg,figure1/COVID-00043b.jpg
3283,covid-19 pneumonia,,figure1,,COVID-00045.png,figure1/COVID-00045.png
3284,covid-19 pneumonia,,figure1,,COVID-00046.png,figure1/COVID-00046.png


In [29]:
merged = merged[["filename", "imagename","label","view"]]
merged

Unnamed: 0,filename,imagename,label,view
0,actualmed/CR.1.2.840.113564.1722810170.2020031...,CR.1.2.840.113564.1722810170.20200317090830828...,non-infection,PA
1,Radiography/COVID-19(1).png,COVID-19(1).png,covid-19 pneumonia,AP
2,Radiography/COVID-19(2).png,COVID-19(2).png,covid-19 pneumonia,AP
3,Radiography/COVID-19(6).png,COVID-19(6).png,covid-19 pneumonia,AP
4,Radiography/COVID-19(8).png,COVID-19(8).png,covid-19 pneumonia,AP
...,...,...,...,...
3281,figure1/COVID-00043a.jpg,COVID-00043a.jpg,covid-19 pneumonia,
3282,figure1/COVID-00043b.jpg,COVID-00043b.jpg,covid-19 pneumonia,
3283,figure1/COVID-00045.png,COVID-00045.png,covid-19 pneumonia,
3284,figure1/COVID-00046.png,COVID-00046.png,covid-19 pneumonia,


In [31]:
merged.to_csv("merged_dataset.csv", index=None)