<a href="https://colab.research.google.com/github/mr7495/covid19/blob/master/dataset_preparing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install the third-party packages that are not preinstalled in the runtime.
# NOTE(review): the import cell below imports the standard-library `zipfile`
# module, not `zipfile36` -- confirm that the zipfile36 install is still needed.
!pip install zipfile36
# pydicom is imported below under the alias `dicom`.
!pip install pydicom

In [0]:
import numpy as np
import pandas as pd
import os
import random 
import shutil
import pydicom as dicom
import cv2
import csv
import zipfile

In [0]:
# Download the Kaggle RSNA pneumonia-detection archive and save it as kaggle.zip
# in the current directory. If you have already downloaded it, skip this cell and
# place the file there under that name yourself.
# The URL below is a signed link that carries an `Expires` parameter, so it stops
# working after a while. If it has expired, get a new link from
# https://www.kaggle.com/c/rsna-pneumonia-detection-challenge/data
# wget writes the download to stdout (`-O -`), which the shell redirects into kaggle.zip.
!wget -cO - 'https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/10338/862042/bundle/archive.zip?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1587124102&Signature=UbIsEpcNjy3ymL%2BCt5cNunBYytcPNlMjVW4RmBKzzuwTL%2BqGHXDzKGbFM3rsewy6nWa9GJgU5ScP%2FVPFUVJdAU3gsqw7aR6En0AqbLMjZ3JE%2BMducSHY94zyZH%2Fn6LqBOwq%2F3FQmK6OC8Ze0OW5oJyNFD7nATMQU7GxbrarIMH6F6zg%2BmL%2BZF%2B6uqlZhAwYpKKLQtzVm7Tyu04Hse0ODtfKV78U3nREvAifK9CzPTRHzAh8AxIdNunMInOn10U4bzxWN%2F5x3Cex7kP6UHsTyJX2XF98eBrQinlgBuyWLbInpQDJVVl1QGFebCa7CN6lnOO2wEeV8Xy5MN6B%2FwlZvEw%3D%3D&response-content-disposition=attachment%3B+filename%3Drsna-pneumonia-detection-challenge.zip' > kaggle.zip

In [0]:
# Clone the covid-chestxray-dataset repository into ./covid-chestxray-dataset.
# Its metadata.csv is read by the cells below.
!git clone https://github.com/ieee8023/covid-chestxray-dataset

In [0]:
# Extract every member of the Kaggle archive into the current directory.
# The context manager closes the zip file handle as soon as extraction finishes
# (or fails), instead of leaving it open for the lifetime of the kernel.
with zipfile.ZipFile('kaggle.zip') as archive:
    archive.extractall('.')

In [0]:
# ---- Input locations ----
# Metadata table of the cloned covid-chestxray-dataset repository.
covid_chestxray = './covid-chestxray-dataset/metadata.csv'

# Kaggle RSNA files, relative to the directory the archive was extracted into.
kaggle_csvname = 'stage_2_detailed_class_info.csv'  # read below for the rows whose class is 'Normal'
kaggle_csvname2 = 'stage_2_train_labels.csv'        # read below for the rows whose Target is 1 (pneumonia)
kaggle_imgpath = 'stage_2_train_images'

# ---- Selection rules for the covid-chestxray-dataset ----
# Only rows whose `view` column holds one of these values are used.
related_views = ['AP', 'PA', 'AP Supine', 'AP semi erect']

# Image file names collected so far, grouped by output class.
filename_label = {label: [] for label in ('normal', 'pneumonia', 'COVID-19')}

# Maps the `finding` values that are kept onto the three output classes;
# rows with any other finding are ignored by the cells below.
mapping = {
    'COVID-19': 'COVID-19',
    'COVID-19, ARDS': 'COVID-19',
    'Pneumocystis': 'pneumonia',
    'SARS': 'pneumonia',
    'Streptococcus': 'pneumonia',
    'Normal': 'normal',
}

In [0]:
# This cell and the next one create All.csv. Here the image names are gathered
# into filename_label, grouped by class; the next cell writes them to disk.
covid_csv = pd.read_csv(covid_chestxray)
csv_normal = pd.read_csv(kaggle_csvname)
csv_pneu = pd.read_csv(kaggle_csvname2)

# Set mirrors of the per-class lists give constant-time duplicate checks while
# the lists keep their first-seen order. Seeding the sets from the current list
# contents means that re-running this cell does not append duplicates.
seen_files = {key: set(names) for key, names in filename_label.items()}
usable_views = set(related_views)

# covid-chestxray-dataset: keep a row when its finding maps to one of the three
# classes and its view is one of related_views.
for finding, view, filename in zip(covid_csv['finding'], covid_csv['view'], covid_csv['filename']):
    if finding in mapping and view in usable_views:
        label = mapping[finding]
        if filename not in seen_files[label]:
            seen_files[label].add(filename)
            filename_label[label].append(filename)

# Kaggle dataset: every patient is taken at most once. 'Normal' rows are scanned
# first, so a patient already accepted as normal is not added again as pneumonia.
all_names = []          # accepted patient ids, in the order they were accepted
seen_patients = set()   # same ids as all_names, for fast membership tests
normal_ids = (patient for patient, patient_class
              in zip(csv_normal['patientId'], csv_normal['class'])
              if patient_class == 'Normal')
pneumonia_ids = (patient for patient, target
                 in zip(csv_pneu['patientId'], csv_pneu['Target'])
                 if int(target) == 1)
for label, candidates in (('normal', normal_ids), ('pneumonia', pneumonia_ids)):
    for patient in candidates:
        if patient in seen_patients:
            continue
        seen_patients.add(patient)
        all_names.append(patient)
        new_name = patient + '.dcm'   # Kaggle images are stored as <patientId>.dcm
        if new_name not in seen_files[label]:
            seen_files[label].add(new_name)
            filename_label[label].append(new_name)



In [0]:
# Export All.csv: one (filename, class) row for every name held in filename_label.
with open('All.csv', newline='', mode='w') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(['filename', 'class'])
    for key, names in filename_label.items():
        for name in names:
            if name.endswith('.dcm'):
                # DICOM entries are listed under the matching .png name. The
                # suffix test (rather than a substring test) guarantees that the
                # four characters being removed really are '.dcm'.
                name = name[:-4] + '.png'
            elif name.endswith('.png '):
                # Drop the stray trailing space, exactly as the per-fold train
                # CSV export does, so both kinds of CSV name the image the same way.
                name = name[:-1]
            csv_writer.writerow([name, key])

In [0]:
# Patient ids (as strings) from the covid-chestxray-dataset that are held out for
# testing. The COVID-19 patients are split into five lists, one per fold; the
# split was made against the state of the covid-chestxray-dataset on 12 April.
# That dataset can change at any time, so the ids have to be re-selected with
# care whenever a newer snapshot is used.
# NOTE(review): some ids occur in more than one fold list ('13', '36', '44'-'47',
# '51', '56', '132', '151') -- confirm whether the folds are meant to be disjoint.

# Pneumonia patients held out for testing (the same ids for every fold).
ultimate_test_pneumonia = ['8', '31', '171']

# COVID-19 patients held out for testing in fold 1.
ultimate_test_covid1 = [
    '19', '20', '36', '42', '86', '13', '96', '51',
    '49', '116', '150', '151', '168', '56', '70',
]
# COVID-19 patients held out for testing in fold 2.
ultimate_test_covid2 = [
    '2', '4', '6', '11', '12', '13', '14', '15',
    '117', '152', '163', '167', '142',
]
# COVID-19 patients held out for testing in fold 3.
ultimate_test_covid3 = [
    '16', '17', '18', '21', '33', '34', '36', '37',
    '44', '45', '46', '47', '165', '166', '164', '161',
    '160', '132', '162', '159', '158',
]
# COVID-19 patients held out for testing in fold 4.
ultimate_test_covid4 = [
    '39', '40', '41', '43', '44', '45', '46', '47',
    '48', '50', '51', '52', '157', '156', '155', '154',
    '153', '151', '149', '148', '147', '146', '145', '71',
    '72', '73', '74',
]
# COVID-19 patients held out for testing in fold 5.
ultimate_test_covid5 = [
    '53', '56', '57', '58', '59', '60', '61', '62',
    '63', '64', '65', '66', '67', '68', '69', '118',
    '132', '139', '138', '141', '144',
]

In [0]:
# Create Our_data_fold/fold1 .. Our_data_fold/fold5.
# makedirs(..., exist_ok=True) creates the parent directory when needed and is a
# no-op for directories that already exist, so a partially created tree is
# completed on re-run. Genuine failures (e.g. missing permissions) still raise
# instead of being silently swallowed.
for fold_number in range(1, 6):
    os.makedirs(os.path.join('Our_data_fold', 'fold{}'.format(fold_number)), exist_ok=True)

In [0]:
# The next cell builds the 8 training-phase CSV files (train1.csv .. train8.csv) for each of the 5 folds.

In [0]:
# Build the training lists for every fold and write them to
# Our_data_fold/fold<fo>/train<i>.csv (i = 1..8).
#
# Each training phase contains
#   * every covid-chestxray-dataset pneumonia / COVID-19 image that is not held
#     out for testing in the current fold (identical in all 8 phases), plus
#   * its own slice of the shuffled pool of remaining normal / pneumonia images.
#
# Relies on names defined in earlier cells: covid_chestxray, kaggle_csvname,
# kaggle_csvname2, related_views, mapping, ultimate_test_pneumonia and
# ultimate_test_covid1 .. ultimate_test_covid5.
#
# NOTE(review): no random seed is set in this cell, so unless `random` is seeded
# elsewhere every run produces different train CSVs.

NORMAL_PER_PHASE = 250       # pool images of class 'normal' added to each phase
PNEUMONIA_PER_PHASE = 200    # pool images of class 'pneumonia' added to each phase
PHASE_NUMBERS = range(1, 9)  # phases are numbered 1..8


def _csv_filename(name):
    """Return the file name to write into the CSV for the image called `name`.

    Names ending in '.dcm' are listed with a '.png' extension instead, and a
    stray trailing space after '.png' is removed. Any other name is returned
    unchanged.
    """
    if name.endswith('.dcm'):
        return name[:-4] + '.png'
    if name.endswith('.png '):
        return name[:-1]
    return name


# ---------------------------------------------------------------------------
# Fold-independent work: done once instead of once per fold.
# ---------------------------------------------------------------------------
covid_csv = pd.read_csv(covid_chestxray)
csv_normal = pd.read_csv(kaggle_csvname)
csv_pneu = pd.read_csv(kaggle_csvname2)
usable_views = set(related_views)

filename_label = {'normal': [], 'pneumonia': [], 'COVID-19': []}
patients_id = {'normal': [], 'pneumonia': [], 'COVID-19': []}
# Set mirrors of filename_label: constant-time duplicate checks, while the
# lists keep their first-seen order.
seen_files = {key: set() for key in filename_label}

# covid-chestxray-dataset rows with a mapped finding and a usable view.
for finding, view, filename, patient in zip(covid_csv['finding'], covid_csv['view'],
                                            covid_csv['filename'], covid_csv['patientid']):
    if finding in mapping and view in usable_views:
        label = mapping[finding]
        if filename not in seen_files[label]:
            seen_files[label].add(filename)
            filename_label[label].append(filename)
            patients_id[label].append(patient)

# Snapshot taken before the Kaggle names are appended: only these
# covid-chestxray-dataset names can become part of ultimate_train.
covid_only = {key: list(names) for key, names in filename_label.items()}

# Kaggle dataset: every patient is taken at most once. 'Normal' rows are scanned
# first, so a patient already accepted as normal is not added again as pneumonia.
patients = {'normal': [], 'pneumonia': []}
all_names = []
all_data = []
seen_patients = set()
normal_ids = (patient for patient, patient_class
              in zip(csv_normal['patientId'], csv_normal['class'])
              if patient_class == 'Normal')
pneumonia_ids = (patient for patient, target
                 in zip(csv_pneu['patientId'], csv_pneu['Target'])
                 if int(target) == 1)
for label, candidates in (('normal', normal_ids), ('pneumonia', pneumonia_ids)):
    for patient in candidates:
        if patient in seen_patients:
            continue
        seen_patients.add(patient)
        all_names.append(patient)
        all_data.append([patient, label])
        patients[label].append(patient)
        new_name = patient + '.dcm'
        if new_name not in seen_files[label]:
            seen_files[label].add(new_name)
            filename_label[label].append(new_name)

# ---------------------------------------------------------------------------
# Per-fold work.
# ---------------------------------------------------------------------------
for fo in range(1, 6):
    # Patient ids held out for testing in this fold (COVID-19 + pneumonia).
    test_patients = (set(globals()['ultimate_test_covid{}'.format(fo)])
                     | set(ultimate_test_pneumonia))

    # Files of the held-out patients. Only the view is checked here, not the
    # finding, so files of unmapped findings can be listed as well; the list is
    # used purely to exclude files from training.
    ultimated_test = []
    test_files = set()
    for patient, view, filename in zip(covid_csv['patientid'], covid_csv['view'],
                                       covid_csv['filename']):
        if str(patient) in test_patients and view in usable_views and filename not in test_files:
            test_files.add(filename)
            ultimated_test.append(filename)

    # Images that are part of every training phase of this fold.
    ultimate_train = {
        'normal': [],
        'pneumonia': [name for name in covid_only['pneumonia'] if name not in test_files],
        'COVID-19': [name for name in covid_only['COVID-19'] if name not in test_files],
    }

    # Pool of the remaining images, from which the phases take their slices.
    temp_all_train = {}
    for key, names in filename_label.items():
        always_used = set(ultimate_train[key])
        temp_all_train[key] = [name for name in names
                               if name not in test_files and name not in always_used]

    # Same sequence of shuffle calls as before (10 rounds over all classes).
    for _ in range(10):
        for key in temp_all_train:
            random.shuffle(temp_all_train[key])

    # NOTE(review): phases are numbered from 1, so the first slice of each pool
    # ([0:250] for normal, [0:200] for pneumonia) is never used -- confirm that
    # this is intentional.
    train_phases = {}
    for i in PHASE_NUMBERS:
        normal_start = NORMAL_PER_PHASE * i
        pneumonia_start = PNEUMONIA_PER_PHASE * i
        phase = {
            'normal': temp_all_train['normal'][normal_start:normal_start + NORMAL_PER_PHASE],
            'pneumonia': (ultimate_train['pneumonia']
                          + temp_all_train['pneumonia'][pneumonia_start:pneumonia_start + PNEUMONIA_PER_PHASE]),
            'COVID-19': list(ultimate_train['COVID-19']),
        }
        train_phases[i] = phase
        # train1 .. train8 are still published as module-level names, because
        # cells outside this one may read them.
        globals()['train{}'.format(i)] = phase

    # Export one CSV per phase, with the rows in random order.
    for i, phase in train_phases.items():
        all_rows = [[_csv_filename(name), key] for key, names in phase.items() for name in names]
        for _ in range(10):
            random.shuffle(all_rows)
        with open('Our_data_fold/fold{}/train{}.csv'.format(fo, i), newline='', mode='w') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(['filename', 'class'])
            csv_writer.writerows(all_rows)

    # Report the class sizes of the last phase (train8) of this fold.
    last_phase = train_phases[PHASE_NUMBERS[-1]]
    print('train list in fold{}'.format(fo), {'normal': len(last_phase['normal']),
                                             'pneumonia': len(last_phase['pneumonia']),
                                             'COVID-19': len(last_phase['COVID-19'])})
