# Separation

## Import

In [1]:
from PIL import Image
import numpy as np
import pandas as pd
import torch 
import torchvision.transforms as transforms
from torch.utils.data import Dataset
import torch.nn.functional as F
import os
import cv2
import shutil
import warnings
import random
import glob

In [2]:
def set_all_seeds(SEED):
    """Seed every random number generator used by this notebook.

    Seeds Python's built-in ``random`` module (the train/valid/test split
    cells draw from ``random.randrange``), NumPy and PyTorch, and puts cuDNN
    into deterministic, non-benchmarking mode.

    Args:
        SEED: integer seed applied to all generators.
    """
    # REPRODUCIBILITY
    # Without this the patient splits change on every run, because they are
    # drawn from the stdlib generator rather than from NumPy or PyTorch.
    random.seed(SEED)
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter('ignore', FutureWarning)
# Fix the seed once for the whole notebook.
set_all_seeds(123)
# Record the pandas version this notebook was run with.
print(pd.__version__)

2.0.3


## Declaration

In [3]:
def _dataset_layout(folder_name):
    """Create ``<cwd>/<folder_name>`` with its split sub-folders and return all paths.

    Args:
        folder_name: name of the dataset folder below the current working
            directory, e.g. ``'CheXpert_male'``.

    Returns:
        Tuple ``(folder, train_folder, valid_folder, test_folder, train_csv,
        valid_csv, test_csv, train_valid_test_csv)``.  The four directories
        are created when missing; the CSV paths are only computed.
    """
    folder = os.path.join(os.getcwd(), folder_name)
    split_folders = [os.path.join(folder, split) for split in ('train', 'valid', 'test')]
    for directory in [folder, *split_folders]:
        os.makedirs(directory, exist_ok=True)
    csv_paths = [os.path.join(folder, f'{stem}.csv')
                 for stem in ('train', 'valid', 'test', 'train_valid_test')]
    return (folder, *split_folders, *csv_paths)


def _load_or_create_csv(csv_path, columns):
    """Read ``csv_path`` if it exists, otherwise write and return an empty table.

    Args:
        csv_path: location of the CSV file.
        columns: column labels used for the empty table when the file is missing.

    Returns:
        The DataFrame read from disk, or the newly written empty DataFrame.
    """
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)
    empty_pd = pd.DataFrame(columns=columns)
    empty_pd.to_csv(csv_path, index=False)
    return empty_pd


# origin: the unsplit data.  Its train_valid_test.csv is read unconditionally,
# so it must already exist; its columns are the template for every other table.
(origin_folder,
 origin_train_folder, origin_valid_folder, origin_test_folder,
 origin_train_csv, origin_valid_csv, origin_test_csv,
 origin_train_valid_test_csv) = _dataset_layout('CheXpert_origin')
origin_train_valid_test_pd = pd.read_csv(origin_train_valid_test_csv)
origin_train_pd = _load_or_create_csv(origin_train_csv, origin_train_valid_test_pd.columns)
origin_valid_pd = _load_or_create_csv(origin_valid_csv, origin_train_valid_test_pd.columns)
origin_test_pd = _load_or_create_csv(origin_test_csv, origin_train_valid_test_pd.columns)

# male
(male_folder,
 male_train_folder, male_valid_folder, male_test_folder,
 male_train_csv, male_valid_csv, male_test_csv,
 male_train_valid_test_csv) = _dataset_layout('CheXpert_male')
male_train_pd = _load_or_create_csv(male_train_csv, origin_train_valid_test_pd.columns)
male_valid_pd = _load_or_create_csv(male_valid_csv, origin_train_valid_test_pd.columns)
male_test_pd = _load_or_create_csv(male_test_csv, origin_train_valid_test_pd.columns)
male_train_valid_test_pd = _load_or_create_csv(male_train_valid_test_csv, origin_train_valid_test_pd.columns)

# female
(female_folder,
 female_train_folder, female_valid_folder, female_test_folder,
 female_train_csv, female_valid_csv, female_test_csv,
 female_train_valid_test_csv) = _dataset_layout('CheXpert_female')
female_train_pd = _load_or_create_csv(female_train_csv, origin_train_valid_test_pd.columns)
female_valid_pd = _load_or_create_csv(female_valid_csv, origin_train_valid_test_pd.columns)
female_test_pd = _load_or_create_csv(female_test_csv, origin_train_valid_test_pd.columns)
female_train_valid_test_pd = _load_or_create_csv(female_train_valid_test_csv, origin_train_valid_test_pd.columns)

# before40
(before40_folder,
 before40_train_folder, before40_valid_folder, before40_test_folder,
 before40_train_csv, before40_valid_csv, before40_test_csv,
 before40_train_valid_test_csv) = _dataset_layout('CheXpert_before40')
before40_train_pd = _load_or_create_csv(before40_train_csv, origin_train_valid_test_pd.columns)
before40_valid_pd = _load_or_create_csv(before40_valid_csv, origin_train_valid_test_pd.columns)
before40_test_pd = _load_or_create_csv(before40_test_csv, origin_train_valid_test_pd.columns)
before40_train_valid_test_pd = _load_or_create_csv(before40_train_valid_test_csv, origin_train_valid_test_pd.columns)

# after40
(after40_folder,
 after40_train_folder, after40_valid_folder, after40_test_folder,
 after40_train_csv, after40_valid_csv, after40_test_csv,
 after40_train_valid_test_csv) = _dataset_layout('CheXpert_after40')
after40_train_pd = _load_or_create_csv(after40_train_csv, origin_train_valid_test_pd.columns)
after40_valid_pd = _load_or_create_csv(after40_valid_csv, origin_train_valid_test_pd.columns)
after40_test_pd = _load_or_create_csv(after40_test_csv, origin_train_valid_test_pd.columns)
after40_train_valid_test_pd = _load_or_create_csv(after40_train_valid_test_csv, origin_train_valid_test_pd.columns)

## Separation of train_valid_test into male and female

In [4]:
# Split the full table by the 'Sex' column and copy every image from the
# origin train folder into the male or female train folder.
# Rows are collected in plain lists and turned into DataFrames only at the
# checkpoints and at the end: growing a DataFrame with pd.concat on every
# row re-copies the whole table each time.
split_columns = origin_train_valid_test_pd.columns
male_rows = []
female_rows = []
male_train_valid_test_pd = pd.DataFrame(columns=split_columns)
female_train_valid_test_pd = pd.DataFrame(columns=split_columns)
# train_valid_test
for i, row in enumerate(origin_train_valid_test_pd.to_dict('records')):
    if i % 1000 == 0:
        # Progress message plus a checkpoint of what has been sorted so far.
        print(f"train_valid_test: {i}")
        male_train_valid_test_pd = pd.DataFrame(male_rows, columns=split_columns)
        female_train_valid_test_pd = pd.DataFrame(female_rows, columns=split_columns)
        male_train_valid_test_pd.to_csv(male_train_valid_test_csv, index=False)
        female_train_valid_test_pd.to_csv(female_train_valid_test_csv, index=False)

    # 'Path' is split on '/'; elements 2, 3 and 4 are used as the patient
    # folder, the study folder and the image file name.
    path = row['Path']
    path_elements_lst = path.split('/')
    patient = path_elements_lst[2]
    study = path_elements_lst[3]
    view = path_elements_lst[4]

    sex = row['Sex']
    if sex == 'Male':
        after_study = os.path.join(male_train_folder, patient, study)
        male_rows.append(row)
    elif sex == 'Female':
        after_study = os.path.join(female_train_folder, patient, study)
        female_rows.append(row)
    else:
        # Any other 'Sex' value: report the path and leave the row out.
        print(path)
        continue

    os.makedirs(after_study, exist_ok=True)
    before_path = os.path.join(origin_train_folder, patient, study, view)
    after_path = os.path.join(after_study, view)
    shutil.copy(before_path, after_path)

male_train_valid_test_pd = pd.DataFrame(male_rows, columns=split_columns)
female_train_valid_test_pd = pd.DataFrame(female_rows, columns=split_columns)
male_train_valid_test_pd.to_csv(male_train_valid_test_csv, index=False)
female_train_valid_test_pd.to_csv(female_train_valid_test_csv, index=False)

train_valid_test: 0
train_valid_test: 1000
train_valid_test: 2000
train_valid_test: 3000
train_valid_test: 4000
train_valid_test: 5000
train_valid_test: 6000
train_valid_test: 7000
train_valid_test: 8000
train_valid_test: 9000
train_valid_test: 10000
train_valid_test: 11000
train_valid_test: 12000
train_valid_test: 13000
train_valid_test: 14000
train_valid_test: 15000
train_valid_test: 16000
train_valid_test: 17000
train_valid_test: 18000
train_valid_test: 19000
train_valid_test: 20000
train_valid_test: 21000
train_valid_test: 22000
train_valid_test: 23000
train_valid_test: 24000
train_valid_test: 25000
train_valid_test: 26000
train_valid_test: 27000
train_valid_test: 28000
train_valid_test: 29000
train_valid_test: 30000
train_valid_test: 31000
train_valid_test: 32000
train_valid_test: 33000
train_valid_test: 34000
train_valid_test: 35000
train_valid_test: 36000
train_valid_test: 37000
train_valid_test: 38000
train_valid_test: 39000
train_valid_test: 40000
train_valid_test: 41000
train

## Separation of male train_valid_test into train, valid, test

In [5]:
# Assign every male patient to exactly one of train / valid / test and move
# the image folders of valid and test patients out of the male train folder.
# All rows of one patient follow the split chosen when the patient is first seen.
split_columns = origin_train_valid_test_pd.columns
male_train_pd = pd.DataFrame(columns=split_columns)
male_valid_pd = pd.DataFrame(columns=split_columns)
male_test_pd = pd.DataFrame(columns=split_columns)
# Rows are gathered in lists and converted to DataFrames at checkpoints and at
# the end; concatenating one row at a time re-copies the whole table per row.
train_rows = []
valid_rows = []
test_rows = []
train_count = 0
valid_count = 0
test_count = 0
train_patient = []
valid_patient = []
test_patient = []
# patient number -> 'train' | 'valid' | 'test'; constant-time lookup instead of
# scanning the three patient lists for every row.
patient_split = {}

random_tag = 0
male_train_valid_test_num = len(male_train_valid_test_pd)
for i, row in enumerate(male_train_valid_test_pd.to_dict('records')):
    if i % 1000 == 0:
        total_count = train_count + valid_count + test_count
        denom = max(total_count, 1)  # avoid dividing by zero on the first report
        print(f"train:{train_count}({train_count/denom*100:.2f}%), "
              f"valid:{valid_count}({valid_count/denom*100:.2f}%),"
              f"        test:{test_count}({test_count/denom*100:.2f}%), total:{total_count}")
        male_train_pd = pd.DataFrame(train_rows, columns=split_columns)
        male_valid_pd = pd.DataFrame(valid_rows, columns=split_columns)
        male_test_pd = pd.DataFrame(test_rows, columns=split_columns)
        male_train_pd.to_csv(male_train_csv, index=False)
        male_valid_pd.to_csv(male_valid_csv, index=False)
        male_test_pd.to_csv(male_test_csv, index=False)

    path = row['Path']
    path_elements_lst = path.split('/')
    patient = path_elements_lst[2]
    try:
        # The last five characters of the patient folder name are its number.
        patient_num = int(patient[-5:])
    except ValueError:
        print(f"errors:{patient}")
        continue

    # Drawn for every row (also for already-assigned patients) so the sequence
    # of random numbers consumed stays the same as before.
    random_tag = random.randrange(10)
    split = patient_split.get(patient_num)

    if split is None:
        # First row of this patient: pick the split.  Tags 0-1 select test and
        # tag 2 selects valid, each only while its count is within 20 % / 10 %
        # of the table size.  The `i - count > ...` alternatives presumably
        # force an assignment late in the table - TODO confirm intent.
        if (random_tag <= 1 and test_count <= 0.2*male_train_valid_test_num) or i-test_count > 0.85*male_train_valid_test_num:
            split = 'test'
            test_patient.append(patient_num)

            before_patient = os.path.join(male_train_folder, patient)
            # Counts the .jpg files one directory level below the patient
            # folder (glob is not called with recursive=True).
            test_count += len(glob.glob(os.path.join(before_patient, "**", "*.jpg")))
            shutil.move(before_patient, male_test_folder)
        elif (random_tag == 2 and valid_count <= 0.1*male_train_valid_test_num) or i-valid_count > 0.95*male_train_valid_test_num:
            split = 'valid'
            valid_patient.append(patient_num)

            before_patient = os.path.join(male_train_folder, patient)
            valid_count += len(glob.glob(os.path.join(before_patient, "**", "*.jpg")))
            shutil.move(before_patient, male_valid_folder)
        else:
            split = 'train'
            train_patient.append(patient_num)
        patient_split[patient_num] = split

    if split == 'valid':
        valid_rows.append(row)
    elif split == 'test':
        test_rows.append(row)
    else:
        train_rows.append(row)
        train_count += 1

male_train_pd = pd.DataFrame(train_rows, columns=split_columns)
male_valid_pd = pd.DataFrame(valid_rows, columns=split_columns)
male_test_pd = pd.DataFrame(test_rows, columns=split_columns)
male_train_pd.to_csv(male_train_csv, index=False)
male_valid_pd.to_csv(male_valid_csv, index=False)
male_test_pd.to_csv(male_test_csv, index=False)

train:0(0.00%), valid:0(0.00%),        test:0(0.00%), total:0
train:685(68.36%), valid:121(12.08%),        test:196(19.56%), total:1002
train:1413(70.65%), valid:201(10.05%),        test:386(19.30%), total:2000
train:2135(71.17%), valid:292(9.73%),        test:573(19.10%), total:3000
train:2895(72.38%), valid:379(9.47%),        test:726(18.15%), total:4000
train:3653(73.06%), valid:462(9.24%),        test:885(17.70%), total:5000
train:4363(72.72%), valid:533(8.88%),        test:1104(18.40%), total:6000
train:5022(71.74%), valid:597(8.53%),        test:1381(19.73%), total:7000
train:5725(71.56%), valid:752(9.40%),        test:1523(19.04%), total:8000
train:6391(71.01%), valid:863(9.59%),        test:1746(19.40%), total:9000
train:7118(71.18%), valid:984(9.84%),        test:1898(18.98%), total:10000
train:7775(70.56%), valid:1125(10.21%),        test:2119(19.23%), total:11019
train:8458(70.48%), valid:1192(9.93%),        test:2350(19.58%), total:12000
train:9201(70.77%), valid:1271(9.78%

## Separation of female train_valid_test into train, valid, test

In [6]:
# Assign every female patient to exactly one of train / valid / test and move
# the image folders of valid and test patients out of the female train folder.
# All rows of one patient follow the split chosen when the patient is first seen.
split_columns = origin_train_valid_test_pd.columns
female_train_pd = pd.DataFrame(columns=split_columns)
female_valid_pd = pd.DataFrame(columns=split_columns)
female_test_pd = pd.DataFrame(columns=split_columns)
# Rows are gathered in lists and converted to DataFrames at checkpoints and at
# the end; concatenating one row at a time re-copies the whole table per row.
train_rows = []
valid_rows = []
test_rows = []
train_count = 0
valid_count = 0
test_count = 0
train_patient = []
valid_patient = []
test_patient = []
# patient number -> 'train' | 'valid' | 'test'; constant-time lookup instead of
# scanning the three patient lists for every row.
patient_split = {}

random_tag = 0
female_train_valid_test_num = len(female_train_valid_test_pd)
for i, row in enumerate(female_train_valid_test_pd.to_dict('records')):
    if i % 1000 == 0:
        total_count = train_count + valid_count + test_count
        denom = max(total_count, 1)  # avoid dividing by zero on the first report
        print(f"train:{train_count}({train_count/denom*100:.2f}%), "
              f"valid:{valid_count}({valid_count/denom*100:.2f}%),"
              f"        test:{test_count}({test_count/denom*100:.2f}%), total:{total_count}")
        female_train_pd = pd.DataFrame(train_rows, columns=split_columns)
        female_valid_pd = pd.DataFrame(valid_rows, columns=split_columns)
        female_test_pd = pd.DataFrame(test_rows, columns=split_columns)
        female_train_pd.to_csv(female_train_csv, index=False)
        female_valid_pd.to_csv(female_valid_csv, index=False)
        female_test_pd.to_csv(female_test_csv, index=False)

    path = row['Path']
    path_elements_lst = path.split('/')
    patient = path_elements_lst[2]
    try:
        # The last five characters of the patient folder name are its number.
        patient_num = int(patient[-5:])
    except ValueError:
        print(f"errors:{patient}")
        continue

    # Drawn for every row (also for already-assigned patients) so the sequence
    # of random numbers consumed stays the same as before.
    random_tag = random.randrange(10)
    split = patient_split.get(patient_num)

    if split is None:
        # First row of this patient: pick the split.  Tags 0-1 select test and
        # tag 2 selects valid, each only while its count is within 20 % / 10 %
        # of the table size.  The `i - count > ...` alternatives presumably
        # force an assignment late in the table - TODO confirm intent.
        if (random_tag <= 1 and test_count <= 0.2*female_train_valid_test_num) or i-test_count > 0.85*female_train_valid_test_num:
            split = 'test'
            test_patient.append(patient_num)

            before_patient = os.path.join(female_train_folder, patient)
            # Counts the .jpg files one directory level below the patient
            # folder (glob is not called with recursive=True).
            test_count += len(glob.glob(os.path.join(before_patient, "**", "*.jpg")))
            shutil.move(before_patient, female_test_folder)
        elif (random_tag == 2 and valid_count <= 0.1*female_train_valid_test_num) or i-valid_count > 0.95*female_train_valid_test_num:
            split = 'valid'
            valid_patient.append(patient_num)

            before_patient = os.path.join(female_train_folder, patient)
            valid_count += len(glob.glob(os.path.join(before_patient, "**", "*.jpg")))
            shutil.move(before_patient, female_valid_folder)
        else:
            split = 'train'
            train_patient.append(patient_num)
        patient_split[patient_num] = split

    if split == 'valid':
        valid_rows.append(row)
    elif split == 'test':
        test_rows.append(row)
    else:
        train_rows.append(row)
        train_count += 1

female_train_pd = pd.DataFrame(train_rows, columns=split_columns)
female_valid_pd = pd.DataFrame(valid_rows, columns=split_columns)
female_test_pd = pd.DataFrame(test_rows, columns=split_columns)
female_train_pd.to_csv(female_train_csv, index=False)
female_valid_pd.to_csv(female_valid_csv, index=False)
female_test_pd.to_csv(female_test_csv, index=False)

train:0(0.00%), valid:0(0.00%),        test:0(0.00%), total:0
train:673(67.30%), valid:224(22.40%),        test:103(10.30%), total:1000
train:1343(67.15%), valid:340(17.00%),        test:317(15.85%), total:2000
train:1972(65.73%), valid:471(15.70%),        test:557(18.57%), total:3000
train:2679(66.97%), valid:579(14.47%),        test:742(18.55%), total:4000
train:3458(69.16%), valid:701(14.02%),        test:841(16.82%), total:5000
train:4191(69.85%), valid:808(13.47%),        test:1001(16.68%), total:6000
train:4937(70.53%), valid:891(12.73%),        test:1172(16.74%), total:7000
train:5563(69.49%), valid:1007(12.58%),        test:1435(17.93%), total:8005
train:6320(70.22%), valid:1119(12.43%),        test:1561(17.34%), total:9000
train:7020(70.20%), valid:1179(11.79%),        test:1801(18.01%), total:10000
train:7737(70.31%), valid:1229(11.17%),        test:2038(18.52%), total:11004
train:8492(70.77%), valid:1285(10.71%),        test:2223(18.52%), total:12000
train:9159(70.45%), vali

In [7]:
# Sanity check: print the row counts of the three splits per sex, their sum
# per sex, and the grand total over both sexes.
male_split_sizes = [len(male_train_pd), len(male_valid_pd), len(male_test_pd)]
female_split_sizes = [len(female_train_pd), len(female_valid_pd), len(female_test_pd)]
print(f"{male_split_sizes[0]} + {male_split_sizes[1]} + {male_split_sizes[2]} = {sum(male_split_sizes)}")
print(f"{female_split_sizes[0]} + {female_split_sizes[1]} + {female_split_sizes[2]} = {sum(female_split_sizes)}")
print(sum(male_split_sizes) + sum(female_split_sizes))

92889 + 13219 + 26528 = 132636
63543 + 9078 + 18156 = 90777
223413


## Separation of train_valid_test of origin

In [8]:
# Mirror the male/female splits onto the origin data: a patient whose image is
# found in the male/female valid (or test) folder is moved from the origin
# train folder to the origin valid (or test) folder; everyone else stays in train.
split_columns = origin_train_valid_test_pd.columns
origin_train_pd = pd.DataFrame(columns=split_columns)
origin_valid_pd = pd.DataFrame(columns=split_columns)
origin_test_pd = pd.DataFrame(columns=split_columns)
# Rows are gathered in lists and converted to DataFrames at checkpoints and at
# the end; concatenating one row at a time re-copies the whole table per row.
train_rows = []
valid_rows = []
test_rows = []

train_count = 0
valid_count = 0
test_count = 0

train_patient = []
valid_patient = []
test_patient = []
# patient number -> 'train' | 'valid' | 'test'; constant-time lookup instead of
# scanning the three patient lists for every row.
patient_split = {}
for i, row in enumerate(origin_train_valid_test_pd.to_dict('records')):
    if i % 1000 == 0:
        total_count = train_count + valid_count + test_count
        denom = max(total_count, 1)  # avoid dividing by zero on the first report
        print(f"train:{train_count}({train_count/denom*100:.2f}%), "
              f"valid:{valid_count}({valid_count/denom*100:.2f}%),"
              f"        test:{test_count}({test_count/denom*100:.2f}%), total:{total_count}")
        origin_train_pd = pd.DataFrame(train_rows, columns=split_columns)
        origin_valid_pd = pd.DataFrame(valid_rows, columns=split_columns)
        origin_test_pd = pd.DataFrame(test_rows, columns=split_columns)
        origin_train_pd.to_csv(origin_train_csv, index=False)
        origin_valid_pd.to_csv(origin_valid_csv, index=False)
        origin_test_pd.to_csv(origin_test_csv, index=False)

    path = row['Path']
    path_elements_lst = path.split('/')
    patient = path_elements_lst[2]
    study = path_elements_lst[3]
    view = path_elements_lst[4]

    # The last five characters of the patient folder name are its number.
    patient_num = int(patient[-5:])

    # Pick the per-sex folders that hold the already decided split.
    sex = row['Sex']
    if sex == 'Male':
        sex_valid_folder = male_valid_folder
        sex_test_folder = male_test_folder
    elif sex == 'Female':
        sex_valid_folder = female_valid_folder
        sex_test_folder = female_test_folder
    else:
        # Any other 'Sex' value: report the path and leave the row out.
        print(path)
        continue

    split = patient_split.get(patient_num)
    if split is None:
        # First row of this patient: look the image up in the per-sex folders.
        if os.path.exists(os.path.join(sex_valid_folder, patient, study, view)):
            split = 'valid'
            valid_patient.append(patient_num)

            before_patient = os.path.join(origin_train_folder, patient)
            # Counts the .jpg files one directory level below the patient
            # folder (glob is not called with recursive=True).
            valid_count += len(glob.glob(os.path.join(before_patient, '**', '*.jpg')))
            shutil.move(before_patient, origin_valid_folder)
        elif os.path.exists(os.path.join(sex_test_folder, patient, study, view)):
            split = 'test'
            test_patient.append(patient_num)

            before_patient = os.path.join(origin_train_folder, patient)
            test_count += len(glob.glob(os.path.join(before_patient, '**', '*.jpg')))
            shutil.move(before_patient, origin_test_folder)
        else:
            split = 'train'
            train_patient.append(patient_num)
        patient_split[patient_num] = split

    if split == 'valid':
        valid_rows.append(row)
    elif split == 'test':
        test_rows.append(row)
    else:
        train_rows.append(row)
        train_count += 1

origin_train_pd = pd.DataFrame(train_rows, columns=split_columns)
origin_valid_pd = pd.DataFrame(valid_rows, columns=split_columns)
origin_test_pd = pd.DataFrame(test_rows, columns=split_columns)
origin_train_pd.to_csv(origin_train_csv, index=False)
origin_valid_pd.to_csv(origin_valid_csv, index=False)
origin_test_pd.to_csv(origin_test_csv, index=False)

train:0(0.00%), valid:0(0.00%),        test:0(0.00%), total:0
train:662(66.07%), valid:179(17.86%),        test:161(16.07%), total:1002
train:1392(69.60%), valid:312(15.60%),        test:296(14.80%), total:2000
train:2112(70.40%), valid:437(14.57%),        test:451(15.03%), total:3000
train:2772(69.30%), valid:548(13.70%),        test:680(17.00%), total:4000
train:3507(70.14%), valid:614(12.28%),        test:879(17.58%), total:5000
train:4206(70.10%), valid:755(12.58%),        test:1039(17.32%), total:6000
train:4921(70.30%), valid:822(11.74%),        test:1257(17.96%), total:7000
train:5608(70.10%), valid:921(11.51%),        test:1471(18.39%), total:8000
train:6319(70.16%), valid:1031(11.45%),        test:1657(18.40%), total:9007
train:7058(70.58%), valid:1097(10.97%),        test:1845(18.45%), total:10000
train:7711(70.09%), valid:1244(11.31%),        test:2046(18.60%), total:11001
train:8437(70.30%), valid:1351(11.26%),        test:2213(18.44%), total:12001
train:9180(70.62%), valid

In [9]:
# Sanity check: row counts of the origin train / valid / test tables and their sum.
origin_split_sizes = [len(origin_train_pd), len(origin_valid_pd), len(origin_test_pd)]
print(f"{origin_split_sizes[0]} + {origin_split_sizes[1]} + {origin_split_sizes[2]} = {sum(origin_split_sizes)}")

156432 + 22297 + 44684 = 223413


## Separation of train set of origin into before40, after40

In [10]:
# Split the origin train table by age and copy each image accordingly.
# Note the boundary: Age <= 40 goes to "before40", everything older to "after40".
# Rows are gathered in lists and converted to DataFrames at checkpoints and at
# the end; concatenating one row at a time re-copies the whole table per row.
split_columns = origin_train_valid_test_pd.columns
before40_rows = []
after40_rows = []
before40_train_pd = pd.DataFrame(columns=split_columns)
after40_train_pd = pd.DataFrame(columns=split_columns)
for i, row in enumerate(origin_train_pd.to_dict('records')):
    if i % 1000 == 0:
        # Progress message plus a checkpoint of what has been sorted so far.
        print(f"train: {i}")
        before40_train_pd = pd.DataFrame(before40_rows, columns=split_columns)
        after40_train_pd = pd.DataFrame(after40_rows, columns=split_columns)
        before40_train_pd.to_csv(before40_train_csv, index=False)
        after40_train_pd.to_csv(after40_train_csv, index=False)

    path = row['Path']
    path_elements_lst = path.split('/')
    patient = path_elements_lst[2]
    study = path_elements_lst[3]
    view = path_elements_lst[4]

    age = int(row['Age'])
    if age <= 40:
        after_study = os.path.join(before40_train_folder, patient, study)
        before40_rows.append(row)
    else:
        after_study = os.path.join(after40_train_folder, patient, study)
        after40_rows.append(row)

    os.makedirs(after_study, exist_ok=True)
    before_path = os.path.join(origin_train_folder, patient, study, view)
    after_path = os.path.join(after_study, view)
    shutil.copy(before_path, after_path)

before40_train_pd = pd.DataFrame(before40_rows, columns=split_columns)
after40_train_pd = pd.DataFrame(after40_rows, columns=split_columns)
before40_train_pd.to_csv(before40_train_csv, index=False)
after40_train_pd.to_csv(after40_train_csv, index=False)

train: 0
train: 1000
train: 2000
train: 3000
train: 4000
train: 5000
train: 6000
train: 7000
train: 8000
train: 9000
train: 10000
train: 11000
train: 12000
train: 13000
train: 14000
train: 15000
train: 16000
train: 17000
train: 18000
train: 19000
train: 20000
train: 21000
train: 22000
train: 23000
train: 24000
train: 25000
train: 26000
train: 27000
train: 28000
train: 29000
train: 30000
train: 31000
train: 32000
train: 33000
train: 34000
train: 35000
train: 36000
train: 37000
train: 38000
train: 39000
train: 40000
train: 41000
train: 42000
train: 43000
train: 44000
train: 45000
train: 46000
train: 47000
train: 48000
train: 49000
train: 50000
train: 51000
train: 52000
train: 53000
train: 54000
train: 55000
train: 56000
train: 57000
train: 58000
train: 59000
train: 60000
train: 61000
train: 62000
train: 63000
train: 64000
train: 65000
train: 66000
train: 67000
train: 68000
train: 69000
train: 70000
train: 71000
train: 72000
train: 73000
train: 74000
train: 75000
train: 76000
train: 77000

In [11]:
# Sanity check: row counts of the two age groups of the train set and their sum.
train_age_sizes = [len(before40_train_pd), len(after40_train_pd)]
print(f"{train_age_sizes[0]} + {train_age_sizes[1]} = {sum(train_age_sizes)}")

23256 + 133176 = 156432


## Separation of valid set of origin into before40, after40

In [12]:
# Split the origin valid table by age and copy each image accordingly.
# Note the boundary: Age <= 40 goes to "before40", everything older to "after40".
# Rows are gathered in lists and converted to DataFrames at checkpoints and at
# the end; concatenating one row at a time re-copies the whole table per row.
split_columns = origin_train_valid_test_pd.columns
before40_rows = []
after40_rows = []
before40_valid_pd = pd.DataFrame(columns=split_columns)
after40_valid_pd = pd.DataFrame(columns=split_columns)
for i, row in enumerate(origin_valid_pd.to_dict('records')):
    if i % 1000 == 0:
        # Progress message plus a checkpoint of what has been sorted so far.
        print(f"valid: {i}")
        before40_valid_pd = pd.DataFrame(before40_rows, columns=split_columns)
        after40_valid_pd = pd.DataFrame(after40_rows, columns=split_columns)
        before40_valid_pd.to_csv(before40_valid_csv, index=False)
        after40_valid_pd.to_csv(after40_valid_csv, index=False)

    path = row['Path']
    path_elements_lst = path.split('/')
    patient = path_elements_lst[2]
    study = path_elements_lst[3]
    view = path_elements_lst[4]

    age = int(row['Age'])
    if age <= 40:
        after_study = os.path.join(before40_valid_folder, patient, study)
        before40_rows.append(row)
    else:
        after_study = os.path.join(after40_valid_folder, patient, study)
        after40_rows.append(row)

    os.makedirs(after_study, exist_ok=True)
    before_path = os.path.join(origin_valid_folder, patient, study, view)
    after_path = os.path.join(after_study, view)
    shutil.copy(before_path, after_path)

before40_valid_pd = pd.DataFrame(before40_rows, columns=split_columns)
after40_valid_pd = pd.DataFrame(after40_rows, columns=split_columns)
before40_valid_pd.to_csv(before40_valid_csv, index=False)
after40_valid_pd.to_csv(after40_valid_csv, index=False)

valid: 0
valid: 1000
valid: 2000
valid: 3000
valid: 4000
valid: 5000
valid: 6000
valid: 7000
valid: 8000
valid: 9000
valid: 10000
valid: 11000
valid: 12000
valid: 13000
valid: 14000
valid: 15000
valid: 16000
valid: 17000
valid: 18000
valid: 19000
valid: 20000
valid: 21000
valid: 22000


In [13]:
# Sanity check: row counts of the two age groups of the valid set and their sum.
valid_age_sizes = [len(before40_valid_pd), len(after40_valid_pd)]
print(f"{valid_age_sizes[0]} + {valid_age_sizes[1]} = {sum(valid_age_sizes)}")

3293 + 19004 = 22297


## Separation of test set of origin into before40, after40

In [14]:
# Split the origin test table by age and copy each image accordingly.
# Note the boundary: Age <= 40 goes to "before40", everything older to "after40".
# Rows are gathered in lists and converted to DataFrames at checkpoints and at
# the end; concatenating one row at a time re-copies the whole table per row.
split_columns = origin_train_valid_test_pd.columns
before40_rows = []
after40_rows = []
before40_test_pd = pd.DataFrame(columns=split_columns)
after40_test_pd = pd.DataFrame(columns=split_columns)
for i, row in enumerate(origin_test_pd.to_dict('records')):
    if i % 1000 == 0:
        # Progress message plus a checkpoint of what has been sorted so far.
        print(f"test: {i}")
        before40_test_pd = pd.DataFrame(before40_rows, columns=split_columns)
        after40_test_pd = pd.DataFrame(after40_rows, columns=split_columns)
        before40_test_pd.to_csv(before40_test_csv, index=False)
        after40_test_pd.to_csv(after40_test_csv, index=False)

    path = row['Path']
    path_elements_lst = path.split('/')
    patient = path_elements_lst[2]
    study = path_elements_lst[3]
    view = path_elements_lst[4]

    age = int(row['Age'])
    if age <= 40:
        after_study = os.path.join(before40_test_folder, patient, study)
        before40_rows.append(row)
    else:
        after_study = os.path.join(after40_test_folder, patient, study)
        after40_rows.append(row)

    os.makedirs(after_study, exist_ok=True)
    before_path = os.path.join(origin_test_folder, patient, study, view)
    after_path = os.path.join(after_study, view)
    shutil.copy(before_path, after_path)

before40_test_pd = pd.DataFrame(before40_rows, columns=split_columns)
after40_test_pd = pd.DataFrame(after40_rows, columns=split_columns)
before40_test_pd.to_csv(before40_test_csv, index=False)
after40_test_pd.to_csv(after40_test_csv, index=False)

test: 0
test: 1000
test: 2000
test: 3000
test: 4000
test: 5000
test: 6000
test: 7000
test: 8000
test: 9000
test: 10000
test: 11000
test: 12000
test: 13000
test: 14000
test: 15000
test: 16000
test: 17000
test: 18000
test: 19000
test: 20000
test: 21000
test: 22000
test: 23000
test: 24000
test: 25000
test: 26000
test: 27000
test: 28000
test: 29000
test: 30000
test: 31000
test: 32000
test: 33000
test: 34000
test: 35000
test: 36000
test: 37000
test: 38000
test: 39000
test: 40000
test: 41000
test: 42000
test: 43000
test: 44000


In [15]:
# Show the row counts of the before40 and after40 test frames and their sum.
print(f"{len(before40_test_pd)} + {len(after40_test_pd)} = {len(before40_test_pd) + len(after40_test_pd)}")

6896 + 37788 = 44684


## Separate symptoms

In [27]:
# Prepare one folder tree per symptom under ./symptoms/<symptom>/ with
# train/valid/test sub-folders, write a header-only CSV for each split, and
# keep one (initially empty) DataFrame per symptom and split in three lists.
symptoms = ['Cardiomegaly','Edema','Consolidation','Atelectasis','Pleural Effusion']
symptoms_train_pds, symptoms_valid_pds, symptoms_test_pds = [], [], []
split_frame_lists = (
    ('train', symptoms_train_pds),
    ('valid', symptoms_valid_pds),
    ('test', symptoms_test_pds),
)
for symptom in symptoms:
    symptom_folder = os.path.join(os.getcwd(), 'symptoms', symptom)
    os.makedirs(symptom_folder, exist_ok=True)
    for split_name, split_frames in split_frame_lists:
        os.makedirs(os.path.join(symptom_folder, split_name), exist_ok=True)
        # Header-only frame sharing the columns of the combined CSV.
        empty_split_pd = pd.DataFrame(columns=origin_train_valid_test_pd.columns)
        empty_split_pd.to_csv(os.path.join(symptom_folder, split_name + '.csv'), index=False)
        split_frames.append(empty_split_pd)

In [28]:
# For every row of the origin train set and every symptom, decide whether the
# row is selected for that symptom. Selected rows are added to
# symptoms_train_pds[j] (with the first column rewritten to point at the train
# split) and the image is copied to symptoms/<symptom>/train/<patient>/<study>/.
#
# Rows are buffered in plain lists and merged into the frames only at the
# periodic checkpoints and at the end. Calling pd.concat once per row copies
# the whole frame each time, which is quadratic in the number of rows.
train_columns = list(origin_train_pd.columns)
train_symptom_folders = [os.path.join(os.getcwd(), 'symptoms', symptom) for symptom in symptoms]
train_base_pds = list(symptoms_train_pds)      # frames as they were when this cell started
train_new_rows = [[] for _ in symptoms]        # rows collected by this cell, per symptom


def _flush_train_rows(j):
    """Rebuild symptoms_train_pds[j] from its starting frame plus the rows buffered so far."""
    collected = pd.DataFrame(train_new_rows[j], columns=train_columns)
    symptoms_train_pds[j] = pd.concat([train_base_pds[j], collected], ignore_index=True)


for i, (_, row) in enumerate(origin_train_pd.iterrows()):
    is_checkpoint = i % 1000 == 0
    if is_checkpoint:
        print(f"train:{i}", end=" ")

    # 'Path' is split on '/'; elements 2, 3 and 4 are used as patient, study
    # and image file name.
    path_elements_lst = row['Path'].split('/')
    patient, study, view = path_elements_lst[2:5]

    row_values = list(row.values)
    # The first column is overwritten with the path inside the train split.
    row_values[0] = f"{path_elements_lst[0]}/train/{patient}/{study}/{view}"
    before_path = os.path.join(origin_train_folder, patient, study, view)

    for j, symptom in enumerate(symptoms):
        symptom_folder = train_symptom_folders[j]
        if is_checkpoint:
            # Checkpoint: persist what has been collected for this symptom so far.
            print(f"{symptom}", end=" ")
            _flush_train_rows(j)
            symptoms_train_pds[j].to_csv(os.path.join(symptom_folder,'train.csv'),index=False)

        is_positive = row[symptom]
        if np.isnan(is_positive):
            continue
        # Symptoms at index 1 and 3 keep every non-zero label (negative values
        # included); all other symptoms keep only labels greater than zero.
        # NOTE(review): presumably negative values mark uncertain labels - confirm.
        if j == 1 or j == 3:
            is_selected = is_positive != 0
        else:
            is_selected = is_positive > 0
        if not is_selected:
            continue

        train_new_rows[j].append(row_values)
        after_study = os.path.join(symptom_folder, 'train', patient, study)
        os.makedirs(after_study,exist_ok=True)
        shutil.copy(before_path, os.path.join(after_study, view))
    if is_checkpoint:
        print()

# Final frames and final CSV files.
for j, symptom_folder in enumerate(train_symptom_folders):
    _flush_train_rows(j)
    symptoms_train_pds[j].to_csv(os.path.join(symptom_folder,'train.csv'),index=False)

train:0
train:1000
train:2000
train:3000
train:4000
train:5000
train:6000
train:7000
train:8000
train:9000
train:10000
train:11000
train:12000
train:13000
train:14000
train:15000
train:16000
train:17000
train:18000
train:19000
train:20000
train:21000
train:22000
train:23000
train:24000
train:25000
train:26000
train:27000
train:28000
train:29000
train:30000
train:31000
train:32000
train:33000
train:34000
train:35000
train:36000
train:37000
train:38000
train:39000
train:40000
train:41000
train:42000
train:43000
train:44000
train:45000
train:46000
train:47000
train:48000
train:49000
train:50000
train:51000
train:52000
train:53000
train:54000
train:55000
train:56000
train:57000
train:58000
train:59000
train:60000
train:61000
train:62000
train:63000
train:64000
train:65000
train:66000
train:67000
train:68000
train:69000
train:70000
train:71000
train:72000
train:73000
train:74000
train:75000
train:76000
train:77000
train:78000
train:79000
train:80000
train:81000
train:82000
train:83000
train

In [29]:
# For every row of the origin valid set and every symptom, decide whether the
# row is selected for that symptom. Selected rows are added to
# symptoms_valid_pds[j] (with the first column rewritten to point at the valid
# split) and the image is copied to symptoms/<symptom>/valid/<patient>/<study>/.
#
# Rows are buffered in plain lists and merged into the frames only at the
# periodic checkpoints and at the end. Calling pd.concat once per row copies
# the whole frame each time, which is quadratic in the number of rows.
valid_columns = list(origin_valid_pd.columns)
valid_symptom_folders = [os.path.join(os.getcwd(), 'symptoms', symptom) for symptom in symptoms]
valid_base_pds = list(symptoms_valid_pds)      # frames as they were when this cell started
valid_new_rows = [[] for _ in symptoms]        # rows collected by this cell, per symptom


def _flush_valid_rows(j):
    """Rebuild symptoms_valid_pds[j] from its starting frame plus the rows buffered so far."""
    collected = pd.DataFrame(valid_new_rows[j], columns=valid_columns)
    symptoms_valid_pds[j] = pd.concat([valid_base_pds[j], collected], ignore_index=True)


for i, (_, row) in enumerate(origin_valid_pd.iterrows()):
    is_checkpoint = i % 1000 == 0
    if is_checkpoint:
        print(f"valid:{i}", end=" ")

    # 'Path' is split on '/'; elements 2, 3 and 4 are used as patient, study
    # and image file name.
    path_elements_lst = row['Path'].split('/')
    patient, study, view = path_elements_lst[2:5]

    row_values = list(row.values)
    # The first column is overwritten with the path inside the valid split.
    row_values[0] = f"{path_elements_lst[0]}/valid/{patient}/{study}/{view}"
    before_path = os.path.join(origin_valid_folder, patient, study, view)

    for j, symptom in enumerate(symptoms):
        symptom_folder = valid_symptom_folders[j]
        if is_checkpoint:
            # Checkpoint: refresh the frame first so the printed count is current,
            # then persist what has been collected for this symptom so far.
            _flush_valid_rows(j)
            print(f"{symptom}({len(symptoms_valid_pds[j])})", end=" ")
            symptoms_valid_pds[j].to_csv(os.path.join(symptom_folder,'valid.csv'),index=False)

        is_positive = row[symptom]
        if np.isnan(is_positive):
            continue
        # Symptoms at index 1 and 3 keep every non-zero label (negative values
        # included); all other symptoms keep only labels greater than zero.
        # NOTE(review): presumably negative values mark uncertain labels - confirm.
        if j == 1 or j == 3:
            is_selected = is_positive != 0
        else:
            is_selected = is_positive > 0
        if not is_selected:
            continue

        valid_new_rows[j].append(row_values)
        after_study = os.path.join(symptom_folder, 'valid', patient, study)
        os.makedirs(after_study,exist_ok=True)
        shutil.copy(before_path, os.path.join(after_study, view))
    if is_checkpoint:
        print()

# Final frames and final CSV files.
for j, symptom_folder in enumerate(valid_symptom_folders):
    _flush_valid_rows(j)
    symptoms_valid_pds[j].to_csv(os.path.join(symptom_folder,'valid.csv'),index=False)


valid:0
valid:1000
valid:2000
valid:3000
valid:4000
valid:5000
valid:6000
valid:7000
valid:8000
valid:9000
valid:10000
valid:11000
valid:12000
valid:13000
valid:14000
valid:15000
valid:16000
valid:17000
valid:18000
valid:19000
valid:20000
valid:21000
valid:22000


In [30]:
# For every row of the origin test set and every symptom, decide whether the
# row is selected for that symptom. Selected rows are added to
# symptoms_test_pds[j] (with the first column rewritten to point at the test
# split) and the image is copied to symptoms/<symptom>/test/<patient>/<study>/.
#
# Rows are buffered in plain lists and merged into the frames only at the
# periodic checkpoints and at the end. Calling pd.concat once per row copies
# the whole frame each time, which is quadratic in the number of rows.
test_row_columns = list(origin_test_pd.columns)
test_symptom_folders = [os.path.join(os.getcwd(), 'symptoms', symptom) for symptom in symptoms]
test_base_pds = list(symptoms_test_pds)        # frames as they were when this cell started
test_new_rows = [[] for _ in symptoms]         # rows collected by this cell, per symptom


def _flush_test_rows(j):
    """Rebuild symptoms_test_pds[j] from its starting frame plus the rows buffered so far."""
    collected = pd.DataFrame(test_new_rows[j], columns=test_row_columns)
    symptoms_test_pds[j] = pd.concat([test_base_pds[j], collected], ignore_index=True)


for i, (_, row) in enumerate(origin_test_pd.iterrows()):
    is_checkpoint = i % 1000 == 0
    if is_checkpoint:
        print(f"test:{i}", end=" ")

    # 'Path' is split on '/'; elements 2, 3 and 4 are used as patient, study
    # and image file name.
    path_elements_lst = row['Path'].split('/')
    patient, study, view = path_elements_lst[2:5]

    row_values = list(row.values)
    # The first column is overwritten with the path inside the test split.
    row_values[0] = f"{path_elements_lst[0]}/test/{patient}/{study}/{view}"
    before_path = os.path.join(origin_test_folder, patient, study, view)

    for j, symptom in enumerate(symptoms):
        symptom_folder = test_symptom_folders[j]
        if is_checkpoint:
            # Checkpoint: refresh the frame first so the printed count is current,
            # then persist what has been collected for this symptom so far.
            _flush_test_rows(j)
            print(f"{symptom}({len(symptoms_test_pds[j])})", end=" ")
            symptoms_test_pds[j].to_csv(os.path.join(symptom_folder,'test.csv'),index=False)

        is_positive = row[symptom]
        if np.isnan(is_positive):
            continue
        # Symptoms at index 1 and 3 keep every non-zero label (negative values
        # included); all other symptoms keep only labels greater than zero.
        # NOTE(review): presumably negative values mark uncertain labels - confirm.
        if j == 1 or j == 3:
            is_selected = is_positive != 0
        else:
            is_selected = is_positive > 0
        if not is_selected:
            continue

        test_new_rows[j].append(row_values)
        after_study = os.path.join(symptom_folder, 'test', patient, study)
        os.makedirs(after_study,exist_ok=True)
        shutil.copy(before_path, os.path.join(after_study, view))
    if is_checkpoint:
        print()

# Final frames and final CSV files.
for j, symptom_folder in enumerate(test_symptom_folders):
    _flush_test_rows(j)
    symptoms_test_pds[j].to_csv(os.path.join(symptom_folder,'test.csv'),index=False)

test:0
test:1000
test:2000
test:3000
test:4000
test:5000
test:6000
test:7000
test:8000
test:9000
test:10000
test:11000
test:12000
test:13000
test:14000
test:15000
test:16000
test:17000
test:18000
test:19000
test:20000
test:21000
test:22000
test:23000
test:24000
test:25000
test:26000
test:27000
test:28000
test:29000
test:30000
test:31000
test:32000
test:33000
test:34000
test:35000
test:36000
test:37000
test:38000
test:39000
test:40000
test:41000
test:42000
test:43000
test:44000
