Created: 2020.07.17

Modified: 2020.07.11

### Prepare csv files with file paths to images and masks

The paths saved to the csv files are used to build a DataBunch object in fastai. While the DataFrame is built, three values are compared to make sure that each image is paired with its corresponding mask. We check the following values:
- folder name (i.e. subject name)
- 2D slice number
- set name: "val" or "train"

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import glob
import pandas as pd
from pathlib import Path

In [3]:
import sys
sys.path.append('functions')

%aimport functions00, functions01
from functions00 import *
from functions01 import *

import functions24 as fun

In [4]:
# Print the host name and the data / repository paths configured for this
# machine (see the "Settings" output below).
# NOTE(review): `mk_get_host_info` is not defined in this notebook; presumably
# it comes from the star imports of functions00 / functions01 — confirm.
mk_get_host_info()

**********************************************************************************************************************************
Settings:
	HOST:  mmiv-ml-titan
	PATH_ROOT_DATA:  /data-10tb/shared/skull/train-3d-iso
	PATH_GIT_HUB:  /data-10tb/marek/github_codes/skull-stripping-1/fastai
	PATH_2D: /data-10tb/shared/skull

3D DFs paths (_mk_3D):
	IXI_TEST_3D: /data-10tb/marek/github_codes/skull-stripping-1/fastai/2.2_train_valid_test_sets/ixi_test_mk_3d.csv
	TEST_3D : /data-10tb/marek/github_codes/skull-stripping-1/fastai/2.2_train_valid_test_sets/test_mk_3d.csv
	TRAIN_VAL_3D: /data-10tb/marek/github_codes/skull-stripping-1/fastai/2.2_train_valid_test_sets/train_val_mk_3d.csv
	NFBS_TEST_3D: /data-10tb/marek/github_codes/skull-stripping-1/fastai/2.2_train_valid_test_sets/nfbs_test_mk_3d.csv

Error files (pickle):
	ERROR_FILES: /data-10tb/marek/github_codes/skull-stripping-1/fastai/2.2_train_valid_test_sets/error_files
**********************************************************************

### AXIAL / CORONAL / SAGITTAL - train & valid

In [68]:
%%time
# Build, for every cross-section, two tables that pair each 2D image slice
# (plain T1 and bias-corrected T1) with its brain-mask slice, and optionally
# save them to csv through fun.save_df.
cross_sections = ['axial', 'coronal', 'sagittal']
save = 1  # truthy -> write the tables to csv

for cross_sec in cross_sections:
    section_dir = f'{cross_sec}-2d'

    # The three listings are sorted so that the i-th entries of each list are
    # expected to belong together; the assertions below verify that pairing.
    t1_all = sorted(glob.glob(f'/data-10tb/shared/skull/{cross_sec}-2d/*/*/*/T1_iso_*.png'))
    bias_all = sorted(glob.glob(f'/data-10tb/shared/skull/{cross_sec}-2d/*/*/*/T1_biascorr_iso_*.png'))
    mask_all = sorted(glob.glob(f'/data-10tb/shared/skull/{cross_sec}-2d/*/*/*/*brain_mask_iso_*.png'))

    assert len(t1_all)==len(mask_all), f'Different image number t1 and mask in {cross_sec}-2d!!!'
    assert len(bias_all)==len(mask_all), f'Different image number t1_biascorr and mask in {cross_sec}-2d!!!'

    root_list = []
    t1_list = []
    t1_full_list = []
    bias_list = []
    bias_full_list = []
    mask_list = []
    usage_list = []

    tot = len(t1_all)
    for i, (t, b, m) in enumerate(zip(t1_all, bias_all, mask_all), start=1):
        t, b, m = Path(t), Path(b), Path(m)

        # 1) same parent folder name for image and mask
        assert t.parent.name == m.parent.name, f'Different parent folder names in {t} and {m}'
        assert b.parent.name == m.parent.name, f'Different parent folder names in {b} and {m}'

        # 2) same set name: a path containing 'train' is a training sample,
        #    anything else is treated as validation
        train_val_t = 'train' if 'train' in str(t) else 'val'
        train_val_b = 'train' if 'train' in str(b) else 'val'
        train_val_m = 'train' if 'train' in str(m) else 'val'

        assert train_val_t == train_val_m, f'Different train_val name in {t} and {m}'
        assert train_val_b == train_val_m, f'Different train_val name in {b} and {m}'

        # 3) same slice number, i.e. the last '_'-separated token of the file
        #    stem (e.g. 001)
        t_nr = t.stem.split('_')[-1]
        b_nr = b.stem.split('_')[-1]
        m_nr = m.stem.split('_')[-1]

        # The messages must reference `m`: the previously used name `bm` does
        # not exist, so a mismatch raised NameError instead of AssertionError.
        assert t_nr == m_nr, f'Different image number in {t} and {m}'
        assert b_nr == m_nr, f'Different image number in {b} and {m}'

        # Split each path into the part up to '<cross_sec>-2d' (root) and the
        # remainder that is relative to that folder.
        root, t1 = str(t).split(section_dir)
        bias = str(b).split(section_dir)[1]

        root_list.append(root + section_dir)
        t1_list.append(t1)
        t1_full_list.append(str(t))
        bias_list.append(bias)
        bias_full_list.append(str(b))
        mask_list.append(str(m))
        usage_list.append(train_val_t)

        # single-line progress indicator
        print(f'{i:06}/{tot}', end='\r')

    print()
    df_t1 = pd.DataFrame.from_dict({'root':root_list, 't1_path':t1_list, 'image_full_path':t1_full_list, 'mask_full_path':mask_list, 'usage_txt':usage_list})
    df_bias = pd.DataFrame.from_dict({'root':root_list, 'bias_path':bias_list,'image_full_path':bias_full_list, 'mask_full_path':mask_list, 'usage_txt':usage_list})

    # Boolean flag: True for validation rows, False for training rows.
    df_t1['usage'] = df_t1['usage_txt'] == 'val'
    df_bias['usage'] = df_bias['usage_txt'] == 'val'

    if save:
        fun.save_df(df_t1, cross_sec, 't1_mask')
        fun.save_df(df_bias, cross_sec, 'bias_mask')

    print(f'{cross_sec} {df_t1.shape[0]}..done\n')

471836/471836
Saved files:
	/data-10tb/marek/github_codes/skull-stripping-1/fastai/2.4_train_val_3d_path_tables/t1_mask-test-val-axial-2d.csv
	/data-10tb/shared/skull/axial-2d/t1_mask-test-val-axial-2d.csv
Saved files:
	/data-10tb/marek/github_codes/skull-stripping-1/fastai/2.4_train_val_3d_path_tables/bias_mask-test-val-axial-2d.csv
	/data-10tb/shared/skull/axial-2d/bias_mask-test-val-axial-2d.csv
axial 471836..done

668562/668562
Saved files:
	/data-10tb/marek/github_codes/skull-stripping-1/fastai/2.4_train_val_3d_path_tables/t1_mask-test-val-coronal-2d.csv
	/data-10tb/shared/skull/coronal-2d/t1_mask-test-val-coronal-2d.csv
Saved files:
	/data-10tb/marek/github_codes/skull-stripping-1/fastai/2.4_train_val_3d_path_tables/bias_mask-test-val-coronal-2d.csv
	/data-10tb/shared/skull/coronal-2d/bias_mask-test-val-coronal-2d.csv
coronal 668562..done

523754/523754
Saved files:
	/data-10tb/marek/github_codes/skull-stripping-1/fastai/2.4_train_val_3d_path_tables/t1_mask-test-val-sagittal-2d.c

In [69]:
# Number of T1, bias-corrected and mask paths left over from the last
# cross-section processed in the previous cell; all three should be equal.
print(*map(len, (t1_all, bias_all, mask_all)))

523754 523754 523754


### Check that all files listed in the path tables exist on disk

Test if all paths point to existing images.

In [5]:
%%time
# Run the existence check on every csv path table found in the repository's
# 2.4_train_val_2d_path_tables folder, in alphabetical order.
csv_files = sorted(glob.glob(f'/{PATH_GIT_HUB}/2.4_train_val_2d_path_tables/*.csv'))
for csv_path in csv_files:
    fun.file_exist2(csv_path)

bias_mask-test-val-axial-2d.csv
bias_mask-test-val-coronal-2d.csv
bias_mask-test-val-sagittal-2d.csv
t1_mask-test-val-axial-2d.csv
t1_mask-test-val-coronal-2d.csv
t1_mask-test-val-sagittal-2d.csv
CPU times: user 7min 15s, sys: 1min 7s, total: 8min 23s
Wall time: 7min 18s


## Remove failed files (based on Sathiesh's error list)

#### All csv files in the folder 2.4_train_val_2d_path_tables

In [7]:
# Select which csv tables to clean by listing their digits in `remove`,
# e.g. '123456' for all of them; '0' selects none.
# remove = '123456'
remove = '0'

# digit -> csv table that is cleaned when the digit occurs in `remove`
csv_by_digit = {
    '1': 'bias_mask-test-val-axial-2d.csv',
    '2': 'bias_mask-test-val-coronal-2d.csv',
    '3': 'bias_mask-test-val-sagittal-2d.csv',
    '4': 't1_mask-test-val-axial-2d.csv',
    '5': 't1_mask-test-val-coronal-2d.csv',
    '6': 't1_mask-test-val-sagittal-2d.csv',
}

for digit, csv_name in csv_by_digit.items():
    if digit not in remove:
        continue
    fun.remove_fialed_files_from_csv_file(csv_name, replace_csv_file=True)

## An old version

In [None]:
def file_exist(df):
    """Print every T1 / mask path listed in ``df`` that does not exist on disk.

    The frame must provide the columns ``root``, ``t1_path`` and
    ``mask_path``. The ``root`` value of the first row is used as the prefix
    for all rows; each relative path is appended to it by plain string
    concatenation. A ``k/total`` progress counter is written on a single
    line (carriage return, no newline).

    Parameters
    ----------
    df : pandas.DataFrame
        Table of relative image and mask paths.

    Returns
    -------
    None
        Missing paths are reported on stdout only. An empty frame produces
        no output.
    """
    total = df.shape[0]
    if total == 0:
        # Nothing to check; also avoids indexing into an empty column.
        return

    # Positional access: label 0 may be absent after filtering or re-indexing.
    root = Path(df.root.iloc[0]).as_posix()

    for k, (img, msk) in enumerate(zip(df.t1_path, df.mask_path), start=1):
        for rel in (img, msk):
            if not Path(root + rel).exists():
                print(rel)
        # 1-based so that the counter ends at total/total
        print(f'{k:07}/{total}', end='\r')

In [3]:
# Old version: one flat table per cross-section that lists every T1,
# bias-corrected and mask png together with a train / val label.
cross_sections = ['axial', 'coronal', 'sagittal']
save = 1

for cross_sec in cross_sections:
    t1 = glob.glob(f'/data-10tb/shared/skull/{cross_sec}-2d/*/*/*/T1_iso_*.png')
    bias = glob.glob(f'/data-10tb/shared/skull/{cross_sec}-2d/*/*/*/T1_biascorr_iso_*.png')
    mask = glob.glob(f'/data-10tb/shared/skull/{cross_sec}-2d/*/*/*/*brain_mask_iso_*.png')

    # all three listings concatenated in the order t1, bias, mask
    all_ = [*t1, *bias, *mask]

    df = pd.DataFrame.from_dict({'path':all_})
    # label rows by the substring found in their path
    for token, label in (('train', 'train'), ('valid', 'val')):
        df.loc[df['path'].str.contains(token), 'usage'] = label

    if save:
        save_folder = Path(f'/data-10tb/marek/github_codes/skull-stripping-1/fastai/2.4_train_val_2d_path_tables')
        save_folder.mkdir(parents=True, exist_ok=True)

        pth = save_folder / f'test-val-{cross_sec}-2d.csv'
        df.to_csv(pth, index=False)

    print(f'Done:\t{cross_sec} {df.shape[0]}')
    

Done:	axial ((1277298, 2))
Done:	coronal ((1808145, 2))
Done:	sagittal ((1374648, 2))


In [5]:
%%time
# Check the paths of the `df` left over from the loop in the previous cell.
# NOTE(review): `file_exist2` is called unqualified here but as
# `fun.file_exist2(<csv path>)` elsewhere in this notebook; presumably this
# old cell relied on a star import and a DataFrame argument — confirm before
# re-running.
file_exist2(df)

CPU times: user 2min 35s, sys: 27.8 s, total: 3min 3s
Wall time: 2min 33s
