# Preview the README file

In [None]:
!cat ../datasets/bimcv-covid19/README.md

# Preview the dataset description

In [None]:
!cat ../datasets/bimcv-covid19/dataset_description.json

In [None]:
!nvcc --version

In [None]:
!nvcc --version

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
    print('and then re-execute this cell.')
else:
    print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
if ram_gb < 20:
    print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
    print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
    print('re-execute this cell.')
else:
    print('You are using a high-RAM runtime!')

# Preview the participants file

In [None]:
import pandas as pd
# Load the tab-separated participants table from the dataset root and show
# summary statistics. The directory name is 'bimcv-covid19', the same root
# used by every other path in this notebook.
participants = pd.read_csv('../datasets/bimcv-covid19/participants.tsv', delimiter='\t')
participants.describe()

# Create a map from each subject to its sessions and scans

In [None]:
import os
from IPython.display import display, HTML

# Flip to True to render every loaded table inline while the map is built.
debug = False

root = '../datasets/bimcv-covid19/'

# Patient folders: entries under root whose name starts with 'sub' and
# contains no dot.
subject_dirs = []
for entry in os.listdir(root):
    if entry.startswith('sub') and '.' not in entry:
        subject_dirs.append(entry)
print('There are %d directories with patient data' % len(subject_dirs))

# subject id -> {'sessions_df': sessions table, <session id>: scans table, ...}
subject_to_sessions = {}
for subject_dir in subject_dirs:
    print(subject_dir)

    # Per-subject sessions table.
    sessions_path = '%s%s/%s_sessions.tsv' % (root, subject_dir, subject_dir)
    sessions_df = pd.read_csv(sessions_path, delimiter='\t')
    if debug:
        display(HTML(sessions_df.to_html()))

    per_session = {'sessions_df': sessions_df}

    # One scans table per session listed in the sessions table.
    for session_id in sessions_df['session_id']:
        scans_path = '%s%s/%s/%s_%s_scans.tsv' % (root, subject_dir, session_id, subject_dir, session_id)
        scans_df = pd.read_csv(scans_path, delimiter='\t')
        if debug:
            display(HTML(scans_df.to_html()))
        per_session[session_id] = scans_df

    subject_to_sessions[subject_dir] = per_session

# Preview each image and decide if it is a lateral or frontal view

In [None]:
def convert_filename(filename):
    """Rewrite a ``filename`` entry of a scans table into candidate image names.

    Returns a list of strings:

    * ``[]`` when the entry contains ``'.gz'`` (only X-rays are considered;
      per the original note, CT scans are stored in an archive).
    * One name per comma-separated piece when the entry contains commas,
      each with its third and fifth ``'_'``-separated parts exchanged.
    * Otherwise a single-element list with the rewritten name.

    Raises ``IndexError`` when a name has too few ``'_'``-separated parts
    for the swap that applies to it.
    """
    if '.gz' in filename:
        return []

    # A single entry may hold several names separated by commas.
    if ',' in filename:
        reordered = []
        for piece in filename.split(','):
            tokens = piece.split('_')
            tokens[4], tokens[2] = tokens[2], tokens[4]
            reordered.append('_'.join(tokens))
        return reordered

    if '_cr' in filename:
        # Move the '_bp-chest' part so that it sits directly before '_cr'.
        renamed = filename.replace('_bp-chest', '')
        renamed = renamed.replace('_cr', '_bp-chest_cr')
    elif 'acq-1' in filename:
        print(filename)
        renamed = filename
        # Applied strictly in this order; each step works on the previous result.
        for old, new in (
            ('_bp-chest', ''),
            ('_vp', '_bp-chest_vp'),
            ('_acq-1', ''),
            ('pa_dx', 'ap_dx'),
        ):
            renamed = renamed.replace(old, new)
    else:
        # Exchange the third and fourth '_'-separated parts.
        tokens = filename.split('_')
        tokens[2:4] = [tokens[3], tokens[2]]
        renamed = '_'.join(tokens)

    # Final fix-up applied to every single-name result.
    return [renamed.replace('vp-ap_bp-chest_cr', 'bp-chest_vp-ap_cr')]

# Preview and Manual Labelling of Frontal/Lateral Views

In [None]:
from matplotlib import pyplot as plt
from PIL import Image
import numpy as np
from IPython.display import clear_output

# Walk every subject / session / scan, display each image and ask the user
# whether it is a frontal view. One row per image is collected in
# images_to_session_day_name_path for export.
counter = 0
images_to_session_day_name_path = []
for subject_id, sessions in subject_to_sessions.items():
    print(counter, subject_id)
    counter += 1
    for session, scans in sessions.items():
        # The 'sessions_df' key holds the subject's sessions table, not a session.
        if session == 'sessions_df':
            continue
        print('\t\t', session)
        for filename in scans['filename']:
            for filename_corrected in convert_filename(filename):
                print('\t\t\t', filename_corrected)
                full_path = '%s%s/%s/%s' % (root, subject_id, session, filename_corrected)

                # Try the expected path first, then two naming variants.
                # Only OSError (missing or unreadable image) triggers a
                # fallback, so Ctrl-C and genuine bugs are no longer swallowed.
                try:
                    im_frame = Image.open(full_path)
                except OSError:
                    pa_path = full_path.replace('ap_dx', 'pa_dx')
                    try:
                        im_frame = Image.open(pa_path)
                    except OSError:
                        # Last resort: exchange the 4th and 5th '_'-separated
                        # parts of the whole path. A failure here propagates.
                        full_path_split = full_path.split('_')
                        full_path_split[3], full_path_split[4] = full_path_split[4], full_path_split[3]
                        full_path = '_'.join(full_path_split)
                        im_frame = Image.open(full_path)
                    else:
                        # Record the path that was actually opened.
                        full_path = pa_path

                np_frame = np.array(im_frame)
                plt.imshow(np_frame)
                plt.show()
                # An empty answer (just pressing Enter) counts as 'y'.
                is_frontal = input('Is frontal (y/n)') or 'y'
                clear_output(wait=True)
                sessions_table = sessions['sessions_df']
                date = sessions_table[sessions_table['session_id'] == session]['study_date'].values[0]
                # Age and sex come from the first row of the session's scans
                # table, from the first column whose name matches the pattern.
                age = scans.filter(regex='Patient\'s Age ').values[0][0]
                sex = scans.filter(regex='Patient\'s Sex ').values[0][0]
                images_to_session_day_name_path.append([subject_id, session, date, age, sex, filename, full_path, is_frontal])

In [None]:
 # Write the collected label rows to 'bimcv-covid19.csv' (index included).
 label_columns = ['Subject', 'Session', 'Date', 'Age', 'Sex', 'Filename', 'Path', 'Is frontal']
 labels_df = pd.DataFrame(images_to_session_day_name_path, columns=label_columns)
 labels_df.to_csv('bimcv-covid19.csv')

In [None]:
import pandas as pd
# Read the exported labels back, using the first CSV column as the index,
# and leave the frame as the cell's last expression so it is displayed.
saved_labels = pd.read_csv('bimcv-covid19.csv', index_col=0)
saved_labels