In [None]:
import sys; sys.version_info

In [None]:
import pandas as pd
import os
from collections import Counter

In [None]:
# Root folder of the dataset, read from the environment (None when the variable is not set)
DATASET_DIR = os.getenv('DATASET_DIR_COVID_UC')

### Load given dataframe

In [None]:
LABEL_COL = 'Resultado consenso BSTI'

In [None]:
fpath = os.path.join(DATASET_DIR, 'anonymus_match.xlsx')
main_df = pd.read_excel(fpath)
main_df.head()

In [None]:
# Discard the columns that do not hold a single value
main_df = main_df.dropna(axis='columns', how='all')

In [None]:
# Columns whose name contains "unnamed", ignoring case
unnamed_cols = list(filter(lambda name: 'unnamed' in name.lower(), main_df.columns))
unnamed_cols

In [None]:
Counter(main_df['Clasificación BSTI'])

In [None]:
# Count how often each value of the label column occurs
Counter(main_df[LABEL_COL])

In [None]:
main_df['PCR']

#### Check patients whose ID contains `_`

In [None]:
# Base IDs (the part before the first "_") of every ID that contains an underscore
double_base_ids = {pid.split('_')[0] for pid in main_df['ID'] if '_' in str(pid)}
patients_double = list(double_base_ids)
patients_double

In [None]:
# Columns that are expected to hold a single value across all the entries of a patient
target_cols = ['Edad', 'Sexo']

for patient_id in patients_double:
    # All the IDs sharing this base ID, e.g. '185', '185_2', '185_3'
    patients = [v for v in main_df['ID'] if str(v).split('_')[0] == str(patient_id)]
    # Select from `main_df`: it is the frame holding the target columns, and the only
    # one that exists at this point of the notebook
    sub_df = main_df.loc[main_df['ID'].isin(patients)]

    for col in target_cols:
        # `unique()` collapses repeated NaN into one entry, so rows that are all
        # missing are not reported as a mismatch
        values = set(sub_df[col].unique())
        if len(values) != 1:
            print(patient_id, col, values, patients)

In [None]:
main_df.loc[main_df['ID'].isin(['185', '185_2', '185_3'])]

#### Check indeterminados

In [None]:
# Individual classifications of the rows whose label is 'Indeterminado'
cols = ['Clasificación BSTI', 'Clasificación BSTI.1']
is_undetermined = main_df[LABEL_COL] == 'Indeterminado'
main_df.loc[is_undetermined, cols]

### Read images

In [None]:
from collections import defaultdict

In [None]:
import pydicom
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

In [None]:
%run ../../utils/common.py

In [None]:
# Root folder of the raw DICOM data, read from the environment, and its two sub-folders
DATA_DIR = os.getenv('DATASET_DIR_COVID_UC_RAW_DATA')
DATA_DIR_ANON_3, DATA_DIR_TEST_ANON = (
    os.path.join(DATA_DIR, folder) for folder in ('anon_3', 'test_anon')
)

#### Inspect one sample

In [None]:
# Pick one DICOM file to inspect; the commented-out paths are other samples that can be swapped in
# fpath = os.path.join(DATA_DIR, '95', 'Torax Pa-L', 'Tórax - 1', 'IM-0001-0002-0001.dcm')
# fpath = DATA_DIR_ANON_3 + '/376/Rx Torax Ap-L/unnamed - 100/IM-0001-0001-0001.dcm'
# fpath = DATA_DIR_TEST_ANON + '/10/Rx Torax Ap Portatil/AP horizontal - 1/IM-0001-0001-0001.dcm'
# fpath = DATA_DIR_TEST_ANON + '/3/Examen Previo No Uc/MIP Axial Pulmon - 1004/IM-0001-0045-0001.dcm'
fpath = DATA_DIR_TEST_ANON + '/395/Rx Torax Ap-L/unnamed - 100/IM-0001-0001-0001.dcm'
# Read the file into a pydicom dataset, inspected by the cells below
ds = pydicom.dcmread(fpath)

In [None]:
ds.StudyDescription

In [None]:
print(ds)

In [None]:
ds.pixel_array

In [None]:
ds.StudyDate

In [None]:
image = ds.pixel_array

In [None]:
print(image.shape)

plt.imshow(arr_to_range(image, 0, 255), cmap='gray')

In [None]:
image.min(), image.max()

In [None]:
scaled_image = arr_to_range(image, 0, 255)
scaled_image.min(), scaled_image.max()

In [None]:
fname = 'sample_file.png'

# Rescale here and keep the result, so this cell does not depend on a
# `scaled_image` left behind by another cell
scaled_image = arr_to_range(image, 0, 255)
# Convert to 8-bit grayscale ('L' mode) before writing the PNG
pil_image = Image.fromarray(scaled_image).convert('L')
pil_image.save(fname)

In [None]:
loaded_image = Image.open(fname)
loaded_image.size

In [None]:
plt.imshow(loaded_image, cmap='gray')

In [None]:
np.min(loaded_image), np.max(loaded_image)

#### Dicom to images

In [None]:
%run ../../utils/common.py

In [None]:
from tqdm import tqdm # tqdm_notebook as tqdm

In [None]:
IMAGES_DIR = os.path.join(DATASET_DIR, 'images')

In [None]:
def split_date(date_str):
    """Insert dashes into a compact date string: 'YYYYMMDD' -> 'YYYY-MM-DD'.

    The string is cut by position only (first 4 characters, next 2, the
    rest); no validation of the content is performed.
    """
    pieces = (date_str[:4], date_str[4:6], date_str[6:])
    return '-'.join(pieces)
split_date('20200308')

In [None]:
def iter_dicom_images(data_dir, save_png=False, max_samples=None,
                      prev_images=None,
                     ):
    """Walk every patient folder in `data_dir` and collect info on its X-ray DICOM files.

    Files are skipped and logged in `errors` when their path (relative to the
    patient folder) mentions another kind of study ('scanner', 'tac',
    'ecograf', 'prev'), when they lack the `.dcm` extension, or when they
    cannot be read.

    Args:
        data_dir: folder containing one sub-folder per patient ID.
        save_png: if True, each image is also written as a PNG into `IMAGES_DIR`.
        max_samples: if truthy, only the first `max_samples` patients (sorted
            by folder name) are processed.
        prev_images: optional mapping patient ID -> list of files found in a
            previous run. The image counter of a patient starts at the length
            of that list (presumably so image names do not clash across runs).

    Returns:
        A tuple `(result, images_by_patient, errors)`:
        - result: list of `(patient_id, image_name, view_position, date)` tuples.
        - images_by_patient: dict patient ID -> list of the `.dcm` filenames found.
        - errors: dict error kind -> list of the offending entries.
    """
    # A fresh dict per call, instead of a mutable default shared between calls
    if prev_images is None:
        prev_images = {}

    errors = defaultdict(list)
    images_by_patient = defaultdict(list)
    result = []

    patients = sorted(os.listdir(data_dir))

    if max_samples:
        patients = patients[:max_samples]

    for patient_id in tqdm(patients):
        patient_id = str(patient_id)
        patient_folder = os.path.join(data_dir, patient_id)

        if patient_id in prev_images:
            # There are images present already: continue numbering after them
            image_counter = len(prev_images[patient_id])
        else:
            image_counter = 0

        for basepath, _, filenames in os.walk(patient_folder):
            for filename in sorted(filenames):
                filepath = os.path.join(basepath, filename)

                # Filter only RX. Use the path relative to the patient folder, so
                # neither the data dir nor the patient ID takes part in the check
                # and the rest of the path is left untouched
                filepath_clean = os.path.relpath(filepath, patient_folder)
                filepath_lower = filepath_clean.lower()
                is_other_scan = False
                for scan_type in ('scanner', 'tac', 'ecograf', 'prev'):
                    if scan_type in filepath_lower:
                        errors[scan_type].append(filepath_clean)
                        is_other_scan = True
                if is_other_scan:
                    continue

                if not filename.endswith('.dcm'):
                    errors['non-dcm'].append(filepath)
                    continue
                images_by_patient[patient_id].append(filename)

                # Read DICOM; a file that is not valid DICOM or has no pixel data
                # is logged as broken instead of aborting the whole run
                try:
                    ds = pydicom.dcmread(filepath)
                    image = ds.pixel_array
                except (AttributeError, pydicom.errors.InvalidDicomError):
                    errors['broken'].append(filepath)
                    continue

                # Set image name: <patient>-<counter>-<filename without '.dcm'>.png
                image_name = '-'.join(str(s) for s in [patient_id, image_counter, filename[:-4]])
                image_name += '.png'

                # Read DICOM useful information
                try:
                    view_position = ds.ViewPosition
                except AttributeError:
                    view_position = ''
                    print('NO VIEW POSITION: ', filepath, image_name)
                date = split_date(ds.StudyDate)
                # transfer_syntax = ds.file_meta.TransferSyntaxUID.name

                # HACK: fill empty view positions
                if not view_position:
                    # Only valid for anon_3
                    if data_dir.endswith('anon_3'):
                        if image_name.startswith('312-0'):
                            view_position = 'AP'
                        else:
                            view_position = 'LL'
                    errors['missing-view'].append((filepath, image_name, view_position))

                # Save image to png
                if save_png:
                    image_filepath = os.path.join(IMAGES_DIR, image_name)
                    if os.path.isfile(image_filepath):
                        # The existing file is logged and then overwritten
                        errors['overriden-image'].append((filepath, image_name))
                    # NOTE(review): arr_to_range comes from utils/common.py; presumably
                    # it rescales the pixel values into [0, 255] -- confirm
                    scaled_image = arr_to_range(image, 0, 255)
                    pil_image = Image.fromarray(scaled_image).convert('L')
                    pil_image.save(image_filepath)

                # Save into main list
                result.append((patient_id, image_name, view_position, date))

                image_counter += 1

    return result, images_by_patient, errors

In [None]:
# Pass over anon_3: only the metadata is gathered, no PNG is written
results_anon_3, i_by_p_anon_3, errors1 = iter_dicom_images(
    DATA_DIR_ANON_3,
    save_png=False,
)

In [None]:
# Pass over test_anon: PNGs are written, and the files found for anon_3 are
# handed over as `prev_images`
results_test_anon, i_by_p_test_anon, errors2 = iter_dicom_images(
    DATA_DIR_TEST_ANON,
    save_png=True,
    prev_images=i_by_p_anon_3,
)

##### Check errors

In [None]:
# Gather the error logs of both runs under a single `errors` dict, which is
# the name the cells below inspect
errors = defaultdict(list)
for run_errors in (errors1, errors2):
    for error_kind, entries in run_errors.items():
        errors[error_kind].extend(entries)
errors.keys()

In [None]:
def different_values(strings):
    """Return the set made of the first non-empty '/'-separated piece of each string.

    Strings without any non-empty piece (e.g. '' or '///') contribute nothing.
    """
    first_pieces = (
        next((piece for piece in text.split('/') if piece), None)
        for text in strings
    )
    return {piece for piece in first_pieces if piece is not None}

In [None]:
different_values(errors['ecograf'])

In [None]:
different_values(errors['prev'])

In [None]:
different_values(errors['scanner'])

In [None]:
different_values(errors['tac'])

In [None]:
# Preview the results of the latest run (no `results_2` variable exists in this
# notebook); only the first entries are shown to keep the output short
results_test_anon[:5]

#### DF with image information

In [None]:
columns = ['ID', 'image_name', 'view', 'date']

image_df_anon_3 = pd.DataFrame(results_anon_3, columns=columns)
image_df_anon_3.head()

In [None]:
image_df_test_anon = pd.DataFrame(results_test_anon, columns=columns)
image_df_test_anon.head()

In [None]:
set(image_df_test_anon['image_name']).intersection(image_df_anon_3['image_name'])

In [None]:
len(image_df_anon_3), len(image_df_test_anon)

In [None]:
# Stack both frames; `DataFrame.append` is no longer available in pandas >= 2.0.
# `ignore_index=True` gives every row its own label instead of repeating 0..n twice
image_df = pd.concat([image_df_anon_3, image_df_test_anon], ignore_index=True)
print(len(image_df))
image_df.head()

##### Solve empty views

In [None]:
# Images left without a view position
empty_view = image_df[image_df['view'] == '']
print(' '.join(empty_view['image_name']))
empty_view

In [None]:
# Images whose view position is 'RL'
rl_view = image_df[image_df['view'] == 'RL']
print(' '.join(rl_view['image_name']))
rl_view

In [None]:
Counter(image_df['view'])

##### Solve repeated images

In [None]:
# Image names that appear more than once, with their number of occurrences
name_counts = Counter(image_df['image_name'])
[(name, count) for name, count in name_counts.items() if count > 1]

##### Check same patients

In [None]:
image_df.loc[image_df['ID'].str.startswith('37')]

In [None]:
Counter(image_df['ID'])

#### Merge with labels

In [None]:
cols = ['ID', 'Fecha consulta SU', 'Resultado consenso BSTI']
# Take an explicit copy: columns are added to and edited in `labels_df` later on,
# which must neither touch `main_df` nor raise SettingWithCopyWarning
labels_df = main_df[cols].copy()
labels_df.head()

##### Fix: use str types for ease of merge

In [None]:
# Build a new frame instead of assigning columns into what may be a slice of
# `main_df`: 'date' is the visit date as text, and 'ID' is cast to text as well,
# so both keys can be compared with the ones of the image frame
labels_df = labels_df.assign(
    date=labels_df['Fecha consulta SU'].astype(str),
    ID=labels_df['ID'].astype(str),
)

labels_df['date'].dtypes

In [None]:
labels_df.loc[labels_df['ID'] == '37']

##### Patient 185 vs 185_2

The IDs `185` and `185_2` are swapped in `labels_df` with respect to the images, so they are exchanged below.

In [None]:
image_df.loc[image_df['ID'].str.startswith('185')]

In [None]:
labels_df.loc[labels_df['ID'].str.startswith('185')]

In [None]:
# Exchange the two IDs; the rows are addressed by hard-coded index labels
# NOTE(review): 184 and 524 are assumed to be the rows of '185' and '185_2' -- confirm against the cell above
for row_label, fixed_id in ((184, '185_2'), (524, '185')):
    labels_df.loc[row_label, 'ID'] = fixed_id

In [None]:
labels_df.loc[labels_df['ID'].str.startswith('185')]

##### Merge!

In [None]:
# Right join: every image row is kept, and gets its label when a row of
# `labels_df` has the same ID and date ('date' comes from 'Fecha consulta SU')
merge_keys = ['ID', 'date']
merged = labels_df.merge(image_df, how='right', on=merge_keys)
merged.head()

In [None]:
Counter(merged[LABEL_COL])

In [None]:
len(merged)

In [None]:
# Work on an explicit copy, so the relabeling below does not act on a slice of `merged`
clean_df = merged.loc[merged[LABEL_COL] != 'Indeterminado'].copy()

# Rename the labels in the label column only; 'Non-COVID' is left as is
# (mapping it to 'pneumonia' is disabled)
clean_df[LABEL_COL] = clean_df[LABEL_COL].replace({
    'Normal': 'normal',
    'Sugerente COVID': 'covid',
})
clean_df.tail()

In [None]:
len(clean_df)

In [None]:
clean_df.head()

##### Check NaN

In [None]:
# Rows without a label (no matching ID + date in the labels frame)
clean_df.loc[clean_df[LABEL_COL].isna()]

In [None]:
# Drop the rows with any missing value; rebinding avoids mutating a filtered
# slice in place
clean_df = clean_df.dropna(axis=0, how='any')

In [None]:
clean_df.head()

##### Check counts

In [None]:
Counter(clean_df.loc[clean_df['view'].str.contains('P')][LABEL_COL])

In [None]:
Counter(clean_df[LABEL_COL])

In [None]:
Counter(labels_df[LABEL_COL])

#### Save

In [None]:
metadata_path = os.path.join(DATASET_DIR, 'metadata.csv')
clean_df.to_csv(metadata_path)

### Read metadata

In [None]:
df = pd.read_csv(os.path.join(DATASET_DIR, 'metadata_anon_3.csv'), index_col=0)
df.head()

In [None]:
Counter(df[LABEL_COL])

In [None]:
# RESULTS ANON_3

### IMAGE SIZE 256

## FRONTAL ONLY

{'acc': 0.47306791569086654,                                                                                                        
         'cm': tensor([[  1,   1,   1],                                                                                                     
        [ 14,   9,   9],                                                                                                                    
        [145,  57, 190]]),                                                                                                                  
         'loss': 2.7440714836120605,                                                                                                        
         'prec_Non-COVID': 0.16279069767441862,                                                                                             
         'prec_covid': 0.005555555555555556,                                                                                                
         'prec_normal': 0.9509803921568627,                                                                                                 
         'recall_Non-COVID': 0.21875,                                                                                                       
         'recall_covid': 0.3333333333333333,                                                                                                
         'recall_normal': 0.49489795918367346,                                                                                              
         'spec_Non-COVID': 0.9088607594936708,                                                                                              
         'spec_covid': 0.5778301886792453,                                                                                                  
         'spec_normal': 0.7142857142857143}


## RESULTS ANON_3 FRONTAL + LATERAL
{'acc': 0.3447251114413076,
         'cm': tensor([[  1,   3,   0],
        [ 25,  16,   9],
        [253, 151, 215]]),
         'loss': 2.953134059906006,
         'prec_Non-COVID': 0.12396694214876033,
         'prec_covid': 0.003067484662576687,
         'prec_normal': 0.9557522123893806,
         'recall_Non-COVID': 0.3,
         'recall_covid': 0.25,
         'recall_normal': 0.34894991922455576,
         'spec_Non-COVID': 0.8298555377207063,
         'spec_covid': 0.5142002989536621,
         'spec_normal': 0.8148148148148148}

In [None]:
### IMAGE SIZE 512

### FRONTAL ONLY
{'acc': 0.3185011709601874,
         'cm': tensor([[  3,   0,   0],
        [ 20,   6,   6],
        [233,  28, 131]]),
         'loss': 3.2618606090545654,
         'prec_Non-COVID': 0.21739130434782608,
         'prec_covid': 0.011111111111111112,
         'prec_normal': 0.9552238805970149,
         'recall_Non-COVID': 0.15625,
         'recall_covid': 1.0,
         'recall_normal': 0.32653061224489793,
         'spec_Non-COVID': 0.9544303797468354,
         'spec_covid': 0.37028301886792453,
         'spec_normal': 0.8285714285714286}

## FRONTAL + LATERAL
{'acc': 0.24071322436849926,                                                                                                        
         'cm': tensor([[  2,   2,   0],                                                                                                     
        [ 24,  21,   5],                                                                                                                    
        [303, 160, 156]]),                                                                                                                  
         'loss': 5.56564998626709,                                                                                                          
         'prec_Non-COVID': 0.11875,                                                                                                         
         'prec_covid': 0.005449591280653951,                                                                                                
         'prec_normal': 0.9657534246575342,                                                                                                 
         'recall_Non-COVID': 0.38,                                                                                                          
         'recall_covid': 0.5,                                                                                                               
         'recall_normal': 0.22778675282714056,                                                                                              
         'spec_Non-COVID': 0.7736757624398074,                                                                                              
         'spec_covid': 0.45440956651718983,                                                                                                 
         'spec_normal': 0.9074074074074074}