In [None]:
# Show the interpreter version of the running kernel
import sys
sys.version_info

In [None]:
import os
import pandas as pd
import torch
from torchvision import transforms
from tqdm.notebook import tqdm
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
# Execute the dataset script inside this notebook's namespace.
# Later cells use DATASET_DIR without defining it, so it presumably comes
# from this script — TODO confirm
%run ../covid_uc.py

### Calculate mean and std

In [None]:
# Execute the image utilities script inside this notebook's namespace.
# compute_mean_std and ImageFolderIterator are used below without being
# defined here, so they presumably come from this script — TODO confirm
%run ../../utils/images.py

In [None]:
# Load the per-image metadata table; the first CSV column becomes the index
labels_fpath = os.path.join(DATASET_DIR, 'metadata.csv')
metadata = pd.read_csv(labels_fpath, index_col=0)
# Preview the first rows only
metadata.head()

In [None]:
def get_image_names(frontal_only=False, metadata_df=None):
    """Return the image names listed in the metadata table.

    Args:
        frontal_only: if True, keep only the rows whose 'view' value
            contains the letter 'P'; otherwise keep every row.
        metadata_df: table with 'image_name' and 'view' columns. Defaults to
            the notebook-level `metadata` DataFrame when omitted.

    Returns:
        A list with the values of the 'image_name' column, in table order.
    """
    df = metadata if metadata_df is None else metadata_df
    if frontal_only:
        # na=False: rows with a missing view are treated as non-matching.
        # Without it the mask contains NaN and .loc refuses to index with it.
        has_p_view = df['view'].str.contains('P', na=False)
        df = df.loc[has_p_view]
    return list(df['image_name'])

In [None]:
# Build both name lists: every image, and the frontal-only subset
image_names_all = get_image_names(frontal_only=False)
image_names_frontal = get_image_names(frontal_only=True)
len(image_names_frontal), len(image_names_all)

In [None]:
# Folder with the image files, located under the dataset root
images_dir = os.path.join(DATASET_DIR, 'images')

In [None]:
# Statistics over the frontal-only subset of images
frontal_iterator = ImageFolderIterator(images_dir, image_names_frontal)
mean, std = compute_mean_std(frontal_iterator, show=True)
mean, std

In [None]:
# Statistics over every image; this overwrites `mean` and `std`
all_iterator = ImageFolderIterator(images_dir, image_names_all)
mean, std = compute_mean_std(all_iterator, show=True)
mean, std

### Test `CovidUCDataset` class

In [None]:
# Re-execute the dataset script; CovidUCDataset is used in the next cell
# without being defined here, so it presumably comes from this script —
# TODO confirm
%run ../covid_uc.py

In [None]:
# Instantiate the dataset for the 'test' split.
# NOTE(review): argument names suggest 256x256 images restricted to frontal
# views; the class is defined outside this notebook — confirm there
dataset = CovidUCDataset('test', image_size=(256, 256), frontal_only=True)
len(dataset)

In [None]:
# Inspect the dataset's internal metadata.
# NOTE(review): this reads a private attribute and displays the whole object;
# if it is a large DataFrame, consider showing only `.head()`
dataset._metadata_df

In [None]:
# Fetch only the first (image, label) pair.
# next(iter(...)) walks the dataset the same way the for-loop would, but it
# raises on an empty dataset instead of silently leaving `image` / `label`
# undefined (or stale from an earlier run of this cell).
image, label = next(iter(dataset))

In [None]:
# Label of the first sample fetched above
label

In [None]:
# Size of the first sample's image (presumably a torch tensor — TODO confirm)
image.size()

In [None]:
# Labels exposed by the dataset object
dataset.labels

In [None]:
# Query the dataset for the 'Non-COVID' label.
# NOTE(review): the return format is defined by CovidUCDataset, outside this
# notebook — confirm there
dataset.get_labels_presence_for('Non-COVID')

### Split

In [None]:
# Re-execute the dataset script before the split section; DATASET_DIR is
# used below and presumably comes from this script — TODO confirm
%run ../covid_uc.py

In [None]:
import random
import os
from collections import Counter

In [None]:
# Metadata column used below to count labels per split
# (Spanish for "BSTI consensus result")
LABEL_COL = 'Resultado consenso BSTI'

In [None]:
def save_list(items, name):
    """Write `items` to `<DATASET_DIR>/<name>.txt`, one item per line.

    Args:
        items: iterable of values; each is formatted with an f-string and
            followed by a newline.
        name: file name without extension.

    Prints the path of the written file.
    """
    filepath = os.path.join(DATASET_DIR, f'{name}.txt')
    with open(filepath, 'w') as f:
        f.writelines(f'{item}\n' for item in items)
    print(f'List saved to: {filepath}')

In [None]:
# Load the metadata table for the split; the first CSV column becomes the index
labels_fpath = os.path.join(DATASET_DIR, 'metadata.csv')
metadata = pd.read_csv(labels_fpath, index_col=0)
# Preview the first rows only
metadata.head()

In [None]:
# Series indexed by patient 'ID'; each value is the list of that patient's
# image names. Used to split by patient rather than by image.
images_by_patient = metadata.groupby('ID')['image_name'].apply(list)
images_by_patient

In [None]:
# Unique patient IDs in order of first appearance in the metadata table.
# Unlike iterating a set, this order is identical on every run, which is a
# prerequisite for a reproducible split.
patients = metadata['ID'].unique().tolist()
len(patients)

In [None]:
# Fractions of patients (not images) reserved for each held-out split.
# With VAL_SPLIT = 0 no patient goes to validation.
VAL_SPLIT = 0
TEST_SPLIT = 0.1

In [None]:
# Number of patients per held-out split; int() truncates, so fractional
# counts are rounded down
n_val = int(VAL_SPLIT * len(patients))
n_test = int(TEST_SPLIT * len(patients))
n_val, n_test

In [None]:
# Draw the held-out patients with a dedicated, seeded generator so that
# re-running this cell (or the whole notebook) always yields the same split.
# NOTE: changing SPLIT_SEED produces a different split from any lists saved
# earlier.
SPLIT_SEED = 1234
# Sort first so the draw does not depend on the order in which `patients`
# happens to be listed; key=str keeps the sort valid for any ID type.
val_test_patients = random.Random(SPLIT_SEED).sample(
    sorted(patients, key=str),
    n_val + n_test,
)
len(val_test_patients)

In [None]:
# Every patient that was not held out goes to train.
# Membership is checked against a set: looking each patient up in the list
# would make this step quadratic in the number of patients.
held_out_patients = set(val_test_patients)
train_patients = [pat for pat in patients if pat not in held_out_patients]
len(train_patients)

In [None]:
def combine_images(pats):
    """Return one flat list with the image names of every patient in `pats`.

    Images keep the order of `pats`, then the per-patient order stored in
    `images_by_patient`. Built with a single comprehension instead of
    `sum(lists, [])`, which copies the accumulated list at every step.
    """
    return [image_name for pat in pats for image_name in images_by_patient[pat]]


def count_images(pats):
    """Return the total number of images that belong to the patients in `pats`."""
    return sum(len(images_by_patient[pat]) for pat in pats)

In [None]:
# Number of images in train vs. held-out (val + test) patients
count_images(train_patients), count_images(val_test_patients)

In [None]:
# Flatten the per-patient image lists of each patient group
train_images = combine_images(train_patients)
val_test_images = combine_images(val_test_patients)

In [None]:
def filter_meta(images):
    """Return the rows of `metadata` whose 'image_name' is one of `images`."""
    is_selected = metadata['image_name'].isin(images)
    return metadata.loc[is_selected]

In [None]:
# Metadata rows for the train images and for the held-out (val + test) images
train_df = filter_meta(train_images)
val_test_df = filter_meta(val_test_images)
# train_df

In [None]:
# Number of train rows per value of the label column
Counter(train_df[LABEL_COL])

In [None]:
# Percentage of train rows per label, computed from the data itself.
# Hard-coding counts copied from a previous output goes stale as soon as the
# split or the metadata changes.
train_label_counts = Counter(train_df[LABEL_COL])
total = sum(train_label_counts.values())
# Guard against an empty train split to avoid dividing by zero
mult = 100 / total if total else 0.0
{name: count * mult for name, count in train_label_counts.items()}

In [None]:
# Number of held-out (val + test) rows per value of the label column
Counter(val_test_df[LABEL_COL])

In [None]:
# Percentage of held-out (val + test) rows per label, computed from the data
# itself rather than from counts copied out of a previous run.
val_test_label_counts = Counter(val_test_df[LABEL_COL])
val_test_total = sum(val_test_label_counts.values())
{
    name: 100 * count / val_test_total
    for name, count in val_test_label_counts.items()
}

In [None]:
# Number of images in the train and held-out (val + test) lists
len(train_images), len(val_test_images)

In [None]:
# Divide the held-out set at PATIENT level, then collect each group's images.
# n_val counts patients, so using it to slice the flat image list would give
# validation the wrong number of images and could place one patient's images
# in both val and test.
val_patients = val_test_patients[:n_val]
test_patients = val_test_patients[n_val:]
val_images = combine_images(val_patients)
test_images = combine_images(test_patients)
len(val_images), len(test_images)

In [None]:
# Write one text file per split under the dataset directory
for split_name, split_images in (
    ('train', train_images),
    ('val', val_images),
    ('test', test_images),
):
    save_list(split_images, split_name)