## Imports

In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt
from torchvision import transforms
import torch
from tqdm.notebook import tqdm
import pandas as pd

In [None]:
%run ../covid_x.py

## Calculate mean and std

In [None]:
%run ../covid_x.py
%run ../../utils/images.py

In [None]:
# Read the image names listed in the further-train split file, one per line.
images_dir = os.path.join(DATASET_DIR, 'train')
with open(os.path.join(DATASET_DIR, 'further_train_split.txt')) as f:
    train_images = [name.strip() for name in f]
len(train_images)

In [None]:
# Mean and std over the images listed in train_images.
# compute_mean_std and ImageFolderIterator are not defined in this notebook;
# presumably they come from the scripts loaded with %run above.
mean, std = compute_mean_std(ImageFolderIterator(images_dir, train_images), show=True)
mean, std

## Load labels

In [None]:
# Load the space-separated label file. It is read without a header row,
# so the column names are supplied explicitly.
labels_path = os.path.join(DATASET_DIR, 'train_split.txt')
columns = ['patient_id', 'image_name', 'label', 'source']

df = pd.read_csv(labels_path, sep=' ', header=None, names=columns)
df.head()

In [None]:
# Distinct values found in the 'label' column.
set(df['label'])

In [None]:
# Rename the 'COVID-19' value to 'covid', in place.
# Note: DataFrame.replace is applied to every column, not only 'label'.
df.replace('COVID-19', 'covid', inplace=True)
df.head()

In [None]:
# Scratch check: set equality does not depend on element order.
set(['1', 'a']) == set(['a', '1'])

In [None]:
# Index labels of the rows whose label is not 'covid'.
df[df['label'] != 'covid'].index

## Split train-val

In [None]:
import random

In [None]:
%run covid_x.py

In [None]:
# Instantiate the train and test datasets and show their sizes.
train_dataset = CovidXDataset('train')
test_dataset = CovidXDataset('test')
len(train_dataset), len(test_dataset)

In [None]:
# Number of (non-null) image names per label in the train metadata.
train_dataset._metadata_df.groupby('label').count()['image_name']

In [None]:
# Number of (non-null) image names per label in the test metadata.
test_dataset._metadata_df.groupby('label').count()['image_name']

In [None]:
# Scratch arithmetic: integer ratio of (13892 - 468) to 468.
# Presumably non-covid vs. covid image counts in the train file; TODO confirm.
(13892-468) // 468

In [None]:
def split_train_val(df, split=0.1, seed=None):
    """Pick the images that form the validation split.

    Images are drawn one whole patient at a time, separately for each label,
    until at least ``int(n_label_images * split)`` images of that label have
    been chosen. This keeps the label distribution approximately intact and
    never divides the images a patient has under one label between train and
    validation.

    Patients are grouped within each label, so a patient whose images carry
    more than one label can still end up on both sides of the split.

    Args:
        df: DataFrame with 'patient_id', 'image_name' and 'label' columns.
        split: Fraction of each label's images to select.
        seed: Optional seed for a private random generator, which makes the
            split reproducible. When None, the module-level ``random`` state
            is used.

    Returns:
        List with the names of the chosen images.
    """
    rng = random if seed is None else random.Random(seed)
    images_chosen = []

    # unique() yields labels in order of first appearance, so the iteration
    # order depends only on the data (a set of strings has no stable order
    # across interpreter runs).
    for label in df['label'].unique():
        # Filter only this label
        df_with_label = df[df['label'] == label]

        # Group images by patient
        images_by_patient = df_with_label.groupby('patient_id')['image_name'].apply(list)

        # Calculate split length
        split_len = int(len(df_with_label) * split)

        # Shuffling once and walking the result is equivalent to repeatedly
        # drawing a random patient without replacement, but avoids an O(n)
        # list.remove per draw and cannot run out of patients to draw from.
        patients = list(images_by_patient.index)
        rng.shuffle(patients)

        n_chosen = 0
        for patient in patients:
            if n_chosen >= split_len:
                break

            # Patient has 1 or more images; all of them are taken together
            images_from_patient = images_by_patient[patient]
            n_chosen += len(images_from_patient)
            images_chosen.extend(images_from_patient)

    return images_chosen

In [None]:
# Reload the space-separated label file (no header row; column names supplied here).
labels_fpath = os.path.join(DATASET_DIR, 'train_split.txt')
columns = ['patient_id', 'image_name', 'label', 'source']
df = pd.read_csv(labels_fpath, sep=' ', header=None, names=columns)
df.head()

In [None]:
# Choose the validation images (split=0.1 per label); the rest stay in train.
val_images = split_train_val(df, split=0.1)

# Membership tests against a set are O(1). Testing against the list made this
# comprehension quadratic in the number of images.
val_images_set = set(val_images)
train_images = [
    name for name in train_dataset._metadata_df['image_name']
    if name not in val_images_set
]

# Sanity check: the two lists together account for every row of the label file.
assert len(df) == len(train_images) + len(val_images)

len(train_images), len(val_images)

### Save split to files

In [None]:
def write_to_txt(arr, fname, sep='\n'):
    """Write every item of `arr` to the file `fname`, each followed by `sep`.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(fname, 'w') as out_file:
        out_file.writelines(item + sep for item in arr)

In [None]:
# Persist the train image names, one per line.
write_to_txt(train_images, os.path.join(DATASET_DIR, 'further_train_split.txt'))

In [None]:
# Persist the validation image names, one per line.
write_to_txt(val_images, os.path.join(DATASET_DIR, 'further_val_split.txt'))

## Debug `CovidXDataset` class

In [None]:
%run covid_x.py

In [None]:
# Instantiate the three dataset splits and show their sizes.
train_dataset = CovidXDataset('train')
val_dataset = CovidXDataset('val')
test_dataset = CovidXDataset('test')
len(train_dataset), len(val_dataset), len(test_dataset)

In [None]:
# Distinct patient ids present in each split, and how many there are.
train_patients = set(train_dataset._metadata_df['patient_id'])
val_patients = set(val_dataset._metadata_df['patient_id'])
test_patients = set(test_dataset._metadata_df['patient_id'])
len(train_patients), len(val_patients), len(test_patients)

In [None]:
# Pairwise patient overlap between splits; an empty set means the two splits
# share no patient id.
train_patients.intersection(test_patients), \
train_patients.intersection(val_patients), \
val_patients.intersection(test_patients)

In [None]:
def get_dataset_distribution(dataset):
    """Summarize how the labels are distributed in `dataset`.

    Reads `dataset._metadata_df`, counts the non-null 'patient_id' entries
    per label, and returns a DataFrame indexed by label with two columns:
    'counts' and 'percentage' (share of the summed counts, in percent).
    """
    per_label = dataset._metadata_df.groupby('label').count()
    per_label = per_label.rename(columns={'patient_id': 'counts'})

    n_per_label = per_label['counts'].to_numpy()
    per_label['percentage'] = n_per_label / sum(n_per_label) * 100

    return per_label[['counts', 'percentage']]

In [None]:
# Label counts and percentages for the train split.
get_dataset_distribution(train_dataset)

In [None]:
# Label counts and percentages for the validation split.
get_dataset_distribution(val_dataset)

In [None]:
# Label counts and percentages for the test split.
get_dataset_distribution(test_dataset)

In [None]:
# Scratch sum of three class counts. Presumably the test-set size; confirm
# against get_dataset_distribution(test_dataset).
100 + 885 + 594

In [None]:
# Per-class image totals, each written as a sum of three hand-copied counts.
# NOTE(review): assumes the terms are train + val + test; confirm against
# get_dataset_distribution for each split.
# The third (test) term of `normal` was 594, a repeat of the pneumonia test
# count. With 885 the test terms add up to the 100 + 885 + 594 computed in the
# neighbouring cells, and `total` equals 13892 + 1579.
covid = 421 + 47 + 100
pneum = 4913 + 545 + 594
normal = 7170 + 796 + 885
total = covid + pneum + normal
total, covid, pneum, normal

In [None]:
# Share of each class (covid, pneum, normal) in the overall total, in percent.
tuple(count / total * 100 for count in (covid, pneum, normal))

In [None]:
# Scratch ratios of two class counts to 421. Presumably pneumonia/covid and
# normal/covid imbalance in the train split; TODO confirm.
4913 / 421, 7170 / 421

In [None]:
# Scratch sum of three class counts. Presumably the test-set size; confirm
# against get_dataset_distribution(test_dataset).
100 + 885 + 594