# About

### This utility notebook downloads a zipped subset of the original [`prostate-cancer-grade-assessment`](https://www.kaggle.com/competitions/prostate-cancer-grade-assessment) dataset, sized according to the user's input. The dataset is hosted on Kaggle, so this notebook must be run in the Kaggle environment.

#### Steps to get started:
1. Click on this link [(click here)](https://www.kaggle.com/competitions/prostate-cancer-grade-assessment) and navigate to the code section.
2. Click on `New Notebook` and upload this notebook.
3. Run all the cells.
4. After a successful run click on the link saying `Download zip File`.

> Your subset will be packed in a zip file

# Import Dependencies

In [1]:
import os
import shutil
import glob

import pandas as pd
import numpy as np

### Define Constants

In [2]:
# --- Input: competition data as laid out in the Kaggle environment ---
BASE_DIR = "../input/prostate-cancer-grade-assessment"
IMAGES_DIR = os.path.join(BASE_DIR, "train_images")      # slide images (<image_id>.tiff)
MASK_DIR = os.path.join(BASE_DIR, "train_label_masks")   # label masks (<image_id>_mask.tiff)

# --- Output: staging folders whose contents end up in the zip archive ---
__output_base = "./subset"
__train_images_subset = f"{__output_base}/train_images_subset"
__train_masks_subset = f"{__output_base}/train_masks_subset"

### The utility function below is needed because the authors of this dataset have not provided masks for all the TIFF slide images. It keeps only the slides that have a mask, so that later steps do not raise a `FileNotFoundError`.

In [3]:
def create_mask_safe_df(base_dir=None) -> pd.DataFrame:
    """Returns a pandas DataFrame containing only the slides that have a mask file on disk

    The mask directory is listed, every `<image_id>_mask.tiff` file is mapped back to its
    `image_id`, and the result is inner-joined with `train.csv`. A row is therefore kept
    only when the slide is listed in `train.csv` AND a mask file exists for it.

    Args:
        base_dir: str, optional. Root directory of the dataset; it must contain `train.csv`
        and a `train_label_masks` folder. When omitted, the notebook-level `BASE_DIR`
        constant is used.

    Returns:
        mask_safe_df: pandas.DataFrame. The matching rows of `train.csv` (in their original
        order) with one extra column, `mask_file_name`, and a fresh 0..n-1 index.
    """

    if base_dir is None:
        base_dir = BASE_DIR  # resolved lazily so an explicit `base_dir` needs no globals

    mask_suffix = '_mask.tiff'

    train_df = pd.read_csv(os.path.join(base_dir, 'train.csv'))  # read the original csv file

    # Keep only real mask files; anything else in the folder would yield a bogus image_id.
    mask_files = sorted(
        name for name in os.listdir(os.path.join(base_dir, 'train_label_masks'))
        if name.endswith(mask_suffix)
    )
    masks_df = pd.DataFrame({
        'mask_file_name': mask_files,
        'image_id': [name[:-len(mask_suffix)] for name in mask_files],  # strip '_mask.tiff'
    })

    # Inner join: drops slides without a mask and masks without a train.csv row.
    mask_safe_df = pd.merge(train_df, masks_df, on='image_id', how='inner')

    return mask_safe_df.reset_index(drop=True)

# Build the table once at notebook level; it is passed to `get_subset` further down.
mask_safe_df = create_mask_safe_df()

In [4]:
def _copy_random_slides(id_pools, budget_bytes, rng):
    """Copies randomly drawn slide/mask pairs into the subset folders until the budget is hit

    One slide is drawn per turn, cycling through `id_pools` in order. Copying stops as soon
    as adding the next image + mask would reach `budget_bytes` (that pair is NOT copied), or
    when every pool has been emptied.

    Args:
        id_pools: list of lists of image ids. One pool per class for a balanced draw, or a
        single pool for a purely random draw. The input lists are not modified.
        budget_bytes: float. Size budget in bytes for images and masks together.
        rng: object exposing `randint(low, high)`, i.e. the `numpy.random` module or a
        `numpy.random.RandomState` instance.

    Returns:
        list of the image ids that were copied, in copy order.
    """
    pools = [list(pool) for pool in id_pools if len(pool) > 0]  # private copies, no empties
    copied_ids = []
    used_bytes = 0
    turn = 0

    while pools:
        pool = pools[turn]
        image_id = pool.pop(rng.randint(0, len(pool)))  # draw without replacement

        img = f'{IMAGES_DIR}/{image_id}.tiff'
        mask = f'{MASK_DIR}/{image_id}_mask.tiff'

        used_bytes += os.path.getsize(img) + os.path.getsize(mask)
        if used_bytes >= budget_bytes:
            break

        shutil.copy2(img, __train_images_subset)
        shutil.copy2(mask, __train_masks_subset)
        copied_ids.append(image_id)

        if pool:
            turn = (turn + 1) % len(pools)
        else:
            # Pool exhausted: drop it; the next pool slides into the same position.
            del pools[turn]
            turn = turn % len(pools) if pools else 0

    return copied_ids


def get_subset(subset_size: float, df: pd.DataFrame, output_filename: str, mode='balanced', seed=None):
    """Generates a subset of the original dataset and packs it in a zip file

    Slides are drawn at random (without replacement) and copied, together with their masks,
    into the staging folder until the size budget would be reached. A `train_subset.csv`
    describing the copied slides is written next to them and the folder is zipped.
    Any staging folder left over from a previous call is deleted first, so the archive only
    ever contains the files listed in `train_subset.csv`.

    Args:
        subset_size: float. Size of the desired subset of the original data in GigaBytes(GB).
        The copied images and masks together stay below this size.
        df: pandas.DataFrame. A dataframe that contains raw information about the images and
        their masks. Must have the columns `image_id` and (for 'balanced') `isup_grade`.
        output_filename: str. filename of the final zipped subset (without '.zip' extension).
        mode: One of {"balanced", "random"}. Default is "balanced". Represents the desired
        distribution of all the classes in the subset.
        - 'balanced': slides are drawn round-robin over the `isup_grade` classes; a class that
          runs out of slides is skipped from then on.
        - 'random': Class distribution will be random.
        seed: int, optional. Seed for the random draw. When omitted, numpy's global random
        state is used, as before.

    Raises:
        ValueError: if `mode` is not one of the supported values.
        OSError: if the argument `subset_size` is set greater than the total size of the
        slide images listed in `df`.
    """

    if mode not in ('balanced', 'random'):
        raise ValueError(f"mode should be one of 'balanced' or 'random', got {mode!r}")

    budget_bytes = subset_size * 1e+9

    # Feasibility check: stop scanning as soon as the images alone cover the requested size.
    size = 0
    for image_id in df['image_id'].values:
        size += os.path.getsize(f'{IMAGES_DIR}/{image_id}.tiff')
        if budget_bytes <= size:
            break
    else:
        raise OSError(f"size of the subset should be less than or equal to {size / 1e+9} GB") # 370.181554234 GB == size / 1e+9

    # Start from an empty staging folder so stale files from an earlier run are not zipped.
    if os.path.isdir(__output_base):
        shutil.rmtree(__output_base)
    for folder in (__output_base, __train_images_subset, __train_masks_subset):
        os.makedirs(folder, exist_ok=True)

    print(f'Making a {mode} subset')

    if mode == 'balanced':
        # One pool per grade that actually occurs in `df` (groupby sorts the grades).
        id_pools = [ids.tolist() for _, ids in df.groupby('isup_grade')['image_id']]
    else:
        id_pools = [df['image_id'].tolist()]

    rng = np.random if seed is None else np.random.RandomState(seed)
    filenames = _copy_random_slides(id_pools, budget_bytes, rng)

    # Save the metadata rows of the copied slides, in copy order.
    sliced_df_save = df.set_index('image_id', drop=False).loc[filenames]
    sliced_df_save.to_csv(os.path.join(__output_base, 'train_subset.csv'), index=False, encoding='utf-8')

    shutil.make_archive(output_filename, 'zip', __output_base)

    print(f'{output_filename}.zip file created with {len(filenames)} '
            'tiff files and corresponding masks')

In [5]:
# Request a ~0.1 GB randomly drawn subset and pack it into PANDA_subset.zip
get_subset(
    subset_size=0.1,
    df=mask_safe_df,
    output_filename='PANDA_subset',
    mode='random',
)

Making a random subset
PANDA_subset.zip file created with 1 tiff files and corresponding masks


Click this link below to download the zipfile

<a href="./PANDA_subset.zip"> Download zip File </a>

In [6]:
# Optional cleanup: uncomment the two lines below to delete the staging folder and the
# generated archive (only do this after the zip file has been downloaded).
# shutil.rmtree("./subset")
# os.remove("./PANDA_subset.zip")