In [2]:
# install some libraries if not installed already
# (pandas and seaborn are pinned to exact versions; scikit-image is upgraded to the
# newest available release; imgaug and pydicom are requested without a version constraint)
!pip install --upgrade scikit-image 
!pip install imgaug
!pip install pandas==0.23.4
!pip install pydicom
!pip install seaborn==0.9.0

Requirement already up-to-date: scikit-image in /opt/conda/lib/python3.6/site-packages (0.14.2)
Collecting seaborn
[?25l  Downloading https://files.pythonhosted.org/packages/a8/76/220ba4420459d9c4c9c9587c6ce607bf56c25b3d3d2de62056efe482dadc/seaborn-0.9.0-py3-none-any.whl (208kB)
[K    100% |████████████████████████████████| 215kB 41.5MB/s ta 0:00:01
Installing collected packages: seaborn
Successfully installed seaborn-0.9.0


In [3]:
# import some libraries we will use
import os, sys, glob, csv, random
import pydicom

import matplotlib.pyplot as plt
import numpy as np

from skimage import io

import seaborn as sns
import pandas as pd

In [6]:
import Image

ModuleNotFoundError: No module named 'Image'

In [None]:
# MNIST handwritten digits

In [None]:
# get the dataset if you don't have it already or on Colab - 1
# clones the mnist_png repository into a new 'mnist_png' folder under the current directory
!git clone https://github.com/myleott/mnist_png.git

In [None]:
# get the dataset if you don't have it already or on Colab - 2
# NOTE: this changes the kernel's working directory, so every relative path used in
# later cells is resolved from inside 'mnist_png'. The path is relative, so running
# the cell a second time would try to enter a nested 'mnist_png' folder instead.
os.chdir('mnist_png')

In [None]:
# get the dataset if you don't have it already or on Colab - 3
# extract the gzipped archive into the current directory, then list the directory to check the result
!tar -xzf mnist_png.tar.gz
!ls

In [None]:
# Folder that contains the MNIST 'training' sub-folder; a relative value is resolved
# against the current working directory.
mnist_base_dir_tr = 'mnist_png' # or point to the appropriate directory if you're on Biowulf and already have the data

In [None]:
# collect the PNG paths of the training set, grouped by digit: {digit: [path, ...]}
mnist_digits = {
    digit: glob.glob(os.path.join(mnist_base_dir_tr, 'training', str(digit), '*.png'))
    for digit in range(10)
}

In [None]:
# bar chart with one bar per digit showing how many training images that digit has
sns.barplot(
    x=list(range(10)),
    y=[len(paths) for paths in mnist_digits.values()],
)

In [None]:
# Exercise #1: Get the total number of instances (digits 0-9) in the training set

In [None]:
# Exercise #2: Do the same (visualize number of class instances, get total number of instances) for the testing set

In [None]:
# Draw 1000 random training images for each digit and store them, flattened to
# 28*28 values per row, so the mean and std of the sample can be computed below
digits_sample_1000 = np.empty((10 * 1000, 28 * 28))
idx = 0
for num in range(10):
    digit_paths = mnist_digits[num]
    # random.sample picks 1000 distinct positions within this digit's path list
    for pick in random.sample(range(len(digit_paths)), 1000):
        digits_sample_1000[idx] = io.imread(digit_paths[pick]).flatten()
        idx += 1

In [None]:
# overall mean and std of the raw sample, rounded to two decimals
print(
    'mean:', round(digits_sample_1000.mean(), 2),
    'std:', round(digits_sample_1000.std(), 2),
)

In [None]:
# standardise the sample: subtract its overall mean and divide by its overall std
digits_sample_1000_norm = (
    (digits_sample_1000 - digits_sample_1000.mean()) / digits_sample_1000.std()
)

In [None]:
# mean and std of the standardised sample, rounded to two decimals
print(
    'mean:', round(digits_sample_1000_norm.mean(), 2),
    'std:', round(digits_sample_1000_norm.std(), 2),
)

In [None]:
# chest x-ray

In [None]:
# Download and unzip pneumonia detection data if you're on Colab or don't have it already on Biowulf - 1
# Resolve the data folder to an absolute path *before* changing into it, so that
# ROOT_DIR keeps pointing at the same folder whatever the working directory is
# when later cells join paths onto it.
ROOT_DIR = os.path.abspath('../pn_chest_xray')
os.makedirs(ROOT_DIR, exist_ok=True)  # no error when the folder is already there
os.chdir(ROOT_DIR)

In [None]:
# Download and unzip pneumonia detection data if you're on Colab or don't have it already on Biowulf - 2
# Saves the Google Drive download as stage_1_train_images.zip in the current directory.
# NOTE(review): --no-check-certificate turns off TLS certificate validation - confirm it is still required.
!wget --no-check-certificate -r 'https://docs.google.com/uc?export=download&id=1ertt7A696f8HC0qk-yqmSoTZKCsp2XWU' -O stage_1_train_images.zip

In [None]:
# Download and unzip pneumonia detection data if you're on Colab or don't have it already on Biowulf - 3
# Saves the Google Drive download as stage_1_test_images.zip in the current directory.
# NOTE(review): --no-check-certificate turns off TLS certificate validation - confirm it is still required.
!wget --no-check-certificate -r 'https://docs.google.com/uc?export=download&id=1ppgqzDVa9hQYbxgjV_k7AUUOKTw7ca9Q' -O stage_1_test_images.zip

In [None]:
# Download and unzip pneumonia detection data if you're on Colab or don't have it already on Biowulf - 4
# Saves the Google Drive download as stage_1_train_labels.csv.zip in the current directory.
# NOTE(review): --no-check-certificate turns off TLS certificate validation - confirm it is still required.
!wget --no-check-certificate -r 'https://docs.google.com/uc?export=download&id=1yI8jJw9DGMeZeMgHaoZBUeuebt11qe0v' -O stage_1_train_labels.csv.zip

In [None]:
# Download and unzip pneumonia detection data if you're on Colab or don't have it already on Biowulf - 5
# Saves the Google Drive download as stage_1_detailed_class_info.csv.zip in the current directory.
# NOTE(review): --no-check-certificate turns off TLS certificate validation - confirm it is still required.
!wget --no-check-certificate -r 'https://docs.google.com/uc?export=download&id=1lpa5bsCKOgesx1zP7h5UIDwN4FzvjooD' -O stage_1_detailed_class_info.csv.zip

In [None]:
# Download and unzip pneumonia detection data if you're on Colab or don't have it already on Biowulf - 6
# unzipping takes a few minutes
# The two image archives are extracted into their own folders (-d <folder>);
# the two CSV archives are extracted straight into the current directory.
# -q suppresses unzip's per-file output and -o overwrites existing files without prompting.
!unzip -q -o stage_1_test_images.zip -d stage_1_test_images
!unzip -q -o stage_1_train_images.zip -d stage_1_train_images
!unzip -q -o stage_1_train_labels.csv.zip
!unzip -q -o stage_1_detailed_class_info.csv.zip

In [None]:
# Download and unzip pneumonia detection data if you're on Colab or don't have it already on Biowulf - 7
# fix some permission issues
# (mode 644 = read/write for the owner, read-only for group and others)
!chmod 644 stage_1_detailed_class_info.csv
!chmod 644 stage_1_train_labels.csv

In [None]:
# Helper function to load the chest x-ray pneumonia dataset
def load_cxray_list(ROOT_DIR, stage_num, test_only_part, seed=None):
    """Build shuffled train/validation (image path, label) lists for the pneumonia data.

    Reads ``stage_<stage_num>_train_labels.csv`` and
    ``stage_<stage_num>_detailed_class_info.csv`` from ``ROOT_DIR``, pairs every
    label row with the matching ``*.dcm`` file found in
    ``stage_<stage_num>_train_images``, shuffles the rows and splits them 80/20.

    Args:
        ROOT_DIR: folder holding the two CSV files and the training image folder.
        stage_num: stage identifier as a string (e.g. ``'1'``); it is concatenated
            into the file and folder names.
        test_only_part: if true, only the first 1000 label rows are used.
        seed: optional seed for the shuffle. ``None`` (the default) draws from
            NumPy's global random state, as the function always did; any other
            value gives a reproducible split.

    Returns:
        A tuple ``(image_paths_train, labels_train, image_paths_val, labels_val)``
        of four lists. Labels are 0 for 'Normal', 1 for 'Lung Opacity' and
        2 for 'No Lung Opacity / Not Normal'.

    Raises:
        AssertionError: if a patient id in the label table has no image file.
        KeyError: if the class column holds a value other than the three above.
    """
    train_dicom_dir = os.path.join(ROOT_DIR, 'stage_' + stage_num + '_train_images')
    bbox_path = os.path.join(ROOT_DIR, 'stage_' + stage_num + '_train_labels.csv')
    det_class_path = os.path.join(ROOT_DIR, 'stage_' + stage_num + '_detailed_class_info.csv')

    # make DataFrame with images
    det_class_df = pd.read_csv(det_class_path)
    bbox_df = pd.read_csv(bbox_path)
    # The two tables are glued side by side on their row index. `axis` is passed
    # by keyword because pandas 2.x no longer accepts it positionally.
    # NOTE(review): this relies on both CSVs listing their rows in the same order - confirm.
    comb_bbox_df = pd.concat([bbox_df, det_class_df.drop('patientId', axis=1)], axis=1)

    image_df = pd.DataFrame({'path': glob.glob(os.path.join(train_dicom_dir, '*.dcm'))})
    image_df['patientId'] = image_df['path'].map(lambda x: os.path.splitext(os.path.basename(x))[0])

    if test_only_part:
        comb_bbox_df = comb_bbox_df.iloc[:1000]
        # Keep exactly one row per image. Without drop_duplicates a patient with
        # several label rows appears several times here and is then multiplied
        # again by each of the two merges further down.
        image_df = comb_bbox_df.merge(image_df)[['path', 'patientId']].drop_duplicates()

    print(image_df.shape[0], 'images found')
    img_pat_ids = set(image_df['patientId'].values.tolist())
    box_pat_ids = set(comb_bbox_df['patientId'].values.tolist())
    # check to make sure there is no funny business: every labelled patient must
    # have an image (images without a label row are tolerated)
    assert box_pat_ids <= img_pat_ids, "Patient IDs should be the same"

    # get data statistics
    # NOTE(review): these DICOM tags are merged into the frame below but are not
    # part of the returned lists; the header read is kept so behaviour is unchanged.
    DCM_TAG_LIST = ['PatientAge', 'BodyPartExamined', 'ViewPosition', 'PatientSex']

    def get_tags(in_path):
        # dcmread is the current name of the deprecated pydicom.read_file alias;
        # stop_before_pixels=True reads the header only
        c_dicom = pydicom.dcmread(in_path, stop_before_pixels=True)
        tag_dict = {c_tag: getattr(c_dicom, c_tag, '') for c_tag in DCM_TAG_LIST}
        tag_dict['path'] = in_path
        return pd.Series(tag_dict)

    image_meta_df = image_df.apply(lambda x: get_tags(x['path']), axis=1)
    image_meta_df['PatientAge'] = image_meta_df['PatientAge'].map(int)

    # merge into one DataFrame
    image_full_df = pd.merge(image_df, image_meta_df, on='path')
    image_bbox_df = pd.merge(comb_bbox_df, image_full_df, on='patientId', how='left')

    # make (image_path, label) lists - one entry per label row
    # NOTE(review): a patient with several label rows contributes the same image
    # path several times, so one image can end up in both train and validation.
    class_to_label = {'Normal': 0, 'Lung Opacity': 1, 'No Lung Opacity / Not Normal': 2}
    image_paths = image_bbox_df['path'].tolist()
    labels = [class_to_label[class_name] for class_name in image_bbox_df['class']]

    # shuffle paths and labels with the same permutation
    n_items = len(image_paths)
    if seed is None:
        order = np.random.permutation(n_items)
    else:
        order = np.random.RandomState(seed).permutation(n_items)
    image_paths_sh = [image_paths[i] for i in order]
    labels_sh = [labels[i] for i in order]

    # split the data into train/val (first 80% / remaining 20%)
    split_at = int(round(n_items * 0.8))
    image_paths_train = image_paths_sh[:split_at]
    labels_train = labels_sh[:split_at]
    image_paths_val = image_paths_sh[split_at:]
    labels_val = labels_sh[split_at:]

    return image_paths_train, labels_train, image_paths_val, labels_val

In [None]:
# Load the stage '1' data with test_only_part=False and unpack the training and validation path/label lists
image_paths_train, labels_train, image_paths_val, labels_val = load_cxray_list(ROOT_DIR, '1', False)

In [None]:
# Exercise 3: Plot the number of instances for each class in the training set
# ('Normal': 0, 'Lung Opacity': 1, 'No Lung Opacity / Not Normal': 2)

In [None]:
# Exercise 4: Plot the number of instances for each class in the validation set

In [None]:
# Exercise 5: Get the mean and std for the images in the training set

In [None]:
# Normalize the images, and print the mean and std for the images in the training set

In [9]:
# Use the `sns` alias from the import cell at the top of the notebook: the bare
# name `seaborn` is only imported in a cell further down, so referring to it here
# fails when the notebook is run from top to bottom.
sns.__version__

'0.9.0'

In [8]:
import seaborn