# Read in data

In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pydicom
import pylab
import pandas as pd
from glob import glob
import os.path as op
from sklearn import decomposition

np.random.seed(10000)

class_info_path = '../input/stage_1_detailed_class_info.csv'
train_labels_path = '../input/stage_1_train_labels.csv'
images_dir = '../input/stage_1_train_images/'

# data frames
class_info_df = pd.read_csv(class_info_path)
train_labels_df = pd.read_csv(train_labels_path)
images_df = pd.DataFrame({'path': glob(op.join(images_dir, '*.dcm'))})
images_df['patientId'] = images_df['path'].map(lambda x: op.splitext(op.basename(x))[0])
# parse DICOM header into dataframe
DICOM_TAGS = ['PatientAge', 'ViewPosition', 'PatientSex']
def get_tags(image_path):
    tag_data = pydicom.read_file(image_path, stop_before_pixels = True)
    tag_dict = {tag: getattr(tag_data, tag, '') for tag in DICOM_TAGS}
    tag_dict['path'] = image_path
    return pd.Series(tag_dict)
meta_df = images_df.apply(lambda x: get_tags(x['path']), 1)
meta_df['PatientAge'] = meta_df['PatientAge'].map(int)
meta_df.drop('path', 1).describe(exclude=[np.number])

# concatenate the data frames
info_df = pd.concat([class_info_df, train_labels_df.drop('patientId', 1)], 1)
image_with_meta_df = pd.merge(images_df, meta_df, on='path')
bbox_with_info_df = pd.merge(info_df, image_with_meta_df, on='patientId', how='left')

# parse data into a dictionary by patientId
def parse_patient_data(df):
    """
    Parse pandas dataframe into the following dictionary:
      data = {
        patientID: {
          'dicom': path/to/dicom/file,
          'target': 0 if normal, 1 if pneumonia,
          'boxes': list of box(es), each box is an array of number [x y width height],
          'class': one of the three values 'Lung Opacity', 'No Lung Opacity / Not Norma', 'Normal',
          'age': age of the patient,
          'view': either 'AP' - anteriorposterior, or 'PA' - posterioranterior,
          'sex': either 'Male' or 'Female'
        },
        ...
      }
    """
    
    extract_box = lambda row: [row['x'], row['y'], row['width'], row['height']]
    
    data = {}
    for n, row in df.iterrows():
        pid = row['patientId']
        if pid not in data:
            data[pid] = {
                'dicom': '%s/%s.dcm' % (images_dir, pid),
                'target': row['Target'],
                'class': row['class'],
                'age': row['PatientAge'],
                'view': row['ViewPosition'],
                'sex': row['PatientSex'],
                'boxes': []}
            
        if data[pid]['target'] == 1:
            data[pid]['boxes'].append(extract_box(row))
    return data

patients_data = parse_patient_data(bbox_with_info_df)
patient_ids = list(patients_data.keys())
print(patients_data[np.random.choice(patient_ids)])

{'dicom': '../input/stage_1_train_images//677b6db0-31ab-47d4-bb96-5cd301c8f6ac.dcm', 'target': 0, 'class': 'Normal', 'age': 11, 'view': 'PA', 'sex': 'F', 'boxes': []}


# Get image matrix

In [24]:
num_samples = 100
im_dim = 1024
im_matrix = np.zeros((im_dim * im_dim, num_samples))
for i in range(num_samples):
    pid = patient_ids[i]
    pdata = patients_data[pid]
    d = pydicom.read_file(pdata['dicom'])
    im = d.pixel_array
    
    im_matrix[:, i] = im.flatten()

# Run PCA

In [39]:
def visualize_component(component, im_dim):
    im = component.reshape((im_dim, im_dim))
    im = np.stack([im] * 3, axis = 2)
    
    pylab.imshow(im, cmap = pylab.cm.gist_gray)

pca = decomposition.FastICA(n_components = 10)
pca.fit(im_matrix)
transformed_im_matrix = pca.transform(im_matrix)

print(im_matrix.shape)
print(transformed_im_matrix.shape)
print(np.amax(transformed_im_marix))
print(np.amin(transformed_im_marix))

for i in range(10):
    plt.figure()
    visualize_component(transformed_im_matrix[:, i], im_dim)

(1048576, 100)
(1048576, 10)


NameError: name 'amax' is not defined