## Kaggle Competition: Passenger Screening Algorithm Challenge

###  Data preparation and analysis

In [None]:
#from __future__ import print_function
#from __future__ import division

import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.animation as anim
%matplotlib inline
import cv2
import glob
from os.path import basename
import pandas as pd
import h5py

# get package versions
def get_version(*module_names):
    """Print one ``name: version`` line for every module name given.

    Parameters
    ----------
    *module_names : str
        Importable module names, e.g. ``'numpy'``.

    Raises
    ------
    ImportError
        If one of the modules cannot be imported.
    """
    # local import keeps this cell independent of the notebook's import cell
    import importlib

    for name in module_names:
        # import_module returns the module that was asked for, also for
        # dotted names (``__import__`` would return the top-level package)
        module = importlib.import_module(name)
        # not every module defines __version__; report that instead of
        # aborting the whole listing with an AttributeError
        version = getattr(module, '__version__', 'unknown')
        print('%s: %s' % (name, version))
    
# print the versions of the main third-party packages used in this notebook
get_version('numpy','matplotlib','cv2')

### settings

In [None]:
# colormap used when plotting sample scans
COLORMAP = 'pink'

# data folder (ends with a slash, so file names are appended directly)
path2data='../data/'

# select data extension
extension='*.aps'

# path to raw data
# raw .aps files are located here
path2rawData=path2data

# stage 1 labels
# no leading slash: path2data already ends with one
path2_stage1_labels=path2data+"stage1_labels.csv"

# sample submission
path2_stage1_samplesubmission=path2data+"stage1_sample_submission.csv"
path2_stage2_samplesubmission=path2data+"stage2_sample_submission.csv"

# outputs are stored here
path2stage1_train=path2data+'stage1_train.hdf5'
path2stage1_test=path2data+'stage1_test.hdf5'
path2stage2_test=path2data+'stage2_test.hdf5'

# body zone photo
path2bodyzone=path2data+'body_zones.png'

# body zone descriptions
# zone orders in the csv file are like this
# (string-sorted, so 'zone10' comes right after 'zone1')
zones=['zone1','zone10','zone11','zone12',
       'zone13','zone14','zone15','zone16',
       'zone17','zone2','zone3','zone4',
       'zone5','zone6','zone7','zone8','zone9']

# human-readable description of every zone, keyed by the names in `zones`
body_zone_desc={
        'zone1' :'Right Bicep',
        'zone10':'Upper left Hip/thigh',
        'zone11':'Lower Right Thigh',
        'zone12':'Lower left Thigh',
        'zone13':'Right Calf',
        'zone14':'Left Calf(below knee)',
        'zone15':'Right Ankle Bone',
        'zone16':'Left Ankle Bone',
        'zone17':'Upper Back',    
        'zone2':'Right Forearm',
        'zone3':'Left Bicep',
        'zone4':'Left Forearm',
        'zone5':'Upper Chest',
        'zone6':'Right Rib Cage and Abs',
        'zone7':'Left Side Rib Cage and Abs',
        'zone8':'Upper Right Hip/Tigh',
        'zone9':'Groin (Sensetive area)'
        } 

In [None]:
# show the threat zones
# load the reference picture of the body zones from disk
body_zones_img = plt.imread(path2bodyzone)
fig, ax = plt.subplots(figsize=(15,15))
# the trailing semicolon keeps the notebook from echoing the returned object
ax.imshow(body_zones_img);


### Read header

In [None]:
# Header layout in file order: (field name, numpy dtype, number of items).
# Fields declared as 'S1' are character arrays; they are joined into a single
# bytes value, every other field is kept as the numpy array that was read.
_HEADER_FIELDS = (
    ('filename', 'S1', 20),
    ('parent_filename', 'S1', 20),
    ('comments1', 'S1', 80),
    ('comments2', 'S1', 80),
    ('energy_type', np.int16, 1),
    ('config_type', np.int16, 1),
    ('file_type', np.int16, 1),
    ('trans_type', np.int16, 1),
    ('scan_type', np.int16, 1),
    ('data_type', np.int16, 1),
    ('date_modified', 'S1', 16),
    ('frequency', np.float32, 1),
    ('mat_velocity', np.float32, 1),
    ('num_pts', np.int32, 1),
    ('num_polarization_channels', np.int16, 1),
    ('spare00', np.int16, 1),
    ('adc_min_voltage', np.float32, 1),
    ('adc_max_voltage', np.float32, 1),
    ('band_width', np.float32, 1),
    ('spare01', np.int16, 5),
    ('polarization_type', np.int16, 4),
    ('record_header_size', np.int16, 1),
    ('word_type', np.int16, 1),
    ('word_precision', np.int16, 1),
    ('min_data_value', np.float32, 1),
    ('max_data_value', np.float32, 1),
    ('avg_data_value', np.float32, 1),
    ('data_scale_factor', np.float32, 1),
    ('data_units', np.int16, 1),
    ('surf_removal', np.uint16, 1),
    ('edge_weighting', np.uint16, 1),
    ('x_units', np.uint16, 1),
    ('y_units', np.uint16, 1),
    ('z_units', np.uint16, 1),
    ('t_units', np.uint16, 1),
    ('spare02', np.int16, 1),
    ('x_return_speed', np.float32, 1),
    ('y_return_speed', np.float32, 1),
    ('z_return_speed', np.float32, 1),
    ('scan_orientation', np.int16, 1),
    ('scan_direction', np.int16, 1),
    ('data_storage_order', np.int16, 1),
    ('scanner_type', np.int16, 1),
    ('x_inc', np.float32, 1),
    ('y_inc', np.float32, 1),
    ('z_inc', np.float32, 1),
    ('t_inc', np.float32, 1),
    ('num_x_pts', np.int32, 1),
    ('num_y_pts', np.int32, 1),
    ('num_z_pts', np.int32, 1),
    ('num_t_pts', np.int32, 1),
    ('x_speed', np.float32, 1),
    ('y_speed', np.float32, 1),
    ('z_speed', np.float32, 1),
    ('x_acc', np.float32, 1),
    ('y_acc', np.float32, 1),
    ('z_acc', np.float32, 1),
    ('x_motor_res', np.float32, 1),
    ('y_motor_res', np.float32, 1),
    ('z_motor_res', np.float32, 1),
    ('x_encoder_res', np.float32, 1),
    ('y_encoder_res', np.float32, 1),
    ('z_encoder_res', np.float32, 1),
    ('date_processed', 'S1', 8),
    ('time_processed', 'S1', 8),
    ('depth_recon', np.float32, 1),
    ('x_max_travel', np.float32, 1),
    ('y_max_travel', np.float32, 1),
    ('elevation_offset_angle', np.float32, 1),
    ('roll_offset_angle', np.float32, 1),
    ('z_max_travel', np.float32, 1),
    ('azimuth_offset_angle', np.float32, 1),
    ('adc_type', np.int16, 1),
    ('spare06', np.int16, 1),
    ('scanner_radius', np.float32, 1),
    ('x_offset', np.float32, 1),
    ('y_offset', np.float32, 1),
    ('z_offset', np.float32, 1),
    ('t_delay', np.float32, 1),
    ('range_gate_start', np.float32, 1),
    ('range_gate_end', np.float32, 1),
    ('ahis_software_version', np.float32, 1),
    ('spare_end', np.float32, 10),
)


def read_header(infile):
    """Read the fixed-layout header found at the start of a scan file.

    Parameters
    ----------
    infile : str
        Path of the file to read.

    Returns
    -------
    dict
        One entry per header field, in file order.  Character fields are
        ``bytes``; every other field is the numpy array returned by
        ``np.fromfile`` (one element for scalar fields).

    Notes
    -----
    The file is opened read-only and is closed before returning, also when
    reading fails.
    NOTE(review): the fields listed in ``_HEADER_FIELDS`` add up to 532
    bytes, while this function used to be documented as reading the first
    512 bytes -- confirm against the file format specification.
    """
    h = dict()
    with open(infile, 'rb') as fid:
        for name, dtype, count in _HEADER_FIELDS:
            values = np.fromfile(fid, dtype = dtype, count = count)
            if dtype == 'S1':
                # one bytes value instead of an array of single characters
                h[name] = b''.join(values)
            else:
                h[name] = values
    return h

# unit test ----------------------------------
#APS_FILE_NAME = path+'/00360f79fd6e02781457eda48f85da90.aps'
#header = read_header(APS_FILE_NAME)

#for data_item in sorted(header):
    #print ('{} -> {}'.format(data_item, header[data_item]))
    
def dispSampleSubject(X,y_zone):
    """Plot 16 channels of one randomly picked subject and print its labelled zones.

    X is indexed as X[subject, channel] (the original note gives its shape as
    N*C*H*W) and y_zone as y_zone[subject, zone].  Returns the index of the
    subject that was picked.
    """
    picked=np.random.randint(len(X))
    print ('subject: %s' %picked)

    # 4x4 grid, one panel per channel
    plt.figure(figsize=(15,15))
    for panel,channel in enumerate(range(16), 1):
        plt.subplot(4,4,panel)
        plt.imshow(X[picked,channel],cmap='gray')

    # report every zone whose label is non-zero for this subject
    flagged=np.nonzero(y_zone[picked,:])[0]
    for zone_idx in flagged:
        zone_name=zones[zone_idx]
        print ('%s: %s' %(zone_name,body_zone_desc[zone_name]))
    return picked

### Read image data

In [None]:
def read_data(infile):
    """Read any of the 4 types of image files (.aps, .a3daps, .a3d, .ahi).

    Parameters
    ----------
    infile : str
        Path of the file; the file type is taken from its extension.

    Returns
    -------
    numpy.ndarray or tuple of numpy.ndarray
        For .aps and .a3daps an array of shape (nx, ny, nt), for .a3d an array
        of shape (nx, nt, ny), both multiplied by the header's
        ``data_scale_factor``.  For .ahi a ``(real, imag)`` pair, each of
        shape (ny, nx, nt).

    Raises
    ------
    ValueError
        If the extension is not one of the four above, or if the header's
        ``word_type`` is neither 7 (float32) nor 4 (uint16).
    """
    extension = os.path.splitext(infile)[1]
    if extension not in ('.aps', '.a3daps', '.a3d', '.ahi'):
        raise ValueError('unsupported file extension: %r' % (extension,))

    h = read_header(infile)
    # header values are 1-element arrays; take the scalar out explicitly
    nx = int(h['num_x_pts'][0])
    ny = int(h['num_y_pts'][0])
    nt = int(h['num_t_pts'][0])

    # the context manager closes the file on every path, errors included
    with open(infile, 'rb') as fid:
        fid.seek(512) #skip header
        if extension == '.ahi':
            data = np.fromfile(fid, dtype = np.float32, count = 2* nx * ny * nt)
            data = data.reshape(2, ny, nx, nt, order='F').copy()
            real = data[0,:,:,:].copy()
            imag = data[1,:,:,:].copy()
            return real, imag

        word_type = int(h['word_type'][0])
        if word_type == 7: #float32
            sample_dtype = np.float32
        elif word_type == 4: #uint16
            sample_dtype = np.uint16
        else:
            raise ValueError('unsupported word_type: %r' % (word_type,))
        data = np.fromfile(fid, dtype = sample_dtype, count = nx * ny * nt)

    data = data * h['data_scale_factor'] #scaling factor
    # .a3d files keep the last two axes in the opposite order
    if extension == '.a3d':
        shape = (nx, nt, ny)
    else:
        shape = (nx, ny, nt)
    return data.reshape(shape, order='F').copy() #make N-d image
    
    

# set the 'html' option of matplotlib's 'animation' rc group to 'html5'
matplotlib.rc('animation', html='html5')

def plot_image(path):
    """Build an animation that steps through the slices of the scan at `path`.

    Every frame shows one slice along the last axis of the array returned by
    read_data, transposed and flipped vertically; frames are 200 ms apart.
    Returns the matplotlib FuncAnimation object.
    """
    volume = read_data(path)
    frame_count = volume.shape[2]

    figure = plt.figure(figsize = (8,8))
    axes = figure.add_subplot(111)

    def draw_frame(frame_idx):
        # transpose + flipud puts the slice in display orientation
        frame = np.flipud(volume[:,:,frame_idx].transpose())
        return [axes.imshow(frame, cmap = 'viridis')]

    return anim.FuncAnimation(figure, draw_frame, frames=range(0,frame_count), interval=200, blit=True)

def dump_to_video(dbpath, video_path):
    """Write the slices of the scan at `dbpath` to an MJPG video at 2 frames/s.

    Parameters
    ----------
    dbpath : str
        Scan file, read with read_data; the array is indexed [:, :, frame].
    video_path : str
        Output video file.

    Raises
    ------
    IOError
        If the video file cannot be opened for writing (for example because
        its folder does not exist).
    """
    data = read_data(dbpath)
    w, h, n = data.shape
    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
    out = cv2.VideoWriter(video_path, fourcc, 2.0, (w, h))
    if not out.isOpened():
        out.release()
        raise IOError('could not open video file for writing: %s' % video_path)

    try:
        for i in range(n):
            img = np.flipud(data[:,:,i].transpose())
            # a fresh Normalize per frame: each frame is scaled to its own range
            norm = plt.Normalize()
            img = plt.cm.viridis(norm(img))
            img = (255.0 * img).astype(np.uint8)
            # the colormap returns RGBA; the writer takes 3-channel BGR frames,
            # so drop alpha and reverse the channel order
            out.write(np.ascontiguousarray(img[:, :, 2::-1]))
    finally:
        # always finalize the video file, also when a frame fails
        out.release()
    
def convert_to_grayscale(img):
    """Linearly rescale `img` so its minimum maps to 0 and its maximum to 255.

    Parameters
    ----------
    img : array-like
        Non-empty numeric image.

    Returns
    -------
    numpy.ndarray
        uint8 array of the same shape.  A constant image (no value range)
        gives all zeros.
    """
    img = np.asarray(img)
    # integer images would overflow in (img - min) * 255; float images keep
    # their own precision so their results are unchanged
    if not np.issubdtype(img.dtype, np.floating):
        img = img.astype(np.float64)

    lowest = np.amin(img)
    base_range = np.amax(img) - lowest
    if base_range == 0:
        # nothing to stretch; avoids a division by zero
        return np.zeros(img.shape, dtype=np.uint8)

    rescaled_range = 255 - 0
    img_rescaled = ((img - lowest) * rescaled_range) / base_range

    return np.uint8(img_rescaled)

### Load list of subjects and labels

In [None]:
# prefix of the raw-data folder name for this stage
stagenn='stage1_'

# get list of files
# extension[2:] drops the leading '*.' of the glob pattern, so the folder is
# named <stage prefix><extension without dot> and the pattern matches inside it
path2raw1=path2rawData+stagenn+extension[2:]+'/'+extension
print('looking into '+path2raw1) 
list_of_files=glob.glob(path2raw1)
print ('number of files: %s' %(len(list_of_files)))

# read labels
# the test subjects are taken from the sample submission file
stage1_train=pd.read_csv(path2_stage1_labels)
stage1_test=pd.read_csv(path2_stage1_samplesubmission)


# number of zones
nb_zones=17
# each subject contributes nb_zones rows, hence rows / nb_zones subjects
print ('stage 1 train labels:', len(stage1_train))
print ('stage 1 train subjects:', len(stage1_train)/nb_zones)
print ('stage 1 test labels:', len(stage1_test))
print ('stage 1 test subjects: ', len(stage1_test)/nb_zones)

# labels
Probability=stage1_train.Probability
y=np.array(Probability)

# one row per subject, one column per zone
# assumes the csv rows of a subject are contiguous and ordered like `zones` -- TODO confirm
n1=int(len(y)/nb_zones)
y=np.reshape(y,(n1,nb_zones))
print ('labels shape: '+str( y.shape))


# stats of labels
#nb_y_perclass=np.count_nonzero(y,axis=0)
#print ('labels per zone: %s' %nb_y_perclass)
#print ('labels per zone pcn: %s' %(1.*nb_y_perclass/n1*100.))
#plt.figure(figsize=(15,5))
#plt.stem((1.*nb_y_perclass/n1*100.))
#plt.xticks(range(nb_zones), zones)
#plt.title('# labels per zone')
#plt.xlabel('zones')
#plt.ylabel('# labels %')
#plt.show()

### Playing with data: fetch and display sample video/images


In [None]:
# pick one raw file at random
path2image=np.random.choice(list_of_files)

# get file name
file_name=basename(path2image)
print ('sample video: '+ file_name)

# dump as video
path2video='./output/videos/'+file_name+'.avi'
#dump_to_video(path2image,path2video)

#rnd_num=np.random.randint(len(list_of_files))
#path2image=list_of_files[rnd_num]
X= read_data(path2image)
# reversed axis order, so Xcp[k] is the k-th slice along X's last axis
Xcp=X.transpose()
print ('video id, shape: ', file_name, X.shape)

# get label
# row of the subject's '_Zone1' entry in the label table, turned into a subject
# index; floor division keeps it an integer (true division yields floats,
# which cannot be used to index y).  file_name[:-4] drops the '.aps' suffix.
# The result is empty when the file is not part of the labelled set.
sbj_num=stage1_train.index[stage1_train.Id==file_name[:-4]+'_Zone1']//nb_zones
print (sbj_num)
print (y.shape)
print (y[sbj_num,:])


# zones with objects
# y[sbj_num,:] is 2-D because sbj_num is an index array, so the zone indices
# are the second array returned by nonzero
nz_label=np.nonzero(y[sbj_num,:])[1]
for nz_l in nz_label:
    print ('%s: %s' %(zones[nz_l],body_zone_desc[zones[nz_l]]))

plt.figure(figsize=(15,15))

for k in range(16):
    plt.subplot(4,4,k+1)
    #plt.imshow(X[:,:,k],cmap='gray')
    Xk=np.flipud(Xcp[k])
    plt.imshow(Xk,cmap=COLORMAP)
    

### playing with data: thresholding 

In [None]:
%%script false 
# this part is not needed
img = convert_to_grayscale(Xk)
# global thresholding
ret1,th1 = cv2.threshold(img,127,255,cv2.THRESH_BINARY)
# Otsu's thresholding
ret2,th2 = cv2.threshold(img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
# Otsu's thresholding after Gaussian filtering
blur = cv2.GaussianBlur(img,(5,5),0)
ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
# plot all the images and their histograms
images = [img, 0, th1,
          img, 0, th2,
          blur, 0, th3]
titles = ['Original Noisy Image','Histogram','Global Thresholding (v=127)',
          'Original Noisy Image','Histogram',"Otsu's Thresholding",
          'Gaussian filtered Image','Histogram',"Otsu's Thresholding"]
plt.figure(figsize=(15,10))
for i in xrange(3):
    plt.subplot(3,3,i*3+1),plt.imshow(images[i*3],'gray')
    plt.title(titles[i*3]), plt.xticks([]), plt.yticks([])
    plt.subplot(3,3,i*3+2),plt.hist(images[i*3].ravel(),256)
    plt.title(titles[i*3+1]), plt.xticks([]), plt.yticks([])
    plt.subplot(3,3,i*3+3),plt.imshow(images[i*3+2],'gray')
    plt.title(titles[i*3+2]), plt.xticks([]), plt.yticks([])
plt.show()

## Load stage1 train data and store it as HDF5

In [None]:
prefix='stage1_train'
stagenn='stage1_'

# location to hdf5 data
path2hdf5=path2data+prefix+'.hdf5'
print('checking local file: %s' %path2hdf5)
if not os.path.exists(path2hdf5):
    # 'w-' creates the file and refuses to overwrite an existing one
    sg1_h5=h5py.File(path2hdf5,'w-')
    try:
        # one subject every nb_zones rows of the label table
        for k,row_num in enumerate(range(0,len(stage1_train),nb_zones)):
            # drop the 6-character zone suffix of the Id to get the subject id
            subject_id=stage1_train.Id[row_num][:-6]
            print ('subject id: %s %s' %(k, subject_id))
            path2image=path2rawData+stagenn+extension[2:]+'/'+subject_id+extension[1:]
            X= read_data(path2image)
            print (X.shape,X.dtype,np.min(X),np.max(X))
            sg1_h5[subject_id]=X
        sg1_h5['labels']=y
    finally:
        # flush and close even when a subject fails to load
        sg1_h5.close()
    # reopen read-only so sg1_h5 is usable afterwards in both branches
    sg1_h5=h5py.File(path2hdf5,'r')
else:
    ### verify hf5
    print(path2hdf5+' exist!')
    sg1_h5=h5py.File(path2hdf5,'r')
    print ('number of files in %s: %s' %(prefix,len(sg1_h5.keys())))
    print ('labels:', sg1_h5['labels'].shape)
    # keys() is a view on Python 3 and cannot be indexed directly
    id0=list(sg1_h5.keys())[0]
    print (sg1_h5[id0].shape)

### playing with data: visualization

In [None]:
# subject ids only: the 'labels' dataset lives in the same file but has no
# raw scan, so it must not be picked below; a list can be indexed on
# Python 2 and 3 alike
keys=[key for key in sg1_h5.keys() if key!='labels']

### sample image and label
sbj_num=np.random.randint(len(keys))

subject_id=keys[sbj_num]
print ('subject id: '+ subject_id)

path2image=path2rawData+stagenn+extension[2:]+'/'+subject_id+extension[1:]

X= read_data(path2image)
#Xcp=X.transpose()
print ('subject, shape: ',sbj_num, X.shape)
plt.figure(figsize=(15,15))

# first 16 slices along the last axis, on a 4x4 grid
for k in range(16):
    plt.subplot(4,4,k+1)
    #Xk=np.flipud(Xcp[k])
    plt.imshow(X[:,:,k],cmap='gray')
    

## Load stage1 test data and save it as HDF5

In [None]:
prefix='stage1_test'
stagenn="stage1_"

# location of hdf5
path2hdf5=path2data+prefix+'.hdf5'
print('checking: '+path2hdf5)
if not os.path.exists(path2hdf5):
    # 'w-' creates the file and refuses to overwrite an existing one
    sg1_h5=h5py.File(path2hdf5,'w-')
    try:
        # one subject every nb_zones rows of the sample submission
        for k,row_num in enumerate(range(0,len(stage1_test),nb_zones)):
            # drop the 6-character zone suffix of the Id to get the subject id
            subject_id=stage1_test.Id[row_num][:-6]
            print ('subject id: %s %s' %(k, subject_id))
            path2image=path2rawData+stagenn+extension[2:]+'/'+subject_id+extension[1:]
            X= read_data(path2image)
            print (X.shape,X.dtype,np.min(X),np.max(X))
            sg1_h5[subject_id]=X
        #sg1_h5['labels']=y # there is no label 
    finally:
        # flush and close even when a subject fails to load
        sg1_h5.close()
    # reopen read-only so sg1_h5 is usable afterwards in both branches
    sg1_h5=h5py.File(path2hdf5,'r')
else:
    ### verify hf5
    print(path2hdf5+' exist!')
    sg1_h5=h5py.File(path2hdf5,'r')
    print ('number of files in %s: %s' %(prefix,len(sg1_h5.keys())))
    # keys() is a view on Python 3 and cannot be indexed directly
    id0=list(sg1_h5.keys())[0]
    print (sg1_h5[id0].shape)
    
    # labels are optional here; test explicitly instead of hiding every error
    if 'labels' in sg1_h5:
        print ('labels:', sg1_h5['labels'].shape)

### Visualization

In [None]:
# a list, because the keys view cannot be indexed on Python 3
keys=list(sg1_h5.keys())

### sample image and label
sbj_num=np.random.randint(len(keys))

subject_id=keys[sbj_num]
print ('subject id: '+ subject_id)

path2image=path2rawData+stagenn+extension[2:]+'/'+subject_id+extension[1:]

X= read_data(path2image)
#Xcp=X.transpose()
print ('subject, shape: ',sbj_num, X.shape)
plt.figure(figsize=(15,15))

#Xr=cv2.resize(X,(w,h),interpolation = cv2.INTER_CUBIC)

# first 16 slices along the last axis, on a 4x4 grid
for k in range(16):
    plt.subplot(4,4,k+1)
    #Xk=np.flipud(Xcp[k])
    plt.imshow(X[:,:,k],cmap='gray')
    

## Stage 2        

In [None]:
# stage 2 sample submission
# its Id column is used below to list the stage 2 subjects
stage2_test=pd.read_csv(path2_stage2_samplesubmission)
# last expression of the cell: the notebook shows the first rows
stage2_test.head()

In [None]:
prefix='stage2_test'
stagenn='stage2_'

# location of hdf5
path2hdf5=path2data+prefix+'.hdf5'
print(path2hdf5)

if not os.path.exists(path2hdf5):
    # 'w-' creates the file and refuses to overwrite an existing one
    sg2_h5=h5py.File(path2hdf5,'w-')
    try:
        # one subject every nb_zones rows of the sample submission
        for k,row_num in enumerate(range(0,len(stage2_test),nb_zones)):
            # drop the 6-character zone suffix of the Id to get the subject id
            subject_id=stage2_test.Id[row_num][:-6]
            print ('subject id: %s %s' %(k, subject_id))
            path2image=path2rawData+stagenn+extension[2:]+'/'+subject_id+extension[1:]
            X= read_data(path2image)
            print (X.shape,X.dtype,np.min(X),np.max(X))
            sg2_h5[subject_id]=X
        #sg2_h5['labels']=y # there is no label 
    finally:
        # flush and close even when a subject fails to load
        sg2_h5.close()
    # reopen read-only: the following cell reads sg2_h5 in both branches
    sg2_h5=h5py.File(path2hdf5,'r')
else:
    ### verify hf5
    print(path2hdf5+' exist!')
    sg2_h5=h5py.File(path2hdf5,'r')
    print ('number of files in %s: %s' %(prefix,len(sg2_h5.keys())))
    # keys() is a view on Python 3 and cannot be indexed directly
    id0=list(sg2_h5.keys())[0]
    print (sg2_h5[id0].shape)
    
    # labels are optional here; look in the stage 2 file itself
    if 'labels' in sg2_h5:
        print ('labels:', sg2_h5['labels'].shape)

In [None]:
# a list, because the keys view cannot be indexed on Python 3
keys=list(sg2_h5.keys())

### sample image and label
sbj_num=np.random.randint(len(keys))

subject_id=keys[sbj_num]
print ('subject id: '+ subject_id)

path2image=path2rawData+stagenn+extension[2:]+'/'+subject_id+extension[1:]

X= read_data(path2image)
#Xcp=X.transpose()
print ('subject, shape: ',sbj_num, X.shape)
plt.figure(figsize=(15,15))

#Xr=cv2.resize(X,(w,h),interpolation = cv2.INTER_CUBIC)

# first 16 slices along the last axis, on a 4x4 grid
for k in range(16):
    plt.subplot(4,4,k+1)
    #Xk=np.flipud(Xcp[k])
    plt.imshow(X[:,:,k],cmap='gray')
    

##  downsample/stack data into one array 

In [None]:
def get_data(data_type='stage1_train', size=None):
    """Load every subject of one HDF5 store, resize it and stack the results.

    Parameters
    ----------
    data_type : str
        'stage1_train', 'stage1_leader' or 'stage2_leader'; selects the
        store to read (path2stage1_train, path2stage1_test, path2stage2_test).
    size : tuple, optional
        ``(h, w)``; every subject is resized with ``cv2.resize`` using
        ``dsize=(w, h)``.  When omitted, the notebook-level variables ``h``
        and ``w`` are used, as before.

    Returns
    -------
    X : numpy.ndarray
        The resized subjects stacked along a new first axis, with the last
        axis then moved to position 1.
    y : numpy.ndarray
        uint8 labels: the store's 'labels' dataset for 'stage1_train',
        zeros otherwise.
    ids : numpy.ndarray
        The subject ids, in the order used for X.

    Raises
    ------
    ValueError
        If `data_type` is not one of the three supported values.
    """
    # number of zones
    nb_zones=17

    sources={'stage1_train':path2stage1_train,
             'stage1_leader':path2stage1_test,
             'stage2_leader':path2stage2_test}
    if data_type not in sources:
        raise ValueError('unknown data_type: %r' % (data_type,))

    # fall back to the notebook-level h, w when no size is given
    height,width=(h,w) if size is None else size

    ff=h5py.File(sources[data_type],'r')
    try:
        # get subject ids: every key except the 'labels' dataset
        ids=np.array([key for key in ff.keys() if key!='labels'])
        if data_type=='stage1_train':
            # dataset[()] reads the whole dataset into memory
            y=ff['labels'][()]
        else:
            # NOTE(review): placeholder labels are (nb_zones, n) whereas the
            # train labels are stored as (n, nb_zones) -- confirm which
            # layout consumers of the leaderboard files expect
            y=np.zeros((nb_zones,len(ids)),'uint8')

        X=[]
        for k,id1 in enumerate(ids):
            print (k,id1)
            X0= ff[id1][()]
            X0=cv2.resize(X0,(width,height),interpolation = cv2.INTER_CUBIC)
            X.append(X0)
    finally:
        # everything needed is in memory by now
        ff.close()
    X=np.stack(X)
    X=np.transpose(X,(0,3,1,2))
    return X,y.astype('uint8'),ids

# this was to rotate images
def rotateData(data_type,size,rotationAngle=180):
    """Store a copy of a stacked data file with every image rotated.

    Reads ``<path2data><data_type>_<h>by<w>.hdf5`` and writes
    ``<path2data><data_type><rotationAngle>_<h>by<w>.hdf5`` holding the same
    'y' and 'ids' and an 'X' in which every X[k, c] image went through
    ``np.rot90`` with ``rotationAngle // 90`` quarter turns.  If the rotated
    file already exists only the shape of its 'X' is printed.

    Parameters
    ----------
    data_type : str
        Prefix of the file names.
    size : tuple
        ``(h, w)`` as used in the file names.
    rotationAngle : int, optional
        Rotation in degrees; only whole multiples of 90 are meaningful.

    Raises
    ------
    IOError
        If the rotated file has to be built and the source file is missing.
    """
    h,w=size
    path2traintest=path2data+data_type+'_'+str(h)+'by'+str(w)+'.hdf5'
    path2traintestR=path2data+data_type+str(rotationAngle)+'_'+str(h)+'by'+str(w)+'.hdf5'

    if os.path.exists(path2traintestR):
        print(path2traintestR +' rotated data exists!')
        with h5py.File(path2traintestR,'r') as ff_traintestR:
            print ('X shape', ff_traintestR['X'].shape)
        return

    if not os.path.exists(path2traintest):
        raise IOError('source data does not exist!')

    print('loading %s' %path2traintest)
    print('storing %s' %path2traintestR)
    print('wait ...')
    # integer number of quarter turns (floor division also on Python 3)
    quarter_turns=(rotationAngle//90)%4

    with h5py.File(path2traintest,'r') as ff_traintest:
        # X stays on disk; it is read one subject at a time below
        X=ff_traintest['X']
        if 'y' in ff_traintest:
            y=ff_traintest['y'][()]
        else:
            y=np.zeros(len(X))
        ids=ff_traintest['ids'][()]

        ## rotate
        n,c,src_h,src_w=X.shape
        if quarter_turns%2==0:
            Xr=np.zeros((n,c,src_h,src_w),dtype=X.dtype)
        else:
            # an odd number of quarter turns swaps height and width
            Xr=np.zeros((n,c,src_w,src_h),dtype=X.dtype)
        for k in range(n):
            # one read per subject instead of one per image
            subject=X[k]
            for k2 in range(c):
                Xr[k,k2]=np.rot90(subject[k2],quarter_turns)

    with h5py.File(path2traintestR,'w') as ff_traintestR:
        ff_traintestR['X']=Xr
        ff_traintestR['y']=y
        # fixed-width byte strings, a dtype that can be stored on Python 2 and 3
        ff_traintestR['ids']=np.array(ids,dtype='S')
    print ('hdf5 saved!')


### stage1 train  

In [None]:
# downsample/stack data into one array for train-validation
data_type="stage1_train"
prefix="stage1_traintest"

# target size; get_data reads these notebook-level variables
h,w=256,330
path2traintest=path2data+prefix+'_'+str(h)+'by'+str(w)+'.hdf5'
if not os.path.exists(path2traintest):
    print ('wait ...')
    X,y,ids=get_data(data_type)
    # the context manager closes the file even if a write fails
    with h5py.File(path2traintest,'w') as ff_traintest:
        ff_traintest['X']=X
        ff_traintest['y']=y
        # fixed-width byte strings, a dtype that can be stored on Python 2 and 3
        ff_traintest['ids']=np.array(ids,dtype='S')
    print ('hdf5 saved!')
else:
    print(path2traintest+ " exists!")
    # load train-test data
    # the file stays open on purpose: X is a handle into it, not a copy
    ff_traintest=h5py.File(path2traintest,'r')
    ids=ff_traintest['ids'][()]
    X=ff_traintest['X']
    print ('X shape', X.shape)    
    print('-'*50)
    
# we also want rotated data
rotationAngles=[90,180,270]
for rotAng in rotationAngles:    
    rotateData(prefix,(h,w),rotAng)    
    print('-'*50)

In [None]:

# downsample/stack data into one array for train-validation
data_type="stage1_train"
prefix="stage1_traintest"

# target size; get_data reads these notebook-level variables
h,w=512,660
path2traintest=path2data+prefix+'_'+str(h)+'by'+str(w)+'.hdf5'
if not os.path.exists(path2traintest):
    print ('wait ...')
    X,y,ids=get_data(data_type)
    # the context manager closes the file even if a write fails
    with h5py.File(path2traintest,'w') as ff_traintest:
        ff_traintest['X']=X
        ff_traintest['y']=y
        # fixed-width byte strings, a dtype that can be stored on Python 2 and 3
        ff_traintest['ids']=np.array(ids,dtype='S')
    print ('hdf5 saved!')
else:
    print(path2traintest+ " exists!")
    # load train-test data
    # the file stays open on purpose: X is a handle into it, not a copy
    ff_traintest=h5py.File(path2traintest,'r')
    ids=ff_traintest['ids'][()]
    X=ff_traintest['X']
    print ('X shape', X.shape)    
    print('-'*50)
    
# we also want rotated data
rotationAngles=[180]
for rotAng in rotationAngles:    
    rotateData(prefix,(h,w),rotAng)    
    print('-'*50)

### stage1 leaderboard 

In [None]:
# downsample/stack data into one array for train-validation
data_type="stage1_leader"
prefix="stage1_leader"

# target size; get_data reads these notebook-level variables
h,w=256,330
path2traintest=path2data+prefix+'_'+str(h)+'by'+str(w)+'.hdf5'
if not os.path.exists(path2traintest):
    print ('wait ...')
    X,y,ids=get_data(data_type)
    # the context manager closes the file even if a write fails
    with h5py.File(path2traintest,'w') as ff_traintest:
        ff_traintest['X']=X
        ff_traintest['y']=y
        # fixed-width byte strings, a dtype that can be stored on Python 2 and 3
        ff_traintest['ids']=np.array(ids,dtype='S')
    print ('hdf5 saved!')
else:
    print(path2traintest+ " exists!")
    # load train-test data
    # the file stays open on purpose: X is a handle into it, not a copy
    ff_traintest=h5py.File(path2traintest,'r')
    ids=ff_traintest['ids'][()]
    X=ff_traintest['X']
    print ('X shape', X.shape)    
    print('-'*50)    
    
# we also want rotated data
rotationAngles=[90,180,270]
for rotAng in rotationAngles:    
    rotateData(prefix,(h,w),rotAng)    
    print('-'*50)    

In [None]:
# downsample/stack data into one array for train-validation
data_type="stage1_leader"
prefix="stage1_leader"

# target size; get_data reads these notebook-level variables
h,w=512,660
path2traintest=path2data+prefix+'_'+str(h)+'by'+str(w)+'.hdf5'
if not os.path.exists(path2traintest):
    print ('wait ...')
    X,y,ids=get_data(data_type)
    # the context manager closes the file even if a write fails
    with h5py.File(path2traintest,'w') as ff_traintest:
        ff_traintest['X']=X
        ff_traintest['y']=y
        # fixed-width byte strings, a dtype that can be stored on Python 2 and 3
        ff_traintest['ids']=np.array(ids,dtype='S')
    print ('hdf5 saved!')
else:
    print(path2traintest+ " exists!")
    # load train-test data
    # the file stays open on purpose: X is a handle into it, not a copy
    ff_traintest=h5py.File(path2traintest,'r')
    ids=ff_traintest['ids'][()]
    X=ff_traintest['X']
    print ('X shape', X.shape)    
    print('-'*50)    
    
# we also want rotated data
rotationAngles=[180]
for rotAng in rotationAngles:    
    rotateData(prefix,(h,w),rotAng)    
    print('-'*50)    

### stage2 leaderboard 

In [None]:
# downsample/stack data into one array for train-validation
data_type="stage2_leader"
prefix="stage2_leader"

# target size; get_data reads these notebook-level variables
h,w=256,330
path2traintest=path2data+prefix+'_'+str(h)+'by'+str(w)+'.hdf5'
if not os.path.exists(path2traintest):
    print ('wait ...')
    X,y,ids=get_data(data_type)
    # the context manager closes the file even if a write fails
    with h5py.File(path2traintest,'w') as ff_traintest:
        ff_traintest['X']=X
        ff_traintest['y']=y
        # fixed-width byte strings, a dtype that can be stored on Python 2 and 3
        ff_traintest['ids']=np.array(ids,dtype='S')
    print ('hdf5 saved!')
else:
    print(path2traintest+ " exists!")
    # load train-test data
    # the file stays open on purpose: X is a handle into it, not a copy
    ff_traintest=h5py.File(path2traintest,'r')
    ids=ff_traintest['ids'][()]
    X=ff_traintest['X']
    print ('X shape', X.shape)    
    print('-'*50)        
    
# we also want rotated data
rotationAngles=[90,180,270]
for rotAng in rotationAngles:    
    rotateData(prefix,(h,w),rotAng)    
    print('-'*50)        

In [None]:

# downsample/stack data into one array for train-validation
data_type="stage2_leader"
prefix="stage2_leader"

# target size; get_data reads these notebook-level variables
h,w=512,660
path2traintest=path2data+prefix+'_'+str(h)+'by'+str(w)+'.hdf5'

if not os.path.exists(path2traintest):
    print ('wait ...')
    X,y,ids=get_data(data_type)
    # the context manager closes the file even if a write fails
    with h5py.File(path2traintest,'w') as ff_traintest:
        ff_traintest['X']=X
        ff_traintest['y']=y
        # fixed-width byte strings, a dtype that can be stored on Python 2 and 3
        ff_traintest['ids']=np.array(ids,dtype='S')
    print ('hdf5 saved!')
else:
    print(path2traintest+ " exists!")
    # load train-test data
    # the file stays open on purpose: X is a handle into it, not a copy
    ff_traintest=h5py.File(path2traintest,'r')
    ids=ff_traintest['ids'][()]
    X=ff_traintest['X']
    print ('X shape', X.shape)    
    print('-'*50)        
    
# we also want rotated data
rotationAngles=[180]
for rotAng in rotationAngles:    
    rotateData(prefix,(h,w),rotAng)    
    print('-'*50)        