# <center>Model Code for the </center>
# <center>Department of Homeland Security </center>
# <center>Passenger Screening Algorithm Challenge.</center>

# General imports and initializations

In [None]:
import math
import os
import pdb
import numpy as np
import scipy as sp
import matplotlib
import matplotlib.pyplot as plt
import datetime
import csv

from scipy import ndimage
from scipy.signal import medfilt
from scipy.signal import savgol_filter
from scipy.signal import resample
from scipy.ndimage.filters import gaussian_filter
from scipy.ndimage.filters import median_filter
from scipy.ndimage.morphology import binary_fill_holes
from scipy.ndimage.morphology import binary_dilation
from scipy.misc import imsave
from scipy.ndimage import imread
from copy import deepcopy
from scipy import linalg
from scipy.interpolate import interp1d
from skimage.transform import resize
from sklearn.preprocessing import binarize

In [None]:
!pip3 install Cython
%load_ext cython

In [None]:
%matplotlib inline

In [None]:
# Apply matplotlib's 'classic' style sheet to every figure drawn below
plt.style.use('classic')

In [None]:
# Print a timestamp marking the start of the run
print(str(datetime.datetime.now()) + "    Started")

# Project-specific imports and initializations

In [None]:
# Get project functions
from CompetitionFileIOFunctions import initRootFolders, initLog, log, filenames, filePath, loadFile

In [None]:
# Init root folders
# inCloud selects which of the two initRootFolders configurations below is used.
inCloud = True
if inCloud:
    # Working in the cloud.  (This was used only for embedding scans in 2D.)
    # Will read scans from the bucket and save results in paths relative to notebook:
    initRootFolders(bucketName='kaggle_passenger_screening123407', localIOPath='')
else:
    # Working on a desktop
    # Won't bother with the bucket, but will set a constant local IO path to sidestep versioning.
    initRootFolders(
        bucketName='', 
        localIOPath='/media/qwerty/science/science data/2017-10-18 Kaggle passenger screening/'
    )

# Name the input/output folders
# cloud/ and local/ refer to locations defined in bucketName and localIOPath (above)
# Stage 1: raw scans, 2D-embedded body images, and images with threat zones outlined
scanDir1 = "cloud/stage1_a3d/"
embeddedDir1 = "local/embedded2D/stage1/"
highlightDir1 = "local/highlight/stage1/"

# Stage 2: raw scans and 2D-embedded body images
scanDir2 = "cloud/stage2_a3d/"
embeddedDir2 = "local/embedded2D/stage2/"

# Folder passed to initLog
logDir = "local/log/"

In [None]:
# Initialize log file in logDir
# NOTE(review): 'embed' is presumably a tag/name for the log file -- confirm in CompetitionFileIOFunctions
initLog(logDir, 'embed')

In [None]:
# Threshold for finding body region in 3D
# NOTE(review): not referenced by any other cell visible in this notebook
threshold3D = .0002

# Read file data

In [None]:
# List the stage 1 embedded-image folder and keep only the PNG files
inputFiles = [name for name in filenames(embeddedDir1) if name.endswith('.png')]

In [None]:
# Load the first 100 body images into a single float32 array
bodyImages = np.array([loadFile(f) for f in inputFiles[:100]], dtype=np.float32)

# Preview one of the loaded images
fig, ax = plt.subplots(1, figsize=(15,15))
ax.imshow(bodyImages[3], cmap = 'viridis', interpolation = 'nearest')

# Define rectangular image regions for body zones

In [None]:
###################################################
# 8 hyperparameters that help define body zones
# These should be optimized
###################################################

# Leg image horizontal separations
# (columns splitting each leg strip into foot | calf | knee | thigh)
sock, knee, shorts = 60, 120, 187

# Trunk image horizontal separations
# (columns splitting the trunk strip into groin | abdomen | chest/upper back)
waist, chest = 75, 200

# Trunk image vertical radii (half-heights of the groin and chest rectangles)
trunkGroinRadius, legGroinRadius, chestRadius = 22, 19, 54


###################################################
# Define body zones
###################################################

# Heights and widths of body segments in combined image
trunkH, legH = 360, 180
bicepH, bicepW = 90, 128
forearmH = 70

# Helper variables
frontCenter = trunkH/2
backCenter0, backCenter1 = int(trunkH/6), int(trunkH*5/6)
legShadow = 120

# zonesDef maps a zone number (1..17) to a list of rectangles, each written as
# ((xStart, yStart), (xEnd, yEnd)).  The first coordinate runs down the stacked
# segments (trunk, right leg, left leg, arms); the second runs across, up to 256.
zonesDef = {}

# Arms
zonesDef[1] = [  # Right Bicep
    ((trunkH + 2*legH, 0), (trunkH + 2*legH + bicepH, bicepW)),
]
zonesDef[2] = [  # Right Forearm
    ((trunkH + 2*legH, bicepW), (trunkH + 2*legH + forearmH, 256)),
]
zonesDef[3] = [  # Left Bicep
    ((trunkH + 2*legH + bicepH, 0), (trunkH + 2*legH + 2*bicepH, bicepW)),
]
zonesDef[4] = [  # Left Forearm
    ((trunkH + 2*legH + bicepH, bicepW), (trunkH + 2*legH + bicepH + forearmH, 256)),
]

# Trunk
zonesDef[5] = [  # Chest
    ((frontCenter - chestRadius, chest), (frontCenter + chestRadius, 256)),
]
zonesDef[6] = [  # Right abdomen
    ((backCenter0, waist), (frontCenter, chest)),
]
zonesDef[7] = [  # Left abdomen
    ((frontCenter, waist), (backCenter1, chest)),
]

# Thighs and groin
zonesDef[8] = [  # Right thigh
    ((trunkH + legGroinRadius, shorts), (trunkH + legShadow - legGroinRadius, 256)),
]
zonesDef[9] = [  # Groin (4 regions)
    ((frontCenter - trunkGroinRadius, 0), (frontCenter + trunkGroinRadius, waist)),
    ((backCenter0 - trunkGroinRadius, 0), (backCenter0 + trunkGroinRadius, waist)),
    ((trunkH + legShadow - legGroinRadius, shorts-10), (trunkH + legShadow + legGroinRadius, 256)),
    ((trunkH + legH + legShadow - legGroinRadius, shorts-10), (trunkH + legH + legShadow + legGroinRadius, 256)),
]
zonesDef[10] = [  # Left thigh
    ((trunkH + legH + legGroinRadius, shorts), (trunkH + legH + legShadow - legGroinRadius, 256)),
]

# Lower legs: the right leg strip starts at trunkH, the left at trunkH + legH
zonesDef[11] = [((trunkH, knee), (trunkH + legH, shorts))]           # Right knee
zonesDef[12] = [((trunkH + legH, knee), (trunkH + 2*legH, shorts))]  # Left knee
zonesDef[13] = [((trunkH, sock), (trunkH + legH, knee))]             # Right calf
zonesDef[14] = [((trunkH + legH, sock), (trunkH + 2*legH, knee))]    # Left calf
zonesDef[15] = [((trunkH, 0), (trunkH + legH, sock))]                # Right foot
zonesDef[16] = [((trunkH + legH, 0), (trunkH + 2*legH, sock))]       # Left foot

zonesDef[17] = [  # Upper back
    ((backCenter0 - chestRadius, chest), (backCenter0 + chestRadius, 256)),
]

# Get labelled data

In [None]:
def getLabels(file):
    """
    Reads a *_labels.csv file for this competition.

    The first row is treated as a header and skipped.  Every other row is
    expected to look like ``<scan>_Zone<n>,<probability>``.  Blank rows
    (for example a trailing empty line) are ignored.

    Parameters:
        file: Path of the CSV file to read.

    Returns:
        A list of dictionaries, one per data row, with the keys
            'Id': the first column, unchanged
            'Scan': the part of the Id before '_Zone'
            'Zone': the number after '_Zone', as an int
            'Probability': the second column, rounded to the nearest int

    Raises:
        ValueError: if a zone number or probability cannot be parsed.
    """

    # newline='' is the mode the csv module documents for files handed to csv.reader
    with open(file, newline='') as csvfile:
        rows = list(csv.reader(csvfile, delimiter=','))

    labels = []

    # rows[0] is the header line
    for row in rows[1:]:
        # csv.reader yields [] for an empty line; there is nothing to parse in it
        if not row:
            continue

        # Split on the marker instead of fixed offsets so the scan id length is not assumed
        scan, _, zone = row[0].rpartition('_Zone')

        labels.append({
            'Id': row[0],
            'Scan': scan,
            'Zone': int(zone),
            'Probability': int(round(float(row[1])))
        })

    return labels

In [None]:
def writeLabels(file, labels):
    """
    Writes a labels file for this competition (use it to submit predictions).

    The output starts with the header line 'Id,Probability', followed by one
    '<Id>,<Probability>' line per entry of labels.

    Parameters:
        file: Path of the file to write; it is opened in 'w' mode.
        labels: List of dictionaries with 'Id' (string) and 'Probability' fields.
    """

    # Format every row before the file is opened for writing
    rows = []
    for entry in labels:
        rows.append(entry['Id'] + ',' + str(entry['Probability']) + '\n')

    with open(file, 'w') as handle:
        handle.write('Id,Probability\n' + ''.join(rows))

In [None]:
def selectZone(labels, zone, probability):
    """
    Given a list of labels, 
    selects the rows corresponding to a given body zone, 
    with the requested probability of containing contraband.
    probability: 
        0: no contraband
        1: with contraband
        -1: anything

    Returns a new list; the selected dictionaries themselves are not copied.
    Rows without a 'Probability' field are accepted when probability is -1.
    """

    # Test the wildcard first: with -1 the row's 'Probability' must not be
    # read, otherwise unlabeled rows would raise KeyError.
    acceptAny = probability == -1

    return [
        x for x in labels
        if x['Zone'] == zone and (acceptAny or x['Probability'] == probability)
    ]

In [None]:
# Read the stage 1 labels file into a list of per-row dictionaries
labels = getLabels(filePath('local/stage1_labels.csv'))

# Save folder of body images with contraband highlighted

Takes about 4 minutes

In [None]:
def highlightZone(image, zone, zonesDef):
    """
    Draws a rectangle outline around each region of the requested zone.

    Parameters:
        image: Body image array; the first two axes are indexed as [x, y].
            NOTE(review): assumes pixel values lie in [0, 1], since the
            result is clipped to that range -- confirm against loadFile.
        zone: Key into zonesDef.
        zonesDef: Dictionary mapping zone to a list of rectangles
            ((xStart, yStart), (xEnd, yEnd)).

    Returns:
        A copy of image with the outlines drawn; the input is not modified.
    """

    out = image.copy()
    nx, ny = image.shape[:2]

    for rect in zonesDef[zone]:
        # Get rectangle top left (start) and bottom right (end) as integers
        (xStart, yStart), (xEnd, yEnd) = np.array(rect, dtype=np.int32)

        # Clip to valid image coordinates
        xStart = int(np.clip(xStart, 0, nx-1))
        xEnd = int(np.clip(xEnd, 0, nx-1))
        yStart = int(np.clip(yStart, 0, ny-1))
        yEnd = int(np.clip(yEnd, 0, ny-1))

        # Draw the four edges.  Slice ends are exclusive, so use end+1 to
        # include the end row/column; otherwise the bottom-right corner is
        # never drawn.  Adding 10 saturates the pixel once clipped below.
        out[xStart:xEnd+1, yStart] += 10
        out[xStart:xEnd+1, yEnd] += 10
        out[xStart, yStart:yEnd+1] += 10
        out[xEnd, yStart:yEnd+1] += 10

    return np.clip(out, 0, 1)

In [None]:
def highlightZones(file, zonesDef, labels):
    """
    Loads the given body image file and outlines every zone labelled with contraband.

    The scan id is taken from the file name: the text between the last '/'
    and the extension.  Rows of labels whose 'Scan' equals that id and whose
    'Probability' is 1 are highlighted with highlightZone.
    """

    # File name without folder, then the part just before the extension
    scan = file.split('/')[-1].split('.')[-2]

    # Zones of this scan that are labelled as containing contraband
    threatZones = [
        row['Zone']
        for row in labels
        if row['Scan'] == scan and row['Probability'] == 1
    ]

    img = loadFile(file)

    for z in threatZones:
        img = highlightZone(img, z, zonesDef)

    return img

In [None]:
def saveImage(image, outputDir, inputFile):
    """
    Saves a body image as <outputDir><name>.png, where <name> is the
    input file's name without its folder and extension.
    """
    stem = inputFile.split('/')[-1].split('.')[-2]
    target = filePath(outputDir + stem + '.png')
    imsave(target, image)

In [None]:
# Save a copy of every body image with its contraband zones outlined.
# highlightZones loads the file itself, so the image is not loaded separately
# here (doing so read every file twice).
for f in inputFiles:
    img = highlightZones(f, zonesDef, labels)
    saveImage(img, highlightDir1, f)

# Extract a zone's image

In [None]:
def extractZone(bodyImg, zone, zonesDef):
    """
    Extracts rectangular image patches for the requested zone.
    The sensitive area is made of multiple patches.

    Each rectangle of zonesDef[zone] is cut out of bodyImg (a 3-axis array),
    every patch is resized to the width of the widest one, and the patches
    are stacked along the first axis.
    """

    nx, ny, _ = bodyImg.shape

    patches = []

    for rect in zonesDef[zone]:
        # Rectangle corners: top left (0) and bottom right (1), truncated to integers
        (x0, y0), (x1, y1) = np.array(rect, dtype=np.int32)

        # Start indices must point at an existing pixel; end indices are
        # exclusive slice bounds and may equal the image size
        x0, x1 = np.clip(x0, 0, nx-1), np.clip(x1, 0, nx)
        y0, y1 = np.clip(y0, 0, ny-1), np.clip(y1, 0, ny)

        patches.append(bodyImg[x0:x1, y0:y1])

    # Bring every patch to a common width so they can be concatenated
    width = max(p.shape[1] for p in patches)

    return np.concatenate([resize(p, (len(p), width)) for p in patches])

In [None]:
# Sanity check: extract zone 15 from the first loaded body image and display it
img = bodyImages[0]
x = extractZone(img, 15, zonesDef)

fig, ax = plt.subplots(1, figsize=(5,5))
ax.imshow(x)

# Get zone images for entire dataset

In [None]:
def getZoneImages(folder, scanIDs, zonesDef):

    """
    Function to get zone images.

    Parameters:
        folder: Folder prefix; each scan is loaded from folder + <scan> + '.png'.
        scanIDs: List of dictionaries, each containing a 'Scan' field.
            Scans don't need to be unique: the list holds either one row per
            scan, or one row per scan and zone (then each row also needs a
            'Zone' field).
        zonesDef: Dictionary mapping zone number to its list of rectangles.

    Returns:
        The labels list with a 'ZoneImage' entry added to every row.
        When scanIDs already has one row per zone, the result is a shallow
        copy of it, so the caller's dictionaries receive 'ZoneImage' as well.
        If the number of rows fits neither layout, an error is printed and
        0 is returned.
    """

    # Get dictionary of scan images
    uniqueScans = {x['Scan'] for x in scanIDs}
    bodyImages = {s: loadFile(folder + s + '.png') for s in uniqueScans}

    # Init empty output if necessary
    zonesAlreadyEnumerated = len(scanIDs) == len(zonesDef)*len(bodyImages)
    zonesNotEnumerated = len(scanIDs) == len(bodyImages)
    if zonesAlreadyEnumerated:
        out = scanIDs.copy()
    elif zonesNotEnumerated:
        # Enumerate the zones actually defined in zonesDef, so this branch
        # agrees with the len(zonesDef) check above
        out = [
            {
                'Id': s + '_Zone' + str(z),
                'Scan': s,
                'Zone': z
            }
            for s in bodyImages
            for z in zonesDef
        ]
    else:
        print('Error in getZoneImages: incorrect number of scanIDs.  Scans must either be unique or duplicated 17 times')
        return 0

    # Extract image zones
    for d in out:
        d['ZoneImage'] = extractZone(bodyImages[d['Scan']], d['Zone'], zonesDef)

    return out

In [None]:
# Attach a 'ZoneImage' patch to every stage 1 label row
labels = getZoneImages(embeddedDir1, labels, zonesDef)

In [None]:
# Inspect one label row: print its zone and label, then show its zone image
l = labels[6]
print('Zone: ', l['Zone'], '\nProbability:', l['Probability'])
fig, ax = plt.subplots(1, figsize=(5,5))
ax.imshow(l['ZoneImage'])

# Characterize normal variation with SVD, subtract as background

In [None]:
class imageDimensionalityReducer():
    """
    Performs PCA on a list of monochromatic images.
    The reduce method projects images onto the basis of the most significant principal components.
    The list of images should have dimensions (n,height,width)

    Initialization variables:
        numSV: Number of principal components to keep.
    """
    
    def __init__(self, numSV=15):
        self.numSV = numSV
    
    def fit(self, images):
        """
        Estimates the numSV most significant eigenimages of images and
        stores them, flattened and normalized, in self.rightSV.
        Returns self.  The input array is not modified.
        """

        # Get a copy
        imgs = images.copy()
        
        # Zero-center the images
        imgs -= 0.5

        # Get list of flattened full-size images
        matFull = np.reshape(imgs, (imgs.shape[0], -1))

        # Get list of flattened small images
        # This will speed up calculation of the correlation matrix
        mat = np.array([resize(x, (30,30)) for x in imgs])
        mat = np.reshape(mat, (mat.shape[0], -1))

        # Get correlation matrix (n x n, symmetric)
        cor = np.dot(mat, np.transpose(mat))

        # Get eigenbasis for column space of mat.
        # cor is symmetric, so use eigh: it returns real eigenpairs, whereas
        # the general eig solver gives no guarantee about eigenvalue order.
        evals, evecs = linalg.eigh(cor)

        # Get most significant eigenvectors: sort by eigenvalue, largest first
        order = np.argsort(evals)[::-1][:self.numSV]
        evecs = evecs[:, order]

        # Approximate the right singular vectors for matFull
        # (evecs.rightSV ~= matFull), and then normalize rightSV
        rightSV = np.dot(linalg.pinv(evecs), matFull)
        self.rightSV = np.array([x/linalg.norm(x) for x in rightSV])

        return self

    def reduce(self, images):
        """
        Returns the part of images that lies in the span of the fitted
        eigenimages, with the same shape as images.
        The input array is not modified.
        """
        
        # Get a copy
        imgs = images.copy()
        
        # Zero-center the images
        imgs -= 0.5
        
        # Get eigenimage basis expansion coefficients, then expand back to pixels
        flat = np.reshape(imgs, (len(imgs), -1))
        imgsLow = flat.dot(np.transpose(self.rightSV)).dot(self.rightSV)
        imgsLow = np.reshape(imgsLow, imgs.shape)
        
        # Undo the zero-centering
        return imgsLow + 0.5

In [None]:
class imageBackgroundSubtractor():
    """
    Uses PCA to estimate a background for a list of images (using imageDimensionalityReducer).
    The subtract method subtracts the estimated background from the list of images.
    The list of images should have dimensions (n,height,width,3)
    Initialization variables:
        channelMask: Binary list of length three.  Selects channels to receive background subtractions.
        channelRescale:  Adjust saturation.  1 -> no adjustment
        numSV: Number of singular values to use in background subtraction
    """

    def __init__(self, channelMask, channelRescale, numSV=15):
        self.channelMask = channelMask
        self.channelRescale = channelRescale
        self.numSV = numSV

    def fit(self, images):
        """
        Fits one imageDimensionalityReducer per masked channel and stores
        the result in self.estimators.  Returns self.
        """

        # Start from a copy of the mask; unmasked channels keep their mask value
        perChannel = self.channelMask.copy()

        for ch, enabled in enumerate(self.channelMask):
            if not enabled:
                continue
            reducer = imageDimensionalityReducer(self.numSV)
            reducer.fit(images[:,:,:,ch])
            perChannel[ch] = reducer

        # Save results
        self.estimators = perChannel

        return self

    def subtract(self, images):
        """
        Returns images with the estimated background removed, each channel
        rescaled around 0.5 by channelRescale, clipped to [0, 1].
        """

        # Background estimate: PCA reconstruction for masked channels,
        # constant 0.5 (i.e. nothing to subtract) for the others
        background = images.copy()
        for ch, reducer in enumerate(self.estimators):
            if self.channelMask[ch]:
                background[:,:,:,ch] = reducer.reduce(images[:,:,:,ch])
            else:
                background[:,:,:,ch] = 0.5

        result = images - background + 0.5

        # Stretch each channel around mid-grey
        for ch, gain in enumerate(self.channelRescale):
            result[:,:,:,ch] = (result[:,:,:,ch] - 0.5) * gain + 0.5

        return np.clip(result, 0, 1)

In [None]:
# Zone used for the background-subtraction experiment in the next cells
zone = 15

# Get images of zones without threats (at most 1000)
imgsEmpty = np.array([x['ZoneImage'] for x in selectZone(labels, zone, 0)[:1000]])

# Get images of zones with a threat (at most 1000)
imgsThreat = np.array([x['ZoneImage'] for x in selectZone(labels, zone, 1)[:1000]])

In [None]:
# Fit background subtractor on the threat-free images only:
# all three channels masked in, rescale 1.5 per channel, 4 components
bgs = imageBackgroundSubtractor([1,1,1], [1.5,1.5,1.5], numSV=4)
bgs.fit(imgsEmpty)

# Subtract background from both sets using the same fitted subtractor
nobgEmpty = bgs.subtract(imgsEmpty)
nobgThreat= bgs.subtract(imgsThreat)

In [None]:
def clampShadow(bodyImages, bodyNoBG, scale):
    """
    Pulls channels 0 and 1 towards grey (0.5) where the blue channel was
    reduced by background subtraction.

    In shadow regions, the surface isn't captured, so surface radius and intensity tend to be junk.
    The blue channel is surface thickness, and tends to be high in shadow regions.
    Wherever blue is explainable background (i.e. it got subtracted), it's probably a shadow region,
    so the other colors are clamped to grey there.

    Parameters:
        bodyImages: Original images, indexed [n, x, y, channel].
        bodyNoBG: The same images after background subtraction.
        scale: Divides the blue difference before exponentiation.

    Returns:
        A modified copy of bodyNoBG, clipped to [0, 1].
    """

    # Negative where background subtraction lowered the blue channel
    blueChange = bodyNoBG[:,:,:,2] - bodyImages[:,:,:,2]

    # Weight in [0, 1]: 1 where blue did not drop, decaying as it drops
    weight = np.clip(np.exp(blueChange/scale), 0, 1)

    out = bodyNoBG.copy()
    for ch in (0, 1):
        out[:,:,:,ch] = (out[:,:,:,ch] - 0.5) * weight * 2.5 + 0.5

    return np.clip(out, 0, 1)

In [None]:
# Clamp shadow regions in both background-subtracted sets (scale 0.2)
clampedEmpty =  clampShadow(imgsEmpty, nobgEmpty, 0.2)
clampedThreat =  clampShadow(imgsThreat, nobgThreat, 0.2)

In [None]:
# Background subtraction works well

# Show the first six threat images at each processing stage, one stage per column
fig, ax = plt.subplots(6,3, figsize=(13,9), facecolor='white')

ax[0,0].set_title('1. Original surface moments', size='large')
ax[0,1].set_title('2. Background subtracted', size='large')
ax[0,2].set_title('3. Shadows clamped', size='large')

for i in range(6):
  
    # Each image is displayed with its first two axes swapped and then flipped vertically
    ax[i,0].imshow(np.transpose(imgsThreat[i], axes=(1,0,2))[::-1])
    ax[i,0].set_yticklabels([])
    ax[i,0].set_xticklabels([])
    
    ax[i,1].imshow(np.transpose(nobgThreat[i], axes=(1,0,2))[::-1])
    ax[i,1].set_yticklabels([])
    ax[i,1].set_xticklabels([])
    
    ax[i,2].imshow(np.transpose(clampedThreat[i], axes=(1,0,2))[::-1])
    ax[i,2].set_yticklabels([])
    ax[i,2].set_xticklabels([])

# Subtract background for all zone images

In [None]:
class zoneBackgroundSubtractor():
    """
    Fits and applies one imageBackgroundSubtractor per body zone.
    Works directly on label datasets (lists of dictionaries with 'Zone',
    'Probability' and 'ZoneImage' fields).
    """

    def __init__(self):
        self.numZones = 17

    def fit(self, dataset):
        """
        Fits a background subtractor for every zone, using only the rows of
        dataset labelled as free of contraband.  Returns self.
        """

        self.subtractors = [[] for x in range(self.numZones)]

        for idx in range(self.numZones):
            z = idx + 1
            print('Fitting background subtractor for zone: ', z)

            # Zone images without contraband are the background model's training set
            cleanImages = np.array([d['ZoneImage'] for d in selectZone(dataset, z, 0)])

            fitted = imageBackgroundSubtractor([1,1,1], [1.5,1.5,1.5], numSV=4)
            fitted.fit(cleanImages)

            # Zone z is stored at index z-1
            self.subtractors[idx] = fitted

        return self

    def subtract(self, dataset):
        """
        Adds a 'ZoneImageBGSubtracted' entry (background removed, shadows
        clamped) to every row of dataset and returns the rows grouped by
        zone, in zone order.  The row dictionaries are modified in place.
        """

        out = []

        for idx in range(self.numZones):
            z = idx + 1
            print('Subtracting background in zone: ', z)

            # All rows of this zone, whatever their label
            rows = selectZone(dataset, z, -1)
            original = np.array([d['ZoneImage'] for d in rows])

            # Remove the background, then clamp to grey in shadow regions
            cleaned = clampShadow(original, self.subtractors[idx].subtract(original), 0.2)

            for row, image in zip(rows, cleaned):
                row['ZoneImageBGSubtracted'] = image

            out.extend(rows)

        return out

In [None]:
# Fit the per-zone background subtractors on the stage 1 labels,
# then add 'ZoneImageBGSubtracted' to every row
bgSubtractor = zoneBackgroundSubtractor()
bgSubtractor.fit(labels)
labels = bgSubtractor.subtract(labels)

In [None]:
# Collect up to 100 background-subtracted zone 15 images with and without a threat
zone = 15
imgsThreat = np.array([x['ZoneImageBGSubtracted'] for x in selectZone(labels, zone, 1)[:100]])
imgsEmpty = np.array([x['ZoneImageBGSubtracted'] for x in selectZone(labels, zone, 0)[:100]])

In [None]:
# Show six background-subtracted images per class side by side
fig, ax = plt.subplots(6,2, figsize=(7,9), facecolor='white')

ax[0,0].set_title('No contraband', size='large')
ax[0,1].set_title('With contraband', size='large')

for i in range(6):
    
    # Each image is displayed with its first two axes swapped and then flipped vertically
    ax[i,0].imshow(np.transpose(imgsEmpty[i], axes=(1,0,2))[::-1])
    ax[i,0].set_yticklabels([])
    ax[i,0].set_xticklabels([])
    
    ax[i,1].imshow(np.transpose(imgsThreat[i], axes=(1,0,2))[::-1])
    ax[i,1].set_yticklabels([])
    ax[i,1].set_xticklabels([])

# Transfer learning: get bottleneck features

Takes about 20 minutes. Your machine should have at least 8 cores and 48GB RAM.

In [None]:
!pip3 install Keras
!pip3 install H5py

import keras
from keras.applications.inception_v3 import InceptionV3, preprocess_input

In [None]:
def bottleneck(dataset):
    """
    Computes InceptionV3 bottleneck features for every row of dataset.

    Each row's 'ZoneImageBGSubtracted' image is resized to 139x139x3 and run
    through InceptionV3 (ImageNet weights, no top layers).  The output is
    averaged over its two spatial axes and stored in the row as 'Bottleneck'.

    Parameters:
        dataset: List of dictionaries with a 'ZoneImageBGSubtracted' field.

    Returns:
        The same list; its dictionaries are modified in place.
    """

    size = (139,139,3)
    print(str(datetime.datetime.now()) + "    Started bottleneck layer calculation")

    print('Loading model')
    model = InceptionV3(weights='imagenet', include_top=False, input_shape=size)

    print('Resizing images')
    imgs = np.array([resize(x['ZoneImageBGSubtracted'], size).astype(np.float32) for x in dataset])

    print('Preprocessing images')
    # NOTE(review): the zone images appear to be in [0, 1]; confirm that this
    # is the input range preprocess_input expects.
    preprocessed = preprocess_input(imgs)

    print('Extracting bottleneck features')
    # The batch axis is kept as is: squeezing would drop it for a one-row
    # dataset, and the pooling below would then average the wrong axes.
    features = model.predict(preprocessed)

    print('Pooling')
    # Average over the two spatial axes, leaving one feature vector per row
    features = np.mean(features, axis=(1,2))

    print('Storing bottleneck features')
    for row, vector in zip(dataset, features):
        row['Bottleneck'] = vector

    print(str(datetime.datetime.now()) + "    Finished bottleneck layer calculation")

    return dataset

In [None]:
# Add a 'Bottleneck' feature vector to every stage 1 label row
labels = bottleneck(labels)

# Classify

In [None]:
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA

In [None]:
def logLossScoreWithEstimator(estimator, x, y):
    """
    Log loss of estimator on (x, y), with predictions clipped to [0.07, 0.93].

    The predictions are column 1 of estimator.predict_proba(x).
    """
    return logLossScoreSimple(estimator.predict_proba(x)[:,1], y, delta=.07)

def logLossScoreSimple(pred, y, delta=0.07):
    """
    Mean binary log loss of predictions pred against 0/1 labels y.

    Predictions are converted to float32 and clipped to [delta, 1-delta]
    before the logarithm is taken.
    """

    clipped = np.clip(np.array(pred, dtype=np.float32), delta, 1-delta)

    # Log-likelihood contributions of the positive and the negative labels
    positives = np.dot(y, np.log(clipped))
    negatives = np.dot(1-np.array(y), np.log(1-clipped))

    return -(positives + negatives)/len(y)

In [None]:
class threatPredicter():
    """
    Binary classifier which can be applied directly to label datasets.

    One LogisticRegression model is fitted per body zone (1..numZones) on
    the 'Bottleneck' feature vectors of that zone's rows.
    """
    
    def __init__(self):
        self.numZones = 17

    def fit(self, dataset):
        """
        Fits one classifier per zone.

        Parameters:
            dataset: List of dictionaries with 'Zone', 'Bottleneck' and
                'Probability' fields.

        Returns:
            self
        """

        self.pipelines = [[] for x in range(self.numZones)]

        for z in range(1, self.numZones+1):
            print('Fitting zone ', z)

            # Get data for the zone being fitted
            zoneData = selectZone(dataset, z, -1)
            
            # Get x (bottleneck features)
            x = np.array([d['Bottleneck'] for d in zoneData])
            
            # Get y (contraband probabilities)
            y = [d['Probability'] for d in zoneData]

            # Fit regressor
            regressor = LogisticRegression()
            regressor.fit(x, y)

            # Save pipeline (zone z is stored at index z-1)
            self.pipelines[z-1] = regressor

        return self

    def predict(self, dataset):
        """
        Predicts the contraband probability of every row of dataset.

        Parameters:
            dataset: List of dictionaries with 'Zone' and 'Bottleneck' fields.

        Returns:
            A new list of dictionaries, grouped by zone in zone order, whose
            'Probability' holds the predicted probability of class 1.
            The dictionaries are shallow copies, so dataset keeps its own
            'Probability' values.
        """

        out = []

        for z in range(1, self.numZones+1):
            print('Predicting zone ', z)

            regressor = self.pipelines[z-1]
            
            # Copy each row before writing to it: selectZone returns the
            # caller's own dictionaries, and overwriting 'Probability' there
            # would destroy the labels of a labelled dataset.
            zoneData = [dict(d) for d in selectZone(dataset, z, -1)]

            # Get bottleneck features for this zone
            x = np.array([d['Bottleneck'] for d in zoneData])

            # Predict (column 1 is the probability of the positive class)
            pred = regressor.predict_proba(x)[:,1]

            for d, p in zip(zoneData, pred):
                d['Probability'] = p

            out.extend(zoneData)

        return out

# Train on stage 1 train dataset

In [None]:
# Create the per-zone threat classifier
model = threatPredicter()

In [None]:
# Train on the stage 1 label rows
model.fit(labels)

# Get predictions for stage 1 test dataset

In [None]:
# Build the stage 1 test rows from the sample submission, then run them through
# the same steps as the training rows: zone images, background subtraction
# (with the subtractors fitted above), bottleneck features
labelsTest = getLabels(filePath('local/stage1_sample_submission.csv'))
labelsTest = getZoneImages(embeddedDir1, labelsTest, zonesDef)
labelsTest = bgSubtractor.subtract(labelsTest)
labelsTest = bottleneck(labelsTest)

In [None]:
# Predict contraband probabilities for the stage 1 test rows
pred = model.predict(labelsTest)

In [None]:
# Plot the predicted probability of every test row, in the order returned by predict
prob = [a['Probability'] for a in pred]
fig, ax = plt.subplots(1,facecolor='white')
ax.plot(prob)
ax.set_ylabel('Predicted probability')

In [None]:
# Position in the ascending probability ranking; -50 is the 50th highest prediction
i = -50

# Show the zone image at that rank and print its predicted probability
order = np.argsort(prob)
img = pred[order[i]]['ZoneImageBGSubtracted']
print(pred[order[i]]['Probability'])
fig, ax = plt.subplots(1, figsize=(5,5))
ax.imshow(img)

# Compare with solution file

In [None]:
# Read the stage 1 solution file (same format as the labels file)
labelsSolution = getLabels(filePath('local/stage1_solution.csv'))

In [None]:
def sortLabels(labels):
    """
    Returns deep copies of the label dictionaries, ordered by their 'Id' field.
    The input list and its dictionaries are left untouched.
    """
    ranking = np.argsort([entry['Id'] for entry in labels])
    return [deepcopy(labels[pos]) for pos in ranking]

In [None]:
# Sort predictions and solution by Id so the two probability lists line up row by row
p = sortLabels(pred)
p = [x['Probability'] for x in p]

s = sortLabels(labelsSolution)
s = [x['Probability'] for x in s]

# Score with predictions clipped to [0.02, 0.98]
print('Num positives in test dataset: ', np.sum(s))
print('Score: ', logLossScoreSimple(p, s, .02))

# Get predictions for stage 2 dataset

Takes about 25 minutes

In [None]:
# Build the stage 2 rows from the stage 2 sample submission
labels2 = getLabels(filePath('local/stage2_sample_submission.csv'))

In [None]:
# Attach zone images cut from the stage 2 embedded body images
labels2 = getZoneImages(embeddedDir2, labels2, zonesDef)

In [None]:
# Reuse the background subtractors fitted on stage 1, then compute bottleneck features
labels2 = bgSubtractor.subtract(labels2)
labels2 = bottleneck(labels2)

In [None]:
# Predict stage 2 probabilities with the model trained on stage 1
pred2 = model.predict(labels2)

In [None]:
# Clip slightly for safety. (Exact predictions give huge errors when they're wrong.)
# Work on a deep copy so pred2 keeps the unclipped values.
pred2b = deepcopy(pred2)
for i in range(len(pred2b)):
    pred2b[i]['Probability'] = np.clip(pred2b[i]['Probability'], .02, 1-.02)

# Plot clipped predictions
prob = [a['Probability'] for a in pred2b]
plt.plot(prob)

In [None]:
# Write the clipped stage 2 predictions as a submission file
writeLabels(filePath('local/transferLearningAvgPool.csv'), pred2b)

In [None]:
# Print a timestamp marking the end of the run
print(str(datetime.datetime.now()) + "    Finished")