In [1]:
%matplotlib inline



In [2]:
%load_ext line_profiler

In [3]:
import os
import gc
import itertools
import random
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import multiprocessing
import time
import traceback
import re

from PIL import Image
import skimage.data
import skimage.exposure
import skimage.color
import skimage.transform

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

import scipy.misc

### make training pairs parent func

In [4]:
def makePairsParent(train_info, k):

    artists = train_info.artist.unique()       
    np.random.shuffle(artists)
    
    chunkSize = int(len(artists)/k)
    
    artist_list_list = []
    
    for foldNum in range(k):
        if foldNum == k-1:
            artist_list_list.append( artists[foldNum*chunkSize : len(artists)] )
        else:
            artist_list_list.append( artists[foldNum*chunkSize : (foldNum+1)*(chunkSize) ] )
        
    # define which indices belong to each fold

    #foldLocsList = [] #list of Index objects, one for each fold

    #foldSize = int(len(train_info)/k)

    #for foldNum in range(k):
    #    if foldNum == k-1:
    #        foldLocsList.append( train_info.index[foldNum*foldSize : len(train_info)] )
    #    else:
    #        foldLocsList.append( train_info.index[foldNum*foldSize : (foldNum+1)*(foldSize) ] )

    # define which indices belong to each fold
    argsList = []

    num_cores = min( multiprocessing.cpu_count()-2, k)

    for foldNum in range(k):
        #argsList.append([train_info.loc[foldLocsList[foldNum]]])
        argsList.append( [ train_info[train_info.artist.isin(artist_list_list[foldNum])], foldNum ] )

    print('Launching %s jobs to make pairs' % k)

    startTime = time.time()

    pool = multiprocessing.Pool(num_cores)
    train_pairs_list = pool.starmap(make_pairs, argsList)
    pool.close()
    pool.join()

    endTime = time.time()

    print("making pairs complete, time taken = %.2f minutes" % ((endTime - startTime) / 60.0))
    
    return pd.concat(train_pairs_list)

## make pairs

In [5]:
def make_pairs(train_info, foldNum):
    """Creates training data from the supplied training image information file"""
    artists = train_info.artist.unique()

    n = train_info.groupby('artist').size()
    n = (2*n**2).sum() 
    t = pd.DataFrame(np.zeros((n, 4)), columns=['artist1', 'image1', 'artist2', 'image2'])
    i = 0
    j = 0
    
    for artist in artists:

        #artist info is Ax2 matrix of artist, filename
        artistInfo = train_info[train_info.artist==artist][['artist', 'filename']].values
        
        use = train_info[train_info.artist != artist ].index.values
        np.random.shuffle(use)
        
        #nm = np.min([a.shape[0]**2, train_info[train_info.artist != m].shape[0] ])
        numExamples = np.min([len(artistInfo)**2, sum(train_info.artist != artist) ])
        use = use[0:numExamples]
        
        #diffArtistInfo a Bx2 matrix of artist, filename
        diffArtistInfo = train_info[train_info.artist!=artist][['artist', 'filename']].ix[use, :].values

        
        toAdd_SameArtist = pd.DataFrame(np.concatenate([  np.repeat(artistInfo[:, 0], len(artistInfo)).reshape((-1,1)), #artist
                                            np.repeat(artistInfo[:, 1],
                                            artistInfo.shape[0]).reshape((-1,1)),
                                            np.tile(artistInfo, (len(artistInfo), 1))],
                                         axis=1),
                          columns=['artist1', 'image1', 'artist2', 'image2'])
        toAdd_SameArtist = toAdd_SameArtist.loc[0:numExamples, :]
        
        toAdd_DiffArtist = pd.DataFrame(np.concatenate([np.tile(artistInfo,
                                                  (len(artistInfo), 1))[0:len(diffArtistInfo), :],
                                          diffArtistInfo], axis=1),
                          columns=['artist1', 'image1', 'artist2', 'image2'])
        toAdd_DiffArtist = toAdd_DiffArtist.loc[0:numExamples, :]
        
        #print(j, i, a2.shape[0], b2.shape[0])
        #print(b2)
        t.iloc[i:i+len(toAdd_SameArtist), :] = toAdd_SameArtist.values
        t.iloc[i+len(toAdd_SameArtist):i+len(toAdd_SameArtist)+len(toAdd_DiffArtist), :] = toAdd_DiffArtist.values
        
        i += len(toAdd_SameArtist) + len(toAdd_DiffArtist)
        j += 1
        if j%100==0:
            print('finished %s of %s artists'%(j, len(artists)))

    print('make pairs completed')
    t = t[~t.image2.isin([np.nan, 0])]
    
    t['sameArtist'] = ( t['artist1'] == t['artist2'] )
    t['foldNum'] = foldNum
    
    return t[t.image1 > t.image2]

## Prep Image List

In [6]:
# def prepImageList(image_info, isTest):
#     """given the train_image_info or submission_info, returns a dataframe with a single column containing filenames of images"""
#     if isTest:
#         images = list(set(list(image_info.image1.unique()) + list(image_info.image2.unique())))
#         result = pd.DataFrame(np.array(images).reshape((-1, 1)), columns = ['filename'])
#     else:
#         result = pd.DataFrame(columns = ['filename'], data = image_info['filename'] )
    
#     return result

## Get Features Parent

In [6]:
def getFeaturesParent(isTest):
    """Creates features for training and test images. This function utilizes multiprocessing.
    Args:
        isTest: bool to fetch training or test data
    Returns:
        pandas DataFrame containing features
    """
    
    num_cores = multiprocessing.cpu_count() - 2
    
    argsList = []
    
    for jobNum in range(num_cores):
        argsList.append((isTest, jobNum, num_cores))
        

    print('Launching %s jobs' % (num_cores))
    startTime = time.time()
    
    pool = multiprocessing.Pool(num_cores)
    image_features_list = pool.starmap(getFeaturesWorker, argsList)
    pool.close()
    pool.join()
    
    image_features = pd.concat(image_features_list)
    
    endTime = time.time()
    
    print("collecting features complete, time taken = %.2f minutes" % ((endTime - startTime) / 60.0))
    return image_features

## Get Features Worker

In [7]:
def unit_vector(vector):
    """ Returns the unit vector of the vector.  """
    return vector / np.linalg.norm(vector)

def angle_between(v1, v2):
    """ Returns the angle in radians between vectors 'v1' and 'v2'::

            >>> angle_between((1, 0, 0), (0, 1, 0))
            1.5707963267948966
            >>> angle_between((1, 0, 0), (1, 0, 0))
            0.0
            >>> angle_between((1, 0, 0), (-1, 0, 0))
            3.141592653589793
    """
    v1_u = unit_vector(v1)
    v2_u = unit_vector(v2)
    return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0))

In [8]:
def getFeaturesWorker(isTest, jobNum, totalJobs):
    """Child function for computing image features, only to be called by getFeaturesParent
    Args:
        isTest: whether to compute features for test or training images
        jobNum: which job number this is
        totalJobs: total number of jobs
    Returns:
        pandas dataframe containing a data for a fraction of the training or test images
    """
    if isTest:
        mydir = r'/data/test_data/test'
        info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col = 0)
    else:
        mydir = r'/data/training_data/train'
        info = pd.read_csv(r'/data/training_data/train_info.csv', index_col = 0)
    
    totalNumImages = len(info)
    
    chunkSize = np.int(totalNumImages/totalJobs)
    
    if jobNum == totalJobs - 1:
        startInd = jobNum * chunkSize
        endInd = totalNumImages
    else:
        startInd = jobNum * chunkSize
        endInd = (jobNum + 1) * chunkSize
        
    info = info.iloc[startInd:endInd]
    
    info['pixelsx'] = np.nan
    info['pixelsy'] = np.nan
    info['size_bytes'] = np.nan
    
    info['r_mean'] = np.nan
    info['r_10_pct'] = np.nan
    info['r_25_pct'] = np.nan
    info['r_50_pct'] = np.nan
    info['r_75_pct'] = np.nan
    info['r_90_pct'] = np.nan
    
    info['g_mean'] = np.nan
    info['g_std'] = np.nan
    info['g_10_pct'] = np.nan
    info['g_25_pct'] = np.nan
    info['g_50_pct'] = np.nan
    info['g_75_pct'] = np.nan
    info['g_90_pct'] = np.nan
    
    info['b_mean'] = np.nan
    info['b_std'] = np.nan
    info['b_10_pct'] = np.nan
    info['b_25_pct'] = np.nan
    info['b_50_pct'] = np.nan
    info['b_75_pct'] = np.nan
    info['b_90_pct'] = np.nan
    
    info['h_mean'] = np.nan
    info['h_var'] = np.nan
    
    info['s_mean'] = np.nan
    info['s_std'] = np.nan
    info['s_10_pct'] = np.nan 
    info['s_25_pct'] = np.nan 
    info['s_50_pct'] = np.nan 
    info['s_75_pct'] = np.nan 
    info['s_90_pct'] = np.nan 
    
    info['v_mean'] = np.nan
    info['v_std'] = np.nan
    info['v_10_pct'] = np.nan 
    info['v_25_pct'] = np.nan 
    info['v_50_pct'] = np.nan
    info['v_75_pct'] = np.nan 
    info['v_90_pct'] = np.nan 
    
    info['v_10_pct'] = np.nan 
    info['v_25_pct'] = np.nan 
    info['v_50_pct'] = np.nan
    info['v_75_pct'] = np.nan 
    info['v_90_pct'] = np.nan 
   
    info['h_cx_05_pct'] = np.nan 
    info['h_cx_10_pct'] = np.nan 
    info['h_cx_25_pct'] = np.nan 
    info['h_cx_50_pct'] = np.nan
    info['h_cx_75_pct'] = np.nan 
    info['h_cx_90_pct'] = np.nan 
    info['h_cx_95_pct'] = np.nan 
    
    info['s_cx_05_pct'] = np.nan 
    info['s_cx_10_pct'] = np.nan 
    info['s_cx_25_pct'] = np.nan 
    info['s_cx_50_pct'] = np.nan
    info['s_cx_75_pct'] = np.nan 
    info['s_cx_90_pct'] = np.nan 
    info['s_cx_95_pct'] = np.nan
    
    info['v_cx_05_pct'] = np.nan 
    info['v_cx_10_pct'] = np.nan 
    info['v_cx_25_pct'] = np.nan 
    info['v_cx_50_pct'] = np.nan
    info['v_cx_75_pct'] = np.nan 
    info['v_cx_90_pct'] = np.nan 
    info['v_cx_95_pct'] = np.nan
    
    info['is_grayscale'] = np.nan
      
    print('Job %s, starting getting image info for images %s-%s' % (jobNum, startInd, endInd-1))
    startTime = time.clock()
    
    for ind, i in enumerate(info.index.values):
        try:       
            #im = Image.open(mydir+'/'+info.loc[i, 'filename'])
            #info.loc[i, 'pixelsx'], info.loc[i, 'pixelsy'] = im.size
            
            im = skimage.data.imread(mydir + '/' + info.loc[i, 'filename'])
                
            info.loc[i, 'pixelsx'] = pixelsx = im.shape[1]
            info.loc[i, 'pixelsy'] = pixelsy = im.shape[0]
            
            info.loc[i, 'is_grayscale' ] = grayscale = (len(im.shape) == 2)
            
            # get sample dimensions
            yxRatio = (pixelsy/pixelsx)

            base_size = 200
            
            if yxRatio < 1.0:
                num_y_samp = base_size + 2
                num_x_samp = np.int( num_y_samp / yxRatio )
            else:
                num_x_samp = base_size + 2
                num_y_samp = np.int( num_x_samp * yxRatio)

            #print('x and y: %i %i' %(pixelsx, pixelsy))
            #sample
            #x_space = np.round(np.linspace(0, pixelsx-1, num_x_samp)).astype(int)
            #y_space = np.round(np.linspace(0, pixelsy-1, num_y_samp)).astype(int)

            #samp_im = np.take(im, x_space, axis=1)
            #samp_im = np.take(samp_im, y_space, axis=0)
            
            #im = samp_im
            im = skimage.transform.resize(im, [num_y_samp, num_x_samp])
                
            #convert grayscale to rgb
            if grayscale:
                temp = np.zeros([num_y_samp, num_x_samp, 3])
                #temp = np.zeros([pixelsy, pixelsx, 3])
                for temp_ind in range(3):
                    temp[:,:,temp_ind] = im
                im = temp    
              
            # rgb 
            info.loc[i, 'r_mean'] = im[:,:,0].mean()
            info.loc[i, 'g_mean'] = im[:,:,1].mean()
            info.loc[i, 'b_mean'] = im[:,:,2].mean()
            
            r_pcts = np.percentile(im[:,:,0],[10, 25, 50, 75, 90])
            b_pcts = np.percentile(im[:,:,1],[10, 25, 50, 75, 90])
            g_pcts = np.percentile(im[:,:,2],[10, 25, 50, 75, 90])
            
            for temp_ind, percentile in enumerate([10, 25, 50, 75, 90]):
                info.loc[i, 'r_%.2i_pct' % percentile] = r_pcts[temp_ind]
                info.loc[i, 'g_%.2i_pct' % percentile] = g_pcts[temp_ind]
                info.loc[i, 'b_%.2i_pct' % percentile] = b_pcts[temp_ind]
            
            info.loc[i, 'r_std'] = im[:,:,0].std()
            info.loc[i, 'g_std'] = im[:,:,1].std()
            info.loc[i, 'b_std'] = im[:,:,2].std()

            #if it is in RGBA, we don't handle it for now
            if (len(im.shape) == 3) and im.shape[2] == 4:
                print('%s is rgba' % info.loc[i, 'filename'])
            else:     
                # convert image to hue/saturation/value
                hsvImage = skimage.color.rgb2hsv(im)
                h_angles = hsvImage[:,:,0] * 2.0 * np.pi

                # average hue is converting the (0-1) hue value to unit vector coordinates
                # and finding the average direction
                sinSum = np.sin(h_angles).sum()
                cosSum = np.cos(h_angles).sum()
                info.loc[i, 'h_mean'] = np.arctan(sinSum/cosSum)

                # use the variance formula for a circulator distribution
                R2 = np.power(sinSum, 2) + np.power(cosSum, 2)
                numPixels = info.loc[i, 'pixelsx'] * info.loc[i, 'pixelsy']
                R_bar = np.sqrt(R2)/numPixels
                info.loc[i, 'h_var'] = 1 - R_bar

                info.loc[i, 's_mean'] = hsvImage[:,:,1].mean()
                info.loc[i, 's_std'] = hsvImage[:,:,1].std()

                info.loc[i, 'v_mean'] = hsvImage[:,:,2].mean()
                info.loc[i, 'v_std'] = hsvImage[:,:,2].std()
                
                s_pcts = np.percentile(im[:,:,1],[10, 25, 50, 75, 90])
                v_pcts = np.percentile(im[:,:,2],[10, 25, 50, 75, 90])

                for temp_ind, percentile in enumerate([10, 25, 50, 75, 90]):
                    info.loc[i, 's_%.2i_pct' % percentile] = r_pcts[temp_ind]
                    info.loc[i, 'v_%.2i_pct' % percentile] = g_pcts[temp_ind]

                #complexity metrics
                
                #print('samp im shape' % samp_im.shape)
                
                # go from hue to angles              
                samp_h_angles = hsvImage[:,:,0] * 2.0 * np.pi
                #samp_h_angles = samp_im[:,:,0] * 2.0 * np.pi
                
                #compute gradients
                hsv_x_grad = np.zeros([num_y_samp-2, num_x_samp-2, 3])
                hsv_y_grad = np.zeros([num_y_samp-2, num_x_samp-2, 3])
                
                hsv_x_grad[:,:,0] = np.min(
                        [ np.abs(h_angles[1:-1,0:num_x_samp-2] - h_angles[1:-1,1:num_x_samp-1]),
                        2.0 * np.pi - np.abs( h_angles[1:-1,0:num_x_samp-2] - h_angles[1:-1,1:num_x_samp-1])
                        ],
                        axis = 0
                      )
                hsv_y_grad[:,:,0] = np.min(
                        [ np.abs(h_angles[0:num_y_samp-2,1:-1] - h_angles[1:num_y_samp-1,1:-1]),
                        2.0 * np.pi - np.abs( h_angles[0:num_y_samp-2,1:-1] - h_angles[1:num_y_samp-1,1:-1])
                        ],
                        axis = 0
                      )

                hsv_x_grad[:,:,1:3] = np.abs(hsvImage[1:-1,0:num_x_samp-2,1:3] - hsvImage[1:-1,1:num_x_samp-1,1:3])
                hsv_y_grad[:,:,1:3] = np.abs(hsvImage[0:num_y_samp-2,1:-1,1:3] - hsvImage[1:num_y_samp-1,1:-1,1:3])

                hsv_grad_mag = np.sqrt(np.power(hsv_x_grad,2) + np.power(hsv_y_grad,2))
                
                h_pcts = np.percentile(hsv_grad_mag[:,:,0], [5,10,25,50,75,90,95])
                s_pcts = np.percentile(hsv_grad_mag[:,:,1], [5,10,25,50,75,90,95])
                v_pcts = np.percentile(hsv_grad_mag[:,:,2], [5,10,25,50,75,90,95])
                
                for temp_ind, percentile in enumerate([5, 10, 25, 50, 75, 90, 95]):
                    info.loc[i, 'h_cx_%.2i_pct' % percentile] = h_pcts[temp_ind]
                    info.loc[i, 's_cx_%.2i_pct' % percentile] = s_pcts[temp_ind]
                    info.loc[i, 'v_cx_%.2i_pct' % percentile] = v_pcts[temp_ind]

            #im = cv2.imread(dir+'/'+info.loc[i, 'new_filename'])
            #info.loc[i, 'pixelsx'], info.loc[i, 'pixelsy'] = im.shape[0:2]
            info.loc[i, 'size_bytes'] = os.path.getsize(mydir+'/'+info.loc[i, 'filename']) 

            if (ind+1)%100==0:
                currentTime = time.clock()
                print('Job %s, finished %s of %s, total time = %.2f min' %
                     (jobNum, (ind+1), len(info), (currentTime - startTime)/60.0))
        except:
            print('job %s - error in %s' % (jobNum, mydir+'/'+info.loc[i, 'filename']))
            traceback.print_exc()
    
    currentTime = time.clock()
    print('- Job %s, finished getting image info, total time = %.2f min' % ( jobNum, (currentTime - startTime) / 60.0))
    
    return info

    #return info.rename(columns={'filename' : 'new_filename'})

### load image info

In [9]:
#load training info
train_info = pd.read_csv(r'/data/training_data/train_info.csv', index_col=0)
submission_info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col=0)

#shuffle and save info
#train_info = train_info.iloc[np.random.permutation(len(train_info))]
#submission_info = submission_info.iloc[np.random.permutation(len(submission_info))]

#train_info.to_csv(r'/data/training_data/train_info.csv')
#submission_info.to_csv(r'/data/test_data/submission_info.csv')

### create submission image info from submission pairs

In [6]:
# submission image data is a bunch of images pairs, but we may want to work with a list of test images instead

#submission_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')
#images = list(set(list(submission_info.image1.unique()) + list(submission_info.image2.unique())))
#submission_info = pd.DataFrame(data=images, columns=['filename'])
#submission_info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col = 0)

### load test pairs

In [4]:
test_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')

### make training pairs

In [13]:
k=5
train_pairs = makePairsParent(train_info, k)

Launching 5 jobs to make pairs
finished 100 of 316 artists
finished 100 of 316 artists
finished 100 of 320 artists
finished 200 of 316 artists
finished 200 of 316 artists
finished 100 of 316 artists
finished 100 of 316 artists
finished 200 of 320 artists
finished 300 of 316 artists
make pairs completed
finished 300 of 316 artists
finished 200 of 316 artists
finished 300 of 320 artists
finished 200 of 316 artists
make pairs completed
make pairs completed
finished 300 of 316 artists
make pairs completed
finished 300 of 316 artists
make pairs completed
making pairs complete, time taken = 0.32 minutes


### save training pairs

In [17]:
#save as csv
filepath = r'/data/training_data/train_pairs.csv'
train_pairs.to_csv(filepath)
print('saving to %s ' % filepath  )

saving to /data/training_data/train_pairs.csv 


### load training pairs

In [5]:
#load pairs
train_pairs = pd.read_csv(r'/data/training_data/train_pairs.csv', index_col = 0)
#submission_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')

In [7]:
train_pairs = train_pairs.iloc[np.random.permutation(len(train_pairs))]

In [9]:
train_pairs=train_pairs[0:2000000]

### compute features

In [19]:
print('Begin computing features')
startTime = time.time()

test_features = getFeaturesParent(True)

endTime = time.time()
print("Finished computing features, time taken = %.2f min" % ((endTime-startTime)/60.0) )

Begin computing features
Launching 38 jobs
Job 36, starting getting image info for images 22536-23161
Job 0, starting getting image info for images 0-625
Job 29, starting getting image info for images 18154-18779
Job 21, starting getting image info for images 13146-13771
Job 27, starting getting image info for images 16902-17527
Job 9, starting getting image info for images 5634-6259
Job 32, starting getting image info for images 20032-20657
Job 10, starting getting image info for images 6260-6885
Job 13, starting getting image info for images 8138-8763
Job 4, starting getting image info for images 2504-3129
Job 14, starting getting image info for images 8764-9389
Job 17, starting getting image info for images 10642-11267
Job 28, starting getting image info for images 17528-18153
Job 37, starting getting image info for images 23162-23816
Job 11, starting getting image info for images 6886-7511
Job 22, starting getting image info for images 13772-14397
Job 19, starting getting image inf

Traceback (most recent call last):
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_plugins/pil_plugin.py", line 53, in pil_to_ndarray
    im.getdata()[0]
  File "/data/anaconda/lib/python3.5/site-packages/PIL/Image.py", line 1151, in getdata
    self.load()
  File "/data/anaconda/lib/python3.5/site-packages/PIL/ImageFile.py", line 218, in load
    "(%d bytes not processed)" % len(b))
OSError: image file is truncated (57 bytes not processed)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-7-739a9955c249>", line 116, in getFeaturesWorker
    im = skimage.data.imread(mydir + '/' + info.loc[i, 'filename'])
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_io.py", line 61, in imread
    img = call_plugin('imread', fname, plugin=plugin, **plugin_args)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/manage_plugins.py", line 211, in call_plugin
    return func(*args, **kwar

Job 13, finished 200 of 626, total time = 1.00 min
Job 22, finished 200 of 626, total time = 1.02 min
Job 29, finished 200 of 626, total time = 1.05 min
Job 14, finished 200 of 626, total time = 1.01 min
Job 34, finished 200 of 626, total time = 1.05 min
Job 36, finished 200 of 626, total time = 1.06 min
Job 21, finished 200 of 626, total time = 1.04 min
Job 37, finished 200 of 655, total time = 1.07 min
Job 6, finished 200 of 626, total time = 1.03 min
Job 10, finished 200 of 626, total time = 1.05 min
Job 11, finished 200 of 626, total time = 1.06 min
Job 23, finished 200 of 626, total time = 1.10 min
Job 16, finished 200 of 626, total time = 1.09 min
Job 8, finished 200 of 626, total time = 1.08 min
Job 19, finished 200 of 626, total time = 1.11 min
Job 26, finished 200 of 626, total time = 1.12 min
Job 20, finished 200 of 626, total time = 1.11 min
Job 32, finished 200 of 626, total time = 1.14 min
Job 18, finished 200 of 626, total time = 1.11 min
Job 5, finished 200 of 626, total



Job 9, finished 200 of 626, total time = 1.23 min
Job 30, finished 300 of 626, total time = 1.39 min
Job 4, finished 200 of 626, total time = 1.38 min
Job 24, finished 300 of 626, total time = 1.45 min
Job 25, finished 300 of 626, total time = 1.46 min
job 8 - error in /data/test_data/test/20153.jpg


Traceback (most recent call last):
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_plugins/pil_plugin.py", line 53, in pil_to_ndarray
    im.getdata()[0]
  File "/data/anaconda/lib/python3.5/site-packages/PIL/Image.py", line 1151, in getdata
    self.load()
  File "/data/anaconda/lib/python3.5/site-packages/PIL/ImageFile.py", line 218, in load
    "(%d bytes not processed)" % len(b))
OSError: image file is truncated (0 bytes not processed)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-7-739a9955c249>", line 116, in getFeaturesWorker
    im = skimage.data.imread(mydir + '/' + info.loc[i, 'filename'])
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_io.py", line 61, in imread
    img = call_plugin('imread', fname, plugin=plugin, **plugin_args)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/manage_plugins.py", line 211, in call_plugin
    return func(*args, **kwarg

Job 36, finished 300 of 626, total time = 1.52 min
Job 13, finished 300 of 626, total time = 1.48 min
Job 34, finished 300 of 626, total time = 1.54 min
Job 28, finished 300 of 626, total time = 1.55 min
Job 12, finished 300 of 626, total time = 1.53 min
Job 35, finished 300 of 626, total time = 1.58 min
Job 18, finished 300 of 626, total time = 1.56 min
Job 29, finished 300 of 626, total time = 1.59 min
Job 5, finished 300 of 626, total time = 1.54 min
Job 19, finished 300 of 626, total time = 1.58 min
Job 8, finished 300 of 626, total time = 1.56 min
Job 7, finished 300 of 626, total time = 1.56 min
Job 0, finished 200 of 626, total time = 1.57 min
Job 14, finished 300 of 626, total time = 1.59 min
Job 1, finished 300 of 626, total time = 1.56 min
Job 6, finished 300 of 626, total time = 1.58 min
Job 16, finished 300 of 626, total time = 1.62 min
Job 11, finished 300 of 626, total time = 1.60 min
Job 22, finished 300 of 626, total time = 1.64 min
Job 20, finished 300 of 626, total ti

Traceback (most recent call last):
  File "<ipython-input-7-739a9955c249>", line 118, in getFeaturesWorker
    info.loc[i, 'pixelsx'] = pixelsx = im.shape[1]
IndexError: tuple index out of range


Job 26, finished 300 of 626, total time = 1.92 min
Job 17, finished 300 of 626, total time = 1.92 min
job 5 - error in /data/test_data/test/18649.jpg


Traceback (most recent call last):
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_plugins/pil_plugin.py", line 53, in pil_to_ndarray
    im.getdata()[0]
  File "/data/anaconda/lib/python3.5/site-packages/PIL/Image.py", line 1151, in getdata
    self.load()
  File "/data/anaconda/lib/python3.5/site-packages/PIL/ImageFile.py", line 218, in load
    "(%d bytes not processed)" % len(b))
OSError: image file is truncated (79 bytes not processed)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-7-739a9955c249>", line 116, in getFeaturesWorker
    im = skimage.data.imread(mydir + '/' + info.loc[i, 'filename'])
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_io.py", line 61, in imread
    img = call_plugin('imread', fname, plugin=plugin, **plugin_args)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/manage_plugins.py", line 211, in call_plugin
    return func(*args, **kwar

Job 4, finished 300 of 626, total time = 1.94 min
91447.jpg is rgba
Job 30, finished 400 of 626, total time = 1.98 min
Job 27, finished 300 of 626, total time = 2.01 min
Job 12, finished 400 of 626, total time = 1.98 min
Job 35, finished 400 of 626, total time = 2.03 min
Job 13, finished 400 of 626, total time = 1.99 min
64004.jpg is rgba
Job 34, finished 400 of 626, total time = 2.04 min
Job 24, finished 400 of 626, total time = 2.04 min
Job 29, finished 400 of 626, total time = 2.06 min
Job 36, finished 400 of 626, total time = 2.08 min
Job 11, finished 400 of 626, total time = 2.04 min
Job 6, finished 400 of 626, total time = 2.05 min
Job 25, finished 400 of 626, total time = 2.09 min
Job 7, finished 400 of 626, total time = 2.06 min
Job 33, finished 400 of 626, total time = 2.12 min
Job 28, finished 400 of 626, total time = 2.12 min
Job 3, finished 400 of 626, total time = 2.06 min
Job 1, finished 400 of 626, total time = 2.07 min
Job 2, finished 400 of 626, total time = 2.07 min
J

### save features

In [22]:
#train_features.to_csv(r'/data/training_data/train_features_40000_end.csv')
test_features.to_csv(r'/data/test_data/test_features.csv')
#y = pd.read_csv(r'/data/test_data/test_features.csv', index_col=0)
#test_features_sorted = test_features_sorted.drop('Unnamed: 0',1)

In [43]:
#train_features.to_csv(r'/data/training_data/train_features_40000_end.csv')
#test_features_sorted.to_csv(r'/data/test_data/test_features.csv')
#y = pd.read_csv(r'/data/test_data/test_features.csv', index_col=0)
#test_features_sorted = test_features_sorted.drop('Unnamed: 0',1)

### load training features

In [10]:
#load features
train_features_0 = pd.read_csv(r'/data/training_data/train_features_0_40000.csv', index_col = 0)
train_features_1 = pd.read_csv(r'/data/training_data/train_features_40000_end.csv', index_col = 0)

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
train_features = pd.concat([train_features_0, train_features_1])

### load test features

In [12]:
test_features = pd.read_csv(r'/data/test_data/test_features.csv', index_col = 0)

In [13]:
# fill nulls
cols = test_features.columns.copy()

cols.drop('is_grayscale')

test_features = test_features[cols].fillna(test_features[cols].mean())

test_features.loc[test_features['is_grayscale'].isnull(), 'is_grayscale'] = 0

### additional processing on feature - remove extra columns

In [14]:
test_features.columns

Index(['filename', 'pixelsx', 'pixelsy', 'size_bytes', 'r_mean', 'r_10_pct',
       'r_25_pct', 'r_50_pct', 'r_75_pct', 'r_90_pct', 'g_mean', 'g_std',
       'g_10_pct', 'g_25_pct', 'g_50_pct', 'g_75_pct', 'g_90_pct', 'b_mean',
       'b_std', 'b_10_pct', 'b_25_pct', 'b_50_pct', 'b_75_pct', 'b_90_pct',
       'h_mean', 'h_var', 's_mean', 's_std', 's_10_pct', 's_25_pct',
       's_50_pct', 's_75_pct', 's_90_pct', 'v_mean', 'v_std', 'v_10_pct',
       'v_25_pct', 'v_50_pct', 'v_75_pct', 'v_90_pct', 'h_cx_05_pct',
       'h_cx_10_pct', 'h_cx_25_pct', 'h_cx_50_pct', 'h_cx_75_pct',
       'h_cx_90_pct', 'h_cx_95_pct', 's_cx_05_pct', 's_cx_10_pct',
       's_cx_25_pct', 's_cx_50_pct', 's_cx_75_pct', 's_cx_90_pct',
       's_cx_95_pct', 'v_cx_05_pct', 'v_cx_10_pct', 'v_cx_25_pct',
       'v_cx_50_pct', 'v_cx_75_pct', 'v_cx_90_pct', 'v_cx_95_pct',
       'is_grayscale', 'r_std'],
      dtype='object')

In [15]:
# saved features are straight from the feature functions with no handling of nulls, etc
# these have to be addressed prior to training/predicting

#rgb_features = ['r_mean', 'r_med', 'r_std', 'g_mean', 'g_med',
#                'g_std', 'b_mean', 'b_med', 'b_std',]

#size_features =  [ 'pixelsx',
#       'pixelsy', 'size_bytes' ]


# take out unnecessary columns
def removeColumns(features):
#     feature_names = ['pixelsx', 'pixelsy', 'size_bytes',
#                      'r_mean', 'r_med', 'r_std',
#                      'g_mean', 'g_med', 'g_std',
#                      'b_mean', 'b_med', 'b_std',
#                      'h_mean', 'h_var',
#                      's_mean', 's_med', 's_std',
#                      'v_mean', 'v_med', 'v_std',
#                      'is_grayscale']

    #features = features[ ['filename'] + feature_names ]
    features = features.drop(['artist', 'title', 'style', 'genre', 'date'], axis=1)
    
    return features

In [16]:
train_features = removeColumns(train_features)

In [17]:
train_features.columns

Index(['filename', 'pixelsx', 'pixelsy', 'size_bytes', 'r_mean', 'r_10_pct',
       'r_25_pct', 'r_50_pct', 'r_75_pct', 'r_90_pct', 'g_mean', 'g_std',
       'g_10_pct', 'g_25_pct', 'g_50_pct', 'g_75_pct', 'g_90_pct', 'b_mean',
       'b_std', 'b_10_pct', 'b_25_pct', 'b_50_pct', 'b_75_pct', 'b_90_pct',
       'h_mean', 'h_var', 's_mean', 's_std', 's_10_pct', 's_25_pct',
       's_50_pct', 's_75_pct', 's_90_pct', 'v_mean', 'v_std', 'v_10_pct',
       'v_25_pct', 'v_50_pct', 'v_75_pct', 'v_90_pct', 'h_cx_05_pct',
       'h_cx_10_pct', 'h_cx_25_pct', 'h_cx_50_pct', 'h_cx_75_pct',
       'h_cx_90_pct', 'h_cx_95_pct', 's_cx_05_pct', 's_cx_10_pct',
       's_cx_25_pct', 's_cx_50_pct', 's_cx_75_pct', 's_cx_90_pct',
       's_cx_95_pct', 'v_cx_05_pct', 'v_cx_10_pct', 'v_cx_25_pct',
       'v_cx_50_pct', 'v_cx_75_pct', 'v_cx_90_pct', 'v_cx_95_pct',
       'is_grayscale', 'r_std'],
      dtype='object')

### feature processing - add image features

In [18]:
def addImageFeatures(features):
    """modifies in place"""
    features['aspect_ratio'] = features['pixelsx']/features['pixelsy']
    features['size_per_pixel'] = features['size_bytes']/features['pixelsx']/features['pixelsy']

In [19]:
addImageFeatures(train_features)

In [20]:
addImageFeatures(test_features)

### Join features to pairs

In [21]:
#join pair data to image features

def joinPairsToFeatures(pairs, features):
    
    feature_base_names = ['pixelsx', 'pixelsy', 'size_bytes', 'aspect_ratio', 'size_per_pixel',
                     'r_mean', 'r_std',
                     'g_mean', 'g_std',
                     'b_mean', 'b_std',
                     'h_mean', 'h_var',
                     's_mean', 's_std',
                     'v_mean', 'v_std', 
                     'is_grayscale' ]
    
    for dim in ['r', 'g', 'b', 's', 'v']:
        for pct in [10, 25, 50, 75, 90]:
            feature_base_names.append( '%s_%.2i_pct' % (dim, pct))
            
    for dim in ['h', 's', 'v']:
        for pct in [5, 10, 25, 50, 75, 90, 95]:
            feature_base_names.append( '%s_cx_%.2i_pct' % (dim, pct))

    col_dict_1 = {}
    col_dict_2 = {}

    for feature in feature_base_names:
        col_dict_1[feature] = '%s_1' % feature
        col_dict_2[feature] = '%s_2' % feature

    pairs = pairs.merge(features,
                        left_on='image1', right_on='filename')
    pairs.rename( columns = col_dict_1,
                          inplace=True)
    pairs = pairs.merge(features,
                        left_on='image2', right_on='filename')
    pairs.rename( columns = col_dict_2,
                          inplace=True)
    return pairs

In [22]:
train_pairs = joinPairsToFeatures(train_pairs, train_features)

In [23]:
test_pairs = joinPairsToFeatures(test_pairs, test_features)

In [24]:
#pairs get out of order after merging, so re-sort
test_pairs = test_pairs.sort_values(by='index')
test_pairs.set_index('index', inplace = True)

### remove nulls in training pairs

In [25]:
# we remove the nulls after the join, could also be done before
train_pairs = train_pairs[~train_pairs['pixelsx_1'].isnull()]
train_pairs = train_pairs[~train_pairs['pixelsx_2'].isnull()]
train_pairs = train_pairs[~train_pairs['size_bytes_1'].isnull()]
train_pairs = train_pairs[~train_pairs['size_bytes_2'].isnull()]
train_pairs = train_pairs[~train_pairs['h_mean_1'].isnull()]
train_pairs = train_pairs[~train_pairs['h_mean_2'].isnull()]

#print(train_pairs.isnull().sum())
print('%i nulls' % (train_pairs.isnull().sum().sum()))

0 nulls


### check no nulls in submission

In [26]:
#print(test_pairs.isnull().sum())
print(test_pairs.isnull().sum().sum())

0


### feature processing - add diff features to pairs

In [27]:
## try training on diffs instead of aboslute values

def addPairFeatures(pairs):
    diff_feature_base = [ 
                     'r_mean', 'r_std',
                     'g_mean', 'g_std',
                     'b_mean', 'b_std',
                     #'h_mean', 
                     'h_var',
                     's_mean', 's_std',
                     'v_mean', 'v_std', ]

    diff_feature_names = [ temp_feature + '_diff' for temp_feature in diff_feature_base]

    for diff_feature in diff_feature_base:
        pairs[diff_feature + '_diff'] = ( pairs[ diff_feature + '_1'] - pairs[ diff_feature + '_2'] ).abs()
    
    pairs['h_mean_diff'] = np.min( [ np.abs( pairs['h_mean_1'] - pairs['h_mean_2'] ),
                                    2.0 * np.pi - np.abs( pairs['h_mean_1'] - pairs['h_mean_2'] ),
                                   ],
                                  axis = 0
                                 )


In [28]:
addPairFeatures(test_pairs)

In [29]:
addPairFeatures(train_pairs)

### results helper

In [30]:
def computePredictStats(y_prob, y_true, threshold = 0.5):
    """ compute accuracy, precision, recall, negative precision, specificity, and auc roc
        Args:
            y_prob: array of floats from 0.0 - 1.0
            y_true: array of booleans
            threshold: true/false threshold value, between 0.0-1.0
        Returns:
            dict of classification metrics
    """
    # y_pred = np.array([True, True, False, False])
    # y_true = np.array([True, True, True, True])
    y_pred = y_prob > threshold
    
    total = len(y_prob)
    true_pos = sum( (y_pred == True) & (y_true == True) )
    true_neg = sum( (y_pred == False) & (y_true == False) )
    false_pos = sum( (y_pred == True) & (y_true == False) )
    false_neg = sum( (y_pred == False ) & (y_true == True) )
    
    accuracy = (true_pos + true_neg) / total
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    npp = true_neg / (true_neg + false_neg) #negative prediction value
    specificity = true_neg / (true_neg + false_pos)
    roc = roc_auc_score(y_true, y_prob)
    
    return { 'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'npp': npp,
            'specificity': specificity,
            'roc': roc,
            'true_pos': true_pos,
            'true_neg': true_neg,
            'false_pos': false_pos,
            'false_neg': false_neg,
           }

### split data into X and Y

In [31]:
def featureHelper(feature_name):

    #FEATURE SET 02
    
    v_pct_match = re.search("v_\d*_pct", feature_name)
    s_pct_match = re.search("s_\d*_pct", feature_name)
    
    h_cx_pct_match = re.search("h_cx_\d*_pct", feature_name)
    s_cx_pct_match = re.search("s_cx_\d*_pct", feature_name)
    v_cx_pct_match = re.search("v_cx_\d*_pct", feature_name)
    
    include = ( 'foldNum' in feature_name
               or 'pixelsx' in feature_name
               or 'pixelsy' in feature_name
               or 'aspect_ratio' in feature_name
               or 'size_bytes' in feature_name
               or 'size_per_pixel' in feature_name
               
               #or 'r_' in feature_name
               #or 'g_' in feature_name
               #or 'b_' in feature_name
               
               or 'h_var_diff' in feature_name
               
               or 's_mean_diff' in feature_name
               or 's_std_diff' in feature_name
               
               or 'v_mean_diff' in feature_name
               or 'v_std_diff' in feature_name
               
               or 's_10_pct' in feature_name
               or 's_50_pct' in feature_name
               or 's_90_pct' in feature_name
               
               or 'v_10_pct' in feature_name
               or 'v_50_pct' in feature_name
               or 'v_90_pct' in feature_name
               
               #or 'h_cx_10_pct' in feature_name
               #or 'h_cx_50_pct' in feature_name
               #or 'h_cx_90_pct' in feature_name
               #or 'h_cx_95_pct' in feature_name
               
               or 's_cx_10_pct' in feature_name
               or 's_cx_50_pct' in feature_name
               or 's_cx_90_pct' in feature_name
               or 's_cx_95_pct' in feature_name
               
               or 'v_cx_10_pct' in feature_name
               or 'v_cx_50_pct' in feature_name
               or 'v_cx_90_pct' in feature_name
               or 's_cx_95_pct' in feature_name
               #or v_pct_match is not None
               #or s_pct_match is not None
               
               #or h_cx_pct_match is not None
               #or s_cx_pct_match is not None
               #or v_cx_pct_match is not None
              )
    
    if include:
        return True
    else:
        return False

In [114]:
def featureHelper(feature_name):

    v_pct_match = re.search("v_\d*_pct", feature_name)
    s_pct_match = re.search("s_\d*_pct", feature_name)
    
    h_cx_pct_match = re.search("h_cx_\d*_pct", feature_name)
    s_cx_pct_match = re.search("s_cx_\d*_pct", feature_name)
    v_cx_pct_match = re.search("v_cx_\d*_pct", feature_name)
    
    include = ( 'foldNum' in feature_name
               or 'pixelsx' in feature_name
               or 'pixelsy' in feature_name
               or 'aspect_ratio' in feature_name
               or 'size_bytes' in feature_name
               or 'size_per_pixel' in feature_name
               
               #or 'r_' in feature_name
               #or 'g_' in feature_name
               #or 'b_' in feature_name
               
               or 'h_var_diff' in feature_name
               
               or 's_mean_diff' in feature_name
               or 's_std_diff' in feature_name
               
               or 'v_mean_diff' in feature_name
               or 'v_std_diff' in feature_name
               
               or v_pct_match is not None
               or s_pct_match is not None
               
               or h_cx_pct_match is not None
               or s_cx_pct_match is not None
               or v_cx_pct_match is not None
              )
    
    if include:
        return True
    else:
        return False

In [32]:
#for foldNum, train_pairs in enumerate(train_pairs_list):
#    train_pairs_list[foldNum] = train_pairs.iloc[np.random.permutation(len(train_pairs))]

# get list of X columns, they are ones with '_1' or '_2' in the name
allCols = train_pairs.columns
isX = list(map(featureHelper, allCols))
X_columns = allCols[isX]

train_X = train_pairs[X_columns]
train_Y = train_pairs[['sameArtist', 'foldNum']]

In [33]:
allCols = test_pairs.columns
isX = list(map(featureHelper, allCols))
X_columns = allCols[isX]

test_X = test_pairs[X_columns]

In [34]:
len(train_X.columns)

42

In [35]:
len(test_X.columns)

41

In [23]:
pd.isnull(test_X).sum().sum()

0

### k-fold CV

In [47]:
def kFoldCV(train_X, train_Y, k, numFoldsToTest):
    """ Perform k-fold cross validation. Will modify train_pairs (shuffle rows)
        Args:
            train_pairs: Dataframe containing training image pairs with features. Should have no nulls.
            k: k, at least 2
            numFoldsToTest: how many folds to actually test, at most k
        Returns:
            dataframe with CV results
    """
    numFoldsToTest = min(k, numFoldsToTest)
    
    # shuffle rows
    #train_pairs = train_pairs.iloc[np.random.permutation(len(train_pairs))]

    # get list of X columns, they are ones with '_1' or '_2' in the name
    #allCols = train_pairs.columns
    #isX = list(map(lambda x: ('_1' in x) or ('_2' in x), allCols))
    #X_columns = allCols[isX]
    
    # split X and Y data
    #CV_X = train_pairs[X_columns]
    #CV_Y = train_pairs['sameArtist']
    
    # define which indices belong to each fold
    foldLocsList = [] #list of Index objects, one for each fold
    
    #foldSize = int(len(train_X)/k)
    #    
    #for foldNum in range(k):
    #    if foldNum == k-1:
    #        foldLocsList.append( train_X.index[foldNum*foldSize : len(train_X)] )
    #    else:
    #        foldLocsList.append( train_X.index[foldNum*foldSize : (foldNum+1)*(foldSize) ] )
    
    # set up dataframe for collecting results
    columnsList = list(itertools.product(('train', 'test'), ('roc', 'precision', 'recall', 'npp', 'specificity')))
    results = pd.DataFrame(index = range(numFoldsToTest), columns = pd.MultiIndex.from_tuples(columnsList))
    
    # test each fold
    for testFold in range(numFoldsToTest):
        
        CV_training_folds = np.arange(k)
        CV_training_folds = np.delete(CV_training_folds, np.where(CV_training_folds == testFold))

        # set up Xs
        CV_train_X = train_X[ train_X['foldNum'].isin(CV_training_folds) ]   
        CV_test_X = train_X[ train_X['foldNum'] == testFold ]   
        
        CV_train_X = CV_train_X.drop(['foldNum'], axis=1)
        CV_test_X = CV_test_X.drop(['foldNum'], axis=1)

        # set up Ys
        CV_train_Y = train_Y[train_X['foldNum'].isin(CV_training_folds) ]
        CV_test_Y = train_Y[ train_Y['foldNum'] == testFold ]   
        
        CV_train_Y = CV_train_Y['sameArtist']
        CV_test_Y = CV_test_Y['sameArtist']

        shuffleInds = np.random.permutation(len(CV_train_X))
        CV_train_X = CV_train_X.iloc[shuffleInds]
        CV_train_Y = CV_train_Y.iloc[shuffleInds]
        
        #CV_train_X = CV_train_X.iloc[0:300000]
        #CV_train_Y = CV_train_Y.iloc[0:300000]
        
        # fit model
        n_jobs = max( multiprocessing.cpu_count() - 10, 1)
        clf = RandomForestClassifier(n_estimators=500, n_jobs=n_jobs, min_samples_split=8, oob_score = True)
        #clf = ExtraTreesClassifier(n_estimators=300, n_jobs=n_jobs, min_samples_split=5)
        #clf = GradientBoostingClassifier(n_estimators = 5)

        start = time.time()

        print('starting fit with %s jobs.. ' % n_jobs, end='')

        clf.fit(CV_train_X, CV_train_Y)

        end = time.time()
                               
        print('total training time: %s' % (end - start) )

        # get in-sample and out-of-sample results
                               
        pred_train = clf.predict_proba(CV_train_X)[:,1]
        train_results = computePredictStats( pred_train, CV_train_Y)

        pred_test = clf.predict_proba(CV_test_X)[:,1]
        test_results = computePredictStats( pred_test, CV_test_Y)
       
        for stat in ('roc', 'accuracy', 'precision', 'recall', 'npp', 'specificity'):
            results.loc[testFold, ('train', stat)] = train_results[stat]
            results.loc[testFold, ('test', stat)] = test_results[stat]
        
        #results.loc[testFold, ('train', 'oob')] = clf.oob_score_
        
        columnToImportance = list(zip(CV_train_X.columns, clf.feature_importances_))
        columnToImportance = sorted(columnToImportance, key=lambda x: x[1], reverse=True)
        importanceDF = pd.DataFrame(data = columnToImportance)
            
    return results, importanceDF    

### run k-fold cv

In [48]:
k = 5
numFoldsToTest = 2

results, importance = kFoldCV(train_X, train_Y, k, numFoldsToTest)

#print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'oob'),  ('test', 'accuracy')]])
#print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'oob'),  ('test', 'accuracy')]].mean())
print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'accuracy'), ('test', 'accuracy')]])
print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'accuracy'), ('test', 'accuracy')]].mean())
#print(importance)

starting fit with 54 jobs.. total training time: 484.06285214424133
starting fit with 54 jobs.. total training time: 472.72594022750854
  train      test     train      test
    roc       roc  accuracy  accuracy
0     1  0.752695  0.999759  0.680868
1     1  0.739002  0.999766  0.671421
train  roc         1.000000
test   roc         0.745849
train  accuracy    0.999762
test   accuracy    0.676145
dtype: float64


In [170]:
#print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'accuracy'), ('test', 'accuracy')]])
#print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'accuracy'), ('test', 'accuracy')]].mean())
print(importance)

                   0         1
0        v_mean_diff  0.042899
1         h_var_diff  0.039640
2         s_std_diff  0.031706
3        s_mean_diff  0.030775
4          pixelsy_1  0.028729
5         v_std_diff  0.028493
6          pixelsx_1  0.027658
7          pixelsy_2  0.026942
8   size_per_pixel_1  0.026475
9   size_per_pixel_2  0.026139
10         pixelsx_2  0.025563
11      size_bytes_1  0.025357
12    aspect_ratio_1  0.024669
13     v_cx_90_pct_1  0.024661
14    aspect_ratio_2  0.024534
15      size_bytes_2  0.024482
16     v_cx_50_pct_1  0.024383
17     v_cx_90_pct_2  0.024289
18     v_cx_50_pct_2  0.023707
19     v_cx_10_pct_1  0.023462
20     v_cx_10_pct_2  0.022598
21        v_50_pct_1  0.022428
22        v_50_pct_2  0.021881
23     s_cx_50_pct_1  0.021875
24        v_10_pct_1  0.021846
25        s_50_pct_1  0.021746
26        v_90_pct_1  0.021652
27        s_10_pct_1  0.021470
28        v_10_pct_2  0.021450
29        s_90_pct_1  0.021390
30        s_50_pct_2  0.021331
31      

In [63]:
type(train_Y_list[0])

pandas.core.series.Series

In [112]:
len(train_X)

1997446

### full training

In [39]:
n_jobs = max( multiprocessing.cpu_count() - 2, 1)
clf = RandomForestClassifier(n_estimators=500, min_samples_split=8, n_jobs=n_jobs)

start = time.time()

print('starting fit')
#excluding the patient_id column from the fit and prediction

shuffleInds = np.random.permutation(len(train_X))
train_X = train_X.iloc[shuffleInds]
train_Y = train_Y.iloc[shuffleInds]

full_train_X = train_X.iloc[shuffleInds].drop(['foldNum'], axis=1).iloc[0:2000000]
full_train_Y = train_Y.iloc[shuffleInds]['sameArtist'].iloc[0:2000000]

clf.fit(full_train_X, full_train_Y)

end = time.time()

print('total training time: %s' % (end - start) )

#columnsList = list(itertools.product(('train', 'test'), ('roc', 'precision', 'recall', 'npp', 'specificity')))
#results = pd.DataFrame(index = range(numFoldsToTest), columns = pd.MultiIndex.from_tuples(columnsList))


starting fit
total training time: 318.94555592536926



### make sure real modle works

In [40]:
testFold = 0

CV_test_X = train_X[ train_X['foldNum'] == testFold ]   
CV_test_Y = train_Y[ train_Y['foldNum'] == testFold ] 

CV_test_X = CV_test_X.drop(['foldNum'], axis=1)
CV_test_Y = CV_test_Y['sameArtist'] 

pred_test = clf.predict_proba(CV_test_X)[:,1]
test_results = computePredictStats( pred_test, CV_test_Y)

In [41]:
print(test_results)

{'specificity': 0.99968876126985406, 'roc': 0.99999963828481087, 'accuracy': 0.99975084561488281, 'npp': 0.99981423744471054, 'false_pos': 62, 'true_pos': 198103, 'true_neg': 199142, 'precision': 0.99968712941235838, 'false_neg': 37, 'recall': 0.99981326334914711}


### save model

In [35]:
##save model

start = time.clock()

with open('submission_02.pkl', 'wb') as fid:
    pickle.dump(clf, fid) 

end = time.clock()
print('total saving time: %s' % (end - start) )

OSError: [Errno 28] No space left on device

### load model

In [25]:
start = time.clock()

# load it again
with open('submission_02.pkl', 'rb') as fid:
    clf = pickle.load(fid)

end = time.clock()
print('total loading time: %s' % (end - start) )

EOFError: Ran out of input

## test on test set

In [42]:
start = time.time()

print('starting predictions')
#excluding the patient_id column from the fit and prediction

test_predictions_0 = clf.predict_proba(test_X[0:5000000])[:,1]
test_predictions_1 = clf.predict_proba(test_X[5000000:10000000])[:,1]
test_predictions_2 = clf.predict_proba(test_X[10000000:15000000])[:,1]
test_predictions_3 = clf.predict_proba(test_X[15000000:20000000])[:,1]
test_predictions_4 = clf.predict_proba(test_X[20000000:])[:,1]

end = time.time()

print('total predictions time: %s' % (end - start) )

starting predictions
total predictions time: 200.39063477516174


 ## prepare submission
    

In [43]:
submission = pd.DataFrame(index = test_pairs.index)
submission['sameArtist'] = test_predictions
submission.to_csv('/data/notebook/notebooks/my_submission_02.csv', index=True)

In [44]:
submission = pd.DataFrame(index = test_pairs.index)
submission['sameArtist'] = test_predictions

In [46]:
len(submission)

21916047

21916047