In [1]:
%matplotlib inline



In [2]:
%load_ext line_profiler

In [39]:
import os
import gc
import itertools
import random
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import multiprocessing
import time
import traceback

from PIL import Image
import skimage.data
import skimage.exposure
import skimage.color

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

import scipy.misc

### make training pairs parent func

In [4]:
def makePairsParent(train_info, k):

    artists = train_info.artist.unique()       
    np.random.shuffle(artists)
    
    chunkSize = int(len(artists)/k)
    
    artist_list_list = []
    
    for foldNum in range(k):
        if foldNum == k-1:
            artist_list_list.append( artists[foldNum*chunkSize : len(artists)] )
        else:
            artist_list_list.append( artists[foldNum*chunkSize : (foldNum+1)*(chunkSize) ] )
        
    # define which indices belong to each fold

    #foldLocsList = [] #list of Index objects, one for each fold

    #foldSize = int(len(train_info)/k)

    #for foldNum in range(k):
    #    if foldNum == k-1:
    #        foldLocsList.append( train_info.index[foldNum*foldSize : len(train_info)] )
    #    else:
    #        foldLocsList.append( train_info.index[foldNum*foldSize : (foldNum+1)*(foldSize) ] )

    # define which indices belong to each fold
    argsList = []

    num_cores = min( multiprocessing.cpu_count()-2, k)

    for foldNum in range(k):
        #argsList.append([train_info.loc[foldLocsList[foldNum]]])
        argsList.append( [ train_info[train_info.artist.isin(artist_list_list[foldNum])], foldNum ] )

    print('Launching %s jobs to make pairs' % k)

    startTime = time.time()

    pool = multiprocessing.Pool(num_cores)
    train_pairs_list = pool.starmap(make_pairs, argsList)
    pool.close()
    pool.join()

    endTime = time.time()

    print("making pairs complete, time taken = %.2f minutes" % ((endTime - startTime) / 60.0))
    
    return pd.concat(train_pairs_list)

## make pairs

In [5]:
def make_pairs(train_info, foldNum):
    """Creates training data from the supplied training image information file"""
    artists = train_info.artist.unique()

    n = train_info.groupby('artist').size()
    n = (2*n**2).sum() 
    t = pd.DataFrame(np.zeros((n, 4)), columns=['artist1', 'image1', 'artist2', 'image2'])
    i = 0
    j = 0
    
    for artist in artists:

        #artist info is Ax2 matrix of artist, filename
        artistInfo = train_info[train_info.artist==artist][['artist', 'filename']].values
        
        use = train_info[train_info.artist != artist ].index.values
        np.random.shuffle(use)
        
        #nm = np.min([a.shape[0]**2, train_info[train_info.artist != m].shape[0] ])
        numExamples = np.min([len(artistInfo)**2, sum(train_info.artist != artist) ])
        use = use[0:numExamples]
        
        #diffArtistInfo a Bx2 matrix of artist, filename
        diffArtistInfo = train_info[train_info.artist!=artist][['artist', 'filename']].ix[use, :].values

        
        toAdd_SameArtist = pd.DataFrame(np.concatenate([  np.repeat(artistInfo[:, 0], len(artistInfo)).reshape((-1,1)), #artist
                                            np.repeat(artistInfo[:, 1],
                                            artistInfo.shape[0]).reshape((-1,1)),
                                            np.tile(artistInfo, (len(artistInfo), 1))],
                                         axis=1),
                          columns=['artist1', 'image1', 'artist2', 'image2'])
        toAdd_SameArtist = toAdd_SameArtist.loc[0:numExamples, :]
        
        toAdd_DiffArtist = pd.DataFrame(np.concatenate([np.tile(artistInfo,
                                                  (len(artistInfo), 1))[0:len(diffArtistInfo), :],
                                          diffArtistInfo], axis=1),
                          columns=['artist1', 'image1', 'artist2', 'image2'])
        toAdd_DiffArtist = toAdd_DiffArtist.loc[0:numExamples, :]
        
        #print(j, i, a2.shape[0], b2.shape[0])
        #print(b2)
        t.iloc[i:i+len(toAdd_SameArtist), :] = toAdd_SameArtist.values
        t.iloc[i+len(toAdd_SameArtist):i+len(toAdd_SameArtist)+len(toAdd_DiffArtist), :] = toAdd_DiffArtist.values
        
        i += len(toAdd_SameArtist) + len(toAdd_DiffArtist)
        j += 1
        if j%100==0:
            print('finished %s of %s artists'%(j, len(artists)))

    print('make pairs completed')
    t = t[~t.image2.isin([np.nan, 0])]
    
    t['sameArtist'] = ( t['artist1'] == t['artist2'] )
    t['foldNum'] = foldNum
    
    return t[t.image1 > t.image2]

## Prep Image List

In [6]:
# def prepImageList(image_info, isTest):
#     """given the train_image_info or submission_info, returns a dataframe with a single column containing filenames of images"""
#     if isTest:
#         images = list(set(list(image_info.image1.unique()) + list(image_info.image2.unique())))
#         result = pd.DataFrame(np.array(images).reshape((-1, 1)), columns = ['filename'])
#     else:
#         result = pd.DataFrame(columns = ['filename'], data = image_info['filename'] )
    
#     return result

## Get Features Parent

In [104]:
def getFeaturesParent(isTest):
    """Creates features for training and test images. This function utilizes multiprocessing.
    Args:
        isTest: bool to fetch training or test data
    Returns:
        pandas DataFrame containing features
    """
    
    num_cores = multiprocessing.cpu_count() - 5
    
    argsList = []
    
    for jobNum in range(num_cores):
        argsList.append((isTest, jobNum, num_cores))
        

    print('Launching %s jobs' % (num_cores))
    startTime = time.time()
    
    pool = multiprocessing.Pool(num_cores)
    image_features_list = pool.starmap(getFeaturesWorker, argsList)
    pool.close()
    pool.join()
    
    image_features = pd.concat(image_features_list)
    
    endTime = time.time()
    
    print("collecting features complete, time taken = %.2f minutes" % ((endTime - startTime) / 60.0))
    return image_features

## Get Features Worker

In [111]:
def unit_vector(vector):
    """ Returns the unit vector of the vector.  """
    return vector / np.linalg.norm(vector)

def angle_between(v1, v2):
    """ Returns the angle in radians between vectors 'v1' and 'v2'::

            >>> angle_between((1, 0, 0), (0, 1, 0))
            1.5707963267948966
            >>> angle_between((1, 0, 0), (1, 0, 0))
            0.0
            >>> angle_between((1, 0, 0), (-1, 0, 0))
            3.141592653589793
    """
    v1_u = unit_vector(v1)
    v2_u = unit_vector(v2)
    return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0))

In [151]:
def getFeaturesWorker(isTest, jobNum, totalJobs):
    """Child function for computing image features, only to be called by getFeaturesParent
    Args:
        isTest: whether to compute features for test or training images
        jobNum: which job number this is
        totalJobs: total number of jobs
    Returns:
        pandas dataframe containing a data for a fraction of the training or test images
    """
    if isTest:
        mydir = r'/data/test_data/test'
        info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col = 0)
    else:
        mydir = r'/data/training_data/train'
        info = pd.read_csv(r'/data/training_data/train_info.csv', index_col = 0)
    
    info = info[0:2000]
    
    totalNumImages = len(info)
    
    chunkSize = np.int(totalNumImages/totalJobs)
    
    if jobNum == totalJobs - 1:
        startInd = jobNum * chunkSize
        endInd = totalNumImages
    else:
        startInd = jobNum * chunkSize
        endInd = (jobNum + 1) * chunkSize
        
    info = info.iloc[startInd:endInd]
    
    info['pixelsx'] = np.nan
    info['pixelsy'] = np.nan
    info['size_bytes'] = np.nan
    
    info['r_mean'] = np.nan
    info['r_10_pct'] = np.nan
    info['r_25_pct'] = np.nan
    info['r_50_pct'] = np.nan
    info['r_75_pct'] = np.nan
    info['r_90_pct'] = np.nan
    
    info['g_mean'] = np.nan
    info['g_std'] = np.nan
    info['g_10_pct'] = np.nan
    info['g_25_pct'] = np.nan
    info['g_50_pct'] = np.nan
    info['g_75_pct'] = np.nan
    info['g_90_pct'] = np.nan
    
    info['b_mean'] = np.nan
    info['b_std'] = np.nan
    info['b_10_pct'] = np.nan
    info['b_25_pct'] = np.nan
    info['b_50_pct'] = np.nan
    info['b_75_pct'] = np.nan
    info['b_90_pct'] = np.nan
    
    info['h_mean'] = np.nan
    info['h_var'] = np.nan
    
    info['s_mean'] = np.nan
    info['s_std'] = np.nan
    info['s_10_pct'] = np.nan 
    info['s_25_pct'] = np.nan 
    info['s_50_pct'] = np.nan 
    info['s_75_pct'] = np.nan 
    info['s_90_pct'] = np.nan 
    
    info['v_mean'] = np.nan
    info['v_std'] = np.nan
    info['v_10_pct'] = np.nan 
    info['v_25_pct'] = np.nan 
    info['v_50_pct'] = np.nan
    info['v_75_pct'] = np.nan 
    info['v_90_pct'] = np.nan 
    
    info['v_10_pct'] = np.nan 
    info['v_25_pct'] = np.nan 
    info['v_50_pct'] = np.nan
    info['v_75_pct'] = np.nan 
    info['v_90_pct'] = np.nan 
   
    info['h_cx_05_pct'] = np.nan 
    info['h_cx_10_pct'] = np.nan 
    info['h_cx_25_pct'] = np.nan 
    info['h_cx_50_pct'] = np.nan
    info['h_cx_75_pct'] = np.nan 
    info['h_cx_90_pct'] = np.nan 
    info['h_cx_95_pct'] = np.nan 
    
    info['s_cx_05_pct'] = np.nan 
    info['s_cx_10_pct'] = np.nan 
    info['s_cx_25_pct'] = np.nan 
    info['s_cx_50_pct'] = np.nan
    info['s_cx_75_pct'] = np.nan 
    info['s_cx_90_pct'] = np.nan 
    info['s_cx_95_pct'] = np.nan
    
    info['v_cx_05_pct'] = np.nan 
    info['v_cx_10_pct'] = np.nan 
    info['v_cx_25_pct'] = np.nan 
    info['v_cx_50_pct'] = np.nan
    info['v_cx_75_pct'] = np.nan 
    info['v_cx_90_pct'] = np.nan 
    info['v_cx_95_pct'] = np.nan
    
    info['is_grayscale'] = np.nan
      
    print('Job %s, starting getting image info for images %s-%s' % (jobNum, startInd, endInd-1))
    startTime = time.clock()
    
    for ind, i in enumerate(info.index.values):
        try:       
            #im = Image.open(mydir+'/'+info.loc[i, 'filename'])
            #info.loc[i, 'pixelsx'], info.loc[i, 'pixelsy'] = im.size
            
            im = skimage.data.imread(mydir + '/' + info.loc[i, 'filename'])
                
            info.loc[i, 'pixelsx'] = pixelsx = im.shape[1]
            info.loc[i, 'pixelsy'] = pixelsy = im.shape[0]
            
            info.loc[i, 'is_grayscale' ] = grayscale = (len(im.shape) == 2)
            
            # get sample dimensions
            yxRatio = (pixelsy/pixelsx)

            base_size = 200
            
            if yxRatio < 1.0:
                num_y_samp = base_size + 2
                num_x_samp = np.int( num_y_samp / yxRatio )
            else:
                num_x_samp = base_size + 2
                num_y_samp = np.int( num_x_samp * yxRatio)

            #print('x and y: %i %i' %(pixelsx, pixelsy))
            #sample
            x_space = np.round(np.linspace(0, pixelsx-1, num_x_samp)).astype(int)
            y_space = np.round(np.linspace(0, pixelsy-1, num_y_samp)).astype(int)

            samp_im = np.take(im, x_space, axis=1)
            samp_im = np.take(samp_im, y_space, axis=0)
            
            im = samp_im
                
            #convert grayscale to rgb
            if grayscale:
                temp = np.zeros([num_y_samp, num_x_samp, 3])
                #temp = np.zeros([pixelsy, pixelsx, 3])
                for ind in range(3):
                    temp[:,:,ind] = im
                im = temp    
              
            # rgb 
            info.loc[i, 'r_mean'] = im[:,:,0].mean()
            info.loc[i, 'g_mean'] = im[:,:,1].mean()
            info.loc[i, 'b_mean'] = im[:,:,2].mean()
            
            r_pcts = np.percentile(im[:,:,0],[10, 25, 50, 75, 90])
            b_pcts = np.percentile(im[:,:,1],[10, 25, 50, 75, 90])
            g_pcts = np.percentile(im[:,:,2],[10, 25, 50, 75, 90])
            
            for ind, percentile in enumerate([10, 25, 50, 75, 90]):
                info.loc[i, 'r_%.2i_pct' % percentile] = r_pcts[ind]
                info.loc[i, 'g_%.2i_pct' % percentile] = g_pcts[ind]
                info.loc[i, 'b_%.2i_pct' % percentile] = b_pcts[ind]
            
            info.loc[i, 'r_std'] = im[:,:,0].std()
            info.loc[i, 'g_std'] = im[:,:,1].std()
            info.loc[i, 'b_std'] = im[:,:,2].std()

            #if it is in RGBA, we don't handle it for now
            if (len(im.shape) == 3) and im.shape[2] == 4:
                print('%s is rgba' % info.loc[i, 'filename'])
            else:     
                # convert image to hue/saturation/value
                hsvImage = skimage.color.rgb2hsv(im)
                h_angles = hsvImage[:,:,0] * 2.0 * np.pi

                # average hue is converting the (0-1) hue value to unit vector coordinates
                # and finding the average direction
                sinSum = np.sin(h_angles).sum()
                cosSum = np.cos(h_angles).sum()
                info.loc[i, 'h_mean'] = np.arctan(sinSum/cosSum)

                # use the variance formula for a circulator distribution
                R2 = np.power(sinSum, 2) + np.power(cosSum, 2)
                numPixels = info.loc[i, 'pixelsx'] * info.loc[i, 'pixelsy']
                R_bar = np.sqrt(R2)/numPixels
                info.loc[i, 'h_var'] = 1 - R_bar

                info.loc[i, 's_mean'] = hsvImage[:,:,1].mean()
                info.loc[i, 's_std'] = hsvImage[:,:,1].std()

                info.loc[i, 'v_mean'] = hsvImage[:,:,2].mean()
                info.loc[i, 'v_std'] = hsvImage[:,:,2].std()
                
                s_pcts = np.percentile(im[:,:,1],[10, 25, 50, 75, 90])
                v_pcts = np.percentile(im[:,:,2],[10, 25, 50, 75, 90])

                for ind, percentile in enumerate([10, 25, 50, 75, 90]):
                    info.loc[i, 's_%.2i_pct' % percentile] = r_pcts[ind]
                    info.loc[i, 'v_%.2i_pct' % percentile] = g_pcts[ind]

                #complexity metrics
                
                #print('samp im shape' % samp_im.shape)
                
                # go from hue to angles
                hsvImageCopy = hsvImage.copy()
                
                samp_h_angles = hsvImage[:,:,0] * 2.0 * np.pi
                #samp_h_angles = samp_im[:,:,0] * 2.0 * np.pi
                hsvImageCopy[:,:,0] = samp_h_angles
                
                #compute gradients
                hsv_x_grad = np.zeros([num_y_samp-2, num_x_samp-2, 3])
                hsv_y_grad = np.zeros([num_y_samp-2, num_x_samp-2, 3])
                
                hsv_x_grad[:,:,0] = np.min(
                        [ np.abs(samp_im[1:-1,0:num_x_samp-2,0] - samp_im[1:-1,1:num_x_samp-1,0]),
                        2.0 * np.pi - np.abs( samp_im[1:-1,0:num_x_samp-2,0] - samp_im[1:-1,1:num_x_samp-1,0])
                        ],
                        axis = 0
                      )
                hsv_y_grad[:,:,0] = np.min(
                        [ np.abs(samp_im[0:num_y_samp-2,1:-1,0] - samp_im[1:num_y_samp-1,1:-1,0]),
                        2.0 * np.pi - np.abs( samp_im[0:num_y_samp-2,1:-1,0] - samp_im[1:num_y_samp-1,1:-1,0])
                        ],
                        axis = 0
                      )

                hsv_x_grad[:,:,1:3] = np.abs(samp_im[1:-1,0:num_x_samp-2,1:3] - samp_im[1:-1,1:num_x_samp-1,1:3])
                hsv_y_grad[:,:,1:3] = np.abs(samp_im[0:num_y_samp-2,1:-1,1:3] - samp_im[1:num_y_samp-1,1:-1,1:3])

                hsv_grad_mag = np.sqrt(np.power(hsv_x_grad,2) + np.power(hsv_y_grad,2))
                
                h_pcts = np.percentile(hsv_grad_mag[:,:,0], [5,10,25,50,75,90,95])
                s_pcts = np.percentile(hsv_grad_mag[:,:,1], [5,10,25,50,75,90,95])
                v_pcts = np.percentile(hsv_grad_mag[:,:,2], [5,10,25,50,75,90,95])
                
                for ind, percentile in enumerate([5, 10, 25, 50, 75, 90, 95]):
                    info.loc[i, 'h_cx_%.2i_pct' % percentile] = h_pcts[ind]
                    info.loc[i, 's_cx_%.2i_pct' % percentile] = s_pcts[ind]
                    info.loc[i, 'v_cx_%.2i_pct' % percentile] = v_pcts[ind]

            #im = cv2.imread(dir+'/'+info.loc[i, 'new_filename'])
            #info.loc[i, 'pixelsx'], info.loc[i, 'pixelsy'] = im.shape[0:2]
            info.loc[i, 'size_bytes'] = os.path.getsize(mydir+'/'+info.loc[i, 'filename']) 
            if (ind+1)%100==0:
                currentTime = time.clock()
                print('Job %s, finished %s of %s, total time = %.2f min' %
                     (jobNum, (ind+1), len(info), (currentTime - startTime)/60.0))
        except:
            print('job %s - error in %s' % (jobNum, mydir+'/'+info.loc[i, 'filename']))
            traceback.print_exc()
    
    currentTime = time.clock()
    print('- Job %s, finished getting image info, total time = %.2f min' % ( jobNum, (currentTime - startTime) / 60.0))
    
    return info

    #return info.rename(columns={'filename' : 'new_filename'})

### load image info

In [9]:
#load training info
train_info = pd.read_csv(r'/data/training_data/train_info.csv', index_col=0)
submission_info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col=0)

#shuffle and save info
#train_info = train_info.iloc[np.random.permutation(len(train_info))]
#submission_info = submission_info.iloc[np.random.permutation(len(submission_info))]

#train_info.to_csv(r'/data/training_data/train_info.csv')
#submission_info.to_csv(r'/data/test_data/submission_info.csv')

### create submission image info from submission pairs

In [6]:
# submission image data is a bunch of images pairs, but we may want to work with a list of test images instead

#submission_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')
#images = list(set(list(submission_info.image1.unique()) + list(submission_info.image2.unique())))
#submission_info = pd.DataFrame(data=images, columns=['filename'])
#submission_info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col = 0)

### load test pairs

In [8]:
test_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')

### make training pairs

In [17]:
k=5
train_pairs = makePairsParent(train_info, k)

Launching 5 jobs to make pairs
finished 100 of 320 artists
finished 100 of 316 artists
finished 100 of 316 artists
finished 100 of 316 artists
finished 100 of 316 artists
finished 200 of 320 artists
finished 200 of 316 artists
finished 200 of 316 artists
finished 200 of 316 artists
finished 200 of 316 artists
finished 300 of 320 artists
finished 300 of 316 artists
make pairs completed
finished 300 of 316 artists
make pairs completed
make pairs completed
finished 300 of 316 artists
make pairs completed
finished 300 of 316 artists
make pairs completed
making pairs complete, time taken = 0.31 minutes


### save training pairs

In [19]:
#save as csv
filepath = r'/data/training_data/train_pairs.csv'
print('saving to %s ' % filepath  )#    train_pairs.to_csv(filepath)

saving to /data/training_data/train_pairs.csv 


### load training pairs

In [14]:
#load pairs
train_pairs = pd.read_csv(r'/data/training_data/train_pairs.csv', index_col = 0)
#submission_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')

### compute features

In [152]:
print('Begin computing features')
startTime = time.time()

train_features = getFeaturesParent(False)

endTime = time.time()
print("Finished computing features, time taken = %.2f min" % ((endTime-startTime)/60.0) )

Begin computing features
Launching 35 jobs
Job 12, starting getting image info for images 684-740
Job 19, starting getting image info for images 1083-1139
Job 28, starting getting image info for images 1596-1652
Job 9, starting getting image info for images 513-569
Job 26, starting getting image info for images 1482-1538
Job 2, starting getting image info for images 114-170
Job 7, starting getting image info for images 399-455
Job 34, starting getting image info for images 1938-1999
Job 13, starting getting image info for images 741-797
Job 21, starting getting image info for images 1197-1253
Job 24, starting getting image info for images 1368-1424
Job 32, starting getting image info for images 1824-1880
Job 17, starting getting image info for images 969-1025
Job 33, starting getting image info for images 1881-1937
Job 30, starting getting image info for images 1710-1766
Job 15, starting getting image info for images 855-911
job 24 - error in /data/training_data/train/56414.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


Job 20, starting getting image info for images 1140-1196
Job 4, starting getting image info for images 228-284
Job 6, starting getting image info for images 342-398
Job 5, starting getting image info for images 285-341
Job 25, starting getting image info for images 1425-1481
Job 31, starting getting image info for images 1767-1823
Job 29, starting getting image info for images 1653-1709
Job 8, starting getting image info for images 456-512
Job 3, starting getting image info for images 171-227
Job 10, starting getting image info for images 570-626
Job 0, starting getting image info for images 0-56
Job 14, starting getting image info for images 798-854
Job 22, starting getting image info for images 1254-1310
Job 16, starting getting image info for images 912-968
Job 18, starting getting image info for images 1026-1082
Job 27, starting getting image info for images 1539-1595
Job 1, starting getting image info for images 57-113
Job 11, starting getting image info for images 627-683
Job 23,

Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


job 32 - error in /data/training_data/train/80711.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


job 27 - error in /data/training_data/train/48872.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


72168.jpg is rgba
job 9 - error in /data/training_data/train/92904.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


job 24 - error in /data/training_data/train/97185.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


job 3 - error in /data/training_data/train/77785.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


job 1 - error in /data/training_data/train/81985.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


job 19 - error in /data/training_data/train/71373.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


job 10 - error in /data/training_data/train/94058.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


job 2 - error in /data/training_data/train/100767.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


job 22 - error in /data/training_data/train/13224.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


job 29 - error in /data/training_data/train/64715.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


- Job 29, finished getting image info, total time = 0.12 min
job 22 - error in /data/training_data/train/93049.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


- Job 24, finished getting image info, total time = 0.12 min
- Job 33, finished getting image info, total time = 0.13 min
- Job 23, finished getting image info, total time = 0.12 min
- Job 26, finished getting image info, total time = 0.13 min
job 19 - error in /data/training_data/train/102574.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


- Job 31, finished getting image info, total time = 0.13 min
- Job 25, finished getting image info, total time = 0.13 min
- Job 34, finished getting image info, total time = 0.14 min
- Job 15, finished getting image info, total time = 0.13 min
- Job 22, finished getting image info, total time = 0.13 min
job 21 - error in /data/training_data/train/87321.jpg


Traceback (most recent call last):
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles
IndexError: too many indices for array


- Job 32, finished getting image info, total time = 0.14 min
- Job 18, finished getting image info, total time = 0.13 min
- Job 20, finished getting image info, total time = 0.13 min
- Job 28, finished getting image info, total time = 0.15 min
- Job 17, finished getting image info, total time = 0.14 min
- Job 19, finished getting image info, total time = 0.14 min
- Job 13, finished getting image info, total time = 0.14 min
- Job 10, finished getting image info, total time = 0.13 min
job 16 - error in /data/training_data/train/23571.jpg


Traceback (most recent call last):
IndexError: too many indices for array
  File "<ipython-input-151-9c9e49b38fe7>", line 213, in getFeaturesWorker
    samp_im[:,:,0] = samp_h_angles


- Job 27, finished getting image info, total time = 0.14 min
- Job 14, finished getting image info, total time = 0.13 min
- Job 1, finished getting image info, total time = 0.12 min
- Job 3, finished getting image info, total time = 0.13 min
- Job 7, finished getting image info, total time = 0.13 min
- Job 30, finished getting image info, total time = 0.15 min
- Job 16, finished getting image info, total time = 0.14 min
- Job 12, finished getting image info, total time = 0.15 min
- Job 2, finished getting image info, total time = 0.14 min
- Job 9, finished getting image info, total time = 0.15 min
- Job 5, finished getting image info, total time = 0.14 min
- Job 4, finished getting image info, total time = 0.14 min
- Job 11, finished getting image info, total time = 0.15 min
- Job 6, finished getting image info, total time = 0.14 min
- Job 21, finished getting image info, total time = 0.16 min
- Job 8, finished getting image info, total time = 0.15 min
- Job 0, finished getting image i

In [142]:
train_features.isnull().sum()[train_features.isnull().sum()!=0]

style           22
genre           22
date           505
h_mean           1
h_var            1
s_mean           1
s_std            1
s_10_pct         1
s_25_pct         1
s_50_pct         1
s_75_pct         1
s_90_pct         1
v_mean           1
v_std            1
v_10_pct         1
v_25_pct         1
v_50_pct         1
v_75_pct         1
v_90_pct         1
h_cx_05_pct      1
h_cx_10_pct      1
h_cx_25_pct      1
h_cx_50_pct      1
h_cx_75_pct      1
h_cx_90_pct      1
h_cx_95_pct      1
s_cx_05_pct      1
s_cx_10_pct      1
s_cx_25_pct      1
s_cx_50_pct      1
s_cx_75_pct      1
s_cx_90_pct      1
s_cx_95_pct      1
v_cx_05_pct      1
v_cx_10_pct      1
v_cx_25_pct      1
v_cx_50_pct      1
v_cx_75_pct      1
v_cx_90_pct      1
v_cx_95_pct      1
dtype: int64

### save features

In [38]:
#test_features.to_csv(r'/data/test_data/test_features.csv')
#test_features_sorted.to_csv(r'/data/test_data/test_features.csv')
#y = pd.read_csv(r'/data/test_data/test_features.csv', index_col=0)
#test_features_sorted = test_features_sorted.drop('Unnamed: 0',1)

In [43]:
#train_features.to_csv(r'/data/training_data/train_features_40000_end.csv')
#test_features_sorted.to_csv(r'/data/test_data/test_features.csv')
#y = pd.read_csv(r'/data/test_data/test_features.csv', index_col=0)
#test_features_sorted = test_features_sorted.drop('Unnamed: 0',1)

### load training features

In [20]:
#load features
train_features_0 = pd.read_csv(r'/data/training_data/train_features_0_20000.csv', index_col = 0)
train_features_1 = pd.read_csv(r'/data/training_data/train_features_20000_40000.csv', index_col = 0)
train_features_2 = pd.read_csv(r'/data/training_data/train_features_40000_end.csv', index_col = 0)

In [21]:
train_features = pd.concat([train_features_0, train_features_1, train_features_2])

### load test features

In [22]:
test_features = pd.read_csv(r'/data/test_data/test_features.csv', index_col = 0)

In [23]:
# fill nulls
cols = test_features.columns.copy()

cols.drop('is_grayscale')

test_features = test_features[cols].fillna(test_features[cols].mean())

### additional processing on feature - remove extra columns

In [24]:
# saved features are straight from the feature functions with no handling of nulls, etc
# these have to be addressed prior to training/predicting

#rgb_features = ['r_mean', 'r_med', 'r_std', 'g_mean', 'g_med',
#                'g_std', 'b_mean', 'b_med', 'b_std',]

#size_features =  [ 'pixelsx',
#       'pixelsy', 'size_bytes' ]


# take out unnecessary columns
def removeColumns(features):
    feature_names = ['pixelsx', 'pixelsy', 'size_bytes',
                     'r_mean', 'r_med', 'r_std',
                     'g_mean', 'g_med', 'g_std',
                     'b_mean', 'b_med', 'b_std',
                     'h_mean', 'h_var',
                     's_mean', 's_med', 's_std',
                     'v_mean', 'v_med', 'v_std',
                     'is_grayscale']

    features = features[ ['filename'] + feature_names ]
    
    return features

In [25]:
train_features = removeColumns(train_features)

In [15]:
train_features.columns

Index(['filename', 'pixelsx', 'pixelsy', 'size_bytes', 'r_mean', 'r_med',
       'r_std', 'g_mean', 'g_med', 'g_std', 'b_mean', 'b_med', 'b_std',
       'h_mean', 'h_var', 's_mean', 's_med', 's_std', 'v_mean', 'v_med',
       'v_std', 'is_grayscale'],
      dtype='object')

### feature processing - add image features

In [26]:
def addImageFeatures(features):
    """modifies in place"""
    features['aspect_ratio'] = features['pixelsx']/features['pixelsy']
    features['size_per_pixel'] = features['size_bytes']/features['pixelsx']/features['pixelsy']

In [27]:
addImageFeatures(train_features)

In [40]:
addImageFeatures(test_features)

### Join features to pairs

In [28]:
#join pair data to image features

def joinPairsToFeatures(pairs, features):
    
    feature_base_names = ['pixelsx', 'pixelsy', 'size_bytes', 'aspect_ratio', 'size_per_pixel',
                     'r_mean', 'r_med', 'r_std',
                     'g_mean', 'g_med', 'g_std',
                     'b_mean', 'b_med', 'b_std',
                     'h_mean', 'h_var',
                     's_mean', 's_med', 's_std',
                     'v_mean', 'v_med', 'v_std', 
                     'is_grayscale' ]

    col_dict_1 = {}
    col_dict_2 = {}

    for feature in feature_base_names:
        col_dict_1[feature] = '%s_1' % feature
        col_dict_2[feature] = '%s_2' % feature

    pairs = pairs.merge(features,
                        left_on='image1', right_on='filename')
    pairs.rename( columns = col_dict_1,
                          inplace=True)
    pairs = pairs.merge(features,
                        left_on='image2', right_on='filename')
    pairs.rename( columns = col_dict_2,
                          inplace=True)
    return pairs

In [29]:
train_pairs = joinPairsToFeatures(train_pairs, train_features)

In [49]:
train_pairs.columns

Index(['artist1', 'image1', 'artist2', 'image2', 'sameArtist', 'foldNum',
       'filename_x', 'pixelsx_1', 'pixelsy_1', 'size_bytes_1', 'r_mean_1',
       'r_med_1', 'r_std_1', 'g_mean_1', 'g_med_1', 'g_std_1', 'b_mean_1',
       'b_med_1', 'b_std_1', 'h_mean_1', 'h_var_1', 's_mean_1', 's_med_1',
       's_std_1', 'v_mean_1', 'v_med_1', 'v_std_1', 'is_grayscale_1',
       'aspect_ratio_1', 'size_per_pixel_1', 'filename_y', 'pixelsx_2',
       'pixelsy_2', 'size_bytes_2', 'r_mean_2', 'r_med_2', 'r_std_2',
       'g_mean_2', 'g_med_2', 'g_std_2', 'b_mean_2', 'b_med_2', 'b_std_2',
       'h_mean_2', 'h_var_2', 's_mean_2', 's_med_2', 's_std_2', 'v_mean_2',
       'v_med_2', 'v_std_2', 'is_grayscale_2', 'aspect_ratio_2',
       'size_per_pixel_2'],
      dtype='object')

In [48]:
test_pairs = joinPairsToFeatures(test_pairs, test_features)

In [49]:
#pairs get out of order after merging, so re-sort
test_pairs = test_pairs.sort_values(by='index')
test_pairs.set_index('index', inplace = True)

### remove nulls in training pairs

In [30]:
# we remove the nulls after the join, could also be done before
train_pairs = train_pairs[~train_pairs['pixelsx_1'].isnull()]
train_pairs = train_pairs[~train_pairs['pixelsx_2'].isnull()]
train_pairs = train_pairs[~train_pairs['size_bytes_1'].isnull()]
train_pairs = train_pairs[~train_pairs['size_bytes_2'].isnull()]
train_pairs = train_pairs[~train_pairs['h_mean_1'].isnull()]
train_pairs = train_pairs[~train_pairs['h_mean_2'].isnull()]

#print(train_pairs.isnull().sum())
print('%i nulls' % (train_pairs.isnull().sum().sum()))

0 nulls


### check no nulls in submission

In [21]:
#print(test_pairs.isnull().sum())
print(test_pairs.isnull().sum().sum())

NameError: name 'test_pairs' is not defined

### feature processing - add diff features to pairs

In [31]:
## try training on diffs instead of aboslute values

def addPairFeatures(pairs):
    diff_feature_base = [ 
                     'r_mean', 'r_med', 'r_std',
                     'g_mean', 'g_med', 'g_std',
                     'b_mean', 'b_med', 'b_std',
                     'h_mean', 'h_var',
                     's_mean', 's_med', 's_std',
                     'v_mean', 'v_med', 'v_std', ]

    diff_feature_names = [ temp_feature + '_diff' for temp_feature in diff_feature_base]

    for diff_feature in diff_feature_base:
        pairs[diff_feature + '_diff'] = ( pairs[ diff_feature + '_1'] - pairs[ diff_feature + '_2'] ).abs()

In [51]:
addPairFeatures(test_pairs)

NameError: name 'test_pairs' is not defined

In [32]:
addPairFeatures(train_pairs)

### results helper

In [33]:
def computePredictStats(y_prob, y_true, threshold = 0.5):
    """ compute accuracy, precision, recall, negative precision, specificity, and auc roc
        Args:
            y_prob: array of floats from 0.0 - 1.0
            y_true: array of booleans
            threshold: true/false threshold value, between 0.0-1.0
        Returns:
            dict of classification metrics
    """
    # y_pred = np.array([True, True, False, False])
    # y_true = np.array([True, True, True, True])
    y_pred = y_prob > threshold
    
    total = len(y_prob)
    true_pos = sum( (y_pred == True) & (y_true == True) )
    true_neg = sum( (y_pred == False) & (y_true == False) )
    false_pos = sum( (y_pred == True) & (y_true == False) )
    false_neg = sum( (y_pred == False ) & (y_true == True) )
    
    accuracy = (true_pos + true_neg) / total
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    npp = true_neg / (true_neg + false_neg) #negative prediction value
    specificity = true_neg / (true_neg + false_pos)
    roc = roc_auc_score(y_true, y_prob)
    
    return { 'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'npp': npp,
            'specificity': specificity,
            'roc': roc,
            'true_pos': true_pos,
            'true_neg': true_neg,
            'false_pos': false_pos,
            'false_neg': false_neg,
           }

### split data into X and Y

In [69]:
def featureHelper(feature_name):
    include = ( 'foldNum' in feature_name
               or 'pixelsx' in feature_name
               or 'pixelsy' in feature_name
               #or 'aspect_ratio' in feature_name
               #or '_diff' in feature_name
               or 'size_bytes' in feature_name
               or 'size_per_pixel' in feature_name
               or 'h_mean_diff' in feature_name
               or 'h_var_diff' in feature_name
               or 's_mean_diff' in feature_name
               or 's_med_diff' in feature_name
               or 's_std_diff' in feature_name
               or 'v_mean_diff' in feature_name
               or 'v_med_diff' in feature_name
               or 'v_std_diff' in feature_name
               or 'r_' in feature_name
               or 'g_' in feature_name
               or 'b_' in feature_name
              )
    
    if include:
        return True
    else:
        return False

In [70]:
#for foldNum, train_pairs in enumerate(train_pairs_list):
#    train_pairs_list[foldNum] = train_pairs.iloc[np.random.permutation(len(train_pairs))]

# get list of X columns, they are ones with '_1' or '_2' in the name
allCols = train_pairs.columns
isX = list(map(featureHelper, allCols))
X_columns = allCols[isX]

train_X = train_pairs[X_columns]
train_Y = train_pairs[['sameArtist', 'foldNum']]

In [63]:
allCols = test_pairs.columns
isX = list(map(featureHelper, allCols))
X_columns = allCols[isX]

test_X = test_pairs[X_columns]

NameError: name 'test_pairs' is not defined

In [71]:
train_X.columns

Index(['foldNum', 'pixelsx_1', 'pixelsy_1', 'size_bytes_1', 'r_mean_1',
       'r_med_1', 'r_std_1', 'g_mean_1', 'g_med_1', 'g_std_1', 'b_mean_1',
       'b_med_1', 'b_std_1', 'h_var_1', 'size_per_pixel_1', 'pixelsx_2',
       'pixelsy_2', 'size_bytes_2', 'r_mean_2', 'r_med_2', 'r_std_2',
       'g_mean_2', 'g_med_2', 'g_std_2', 'b_mean_2', 'b_med_2', 'b_std_2',
       'h_var_2', 'size_per_pixel_2', 'r_mean_diff', 'r_med_diff',
       'r_std_diff', 'g_mean_diff', 'g_med_diff', 'g_std_diff', 'b_mean_diff',
       'b_med_diff', 'b_std_diff', 'h_mean_diff', 'h_var_diff', 's_mean_diff',
       's_med_diff', 's_std_diff', 'v_mean_diff', 'v_med_diff', 'v_std_diff'],
      dtype='object')

In [117]:
test_X.columns

NameError: name 'test_X' is not defined

In [154]:
pd.isnull(test_X).sum().sum()

0

### k-fold CV

In [83]:
def kFoldCV(train_X, train_Y, k, numFoldsToTest):
    """ Perform k-fold cross validation. Will modify train_pairs (shuffle rows)
        Args:
            train_pairs: Dataframe containing training image pairs with features. Should have no nulls.
            k: k, at least 2
            numFoldsToTest: how many folds to actually test, at most k
        Returns:
            dataframe with CV results
    """
    numFoldsToTest = min(k, numFoldsToTest)
    
    # shuffle rows
    #train_pairs = train_pairs.iloc[np.random.permutation(len(train_pairs))]

    # get list of X columns, they are ones with '_1' or '_2' in the name
    #allCols = train_pairs.columns
    #isX = list(map(lambda x: ('_1' in x) or ('_2' in x), allCols))
    #X_columns = allCols[isX]
    
    # split X and Y data
    #CV_X = train_pairs[X_columns]
    #CV_Y = train_pairs['sameArtist']
    
    # define which indices belong to each fold
    foldLocsList = [] #list of Index objects, one for each fold
    
    #foldSize = int(len(train_X)/k)
    #    
    #for foldNum in range(k):
    #    if foldNum == k-1:
    #        foldLocsList.append( train_X.index[foldNum*foldSize : len(train_X)] )
    #    else:
    #        foldLocsList.append( train_X.index[foldNum*foldSize : (foldNum+1)*(foldSize) ] )
    
    # set up dataframe for collecting results
    columnsList = list(itertools.product(('train', 'test'), ('roc', 'precision', 'recall', 'npp', 'specificity')))
    results = pd.DataFrame(index = range(numFoldsToTest), columns = pd.MultiIndex.from_tuples(columnsList))
    
    # test each fold
    for testFold in range(numFoldsToTest):
        
        CV_training_folds = np.arange(k)
        CV_training_folds = np.delete(CV_training_folds, np.where(CV_training_folds == testFold))

        # set up Xs
        CV_train_X = train_X[ train_X['foldNum'].isin(CV_training_folds) ]   
        CV_test_X = train_X[ train_X['foldNum'] == testFold ]   
        
        CV_train_X = CV_train_X.drop(['foldNum'], axis=1)
        CV_test_X = CV_test_X.drop(['foldNum'], axis=1)

        # set up Ys
        CV_train_Y = train_Y[train_X['foldNum'].isin(CV_training_folds) ]
        CV_test_Y = train_Y[ train_Y['foldNum'] == testFold ]   
        
        CV_train_Y = CV_train_Y['sameArtist']
        CV_test_Y = CV_test_Y['sameArtist']

        shuffleInds = np.random.permutation(len(CV_train_X))
        
        CV_train_X = CV_train_X.iloc[0:300000]
        CV_train_Y = CV_train_Y.iloc[0:300000]
        
        # fit model
        n_jobs = max( multiprocessing.cpu_count() - 2, 1)
        #clf = RandomForestClassifier(n_estimators=500, n_jobs=n_jobs, min_samples_split=15, oob_score = True)
        #clf = ExtraTreesClassifier(n_estimators=300, n_jobs=n_jobs, min_samples_split=5)
        clf = GradientBoostingClassifier(n_estimators = 5)

        start = time.time()

        print('starting fit with %s jobs.. ' % n_jobs, end='')

        clf.fit(CV_train_X, CV_train_Y)

        end = time.time()
                               
        print('total training time: %s' % (end - start) )

        # get in-sample and out-of-sample results
                               
        pred_train = clf.predict_proba(CV_train_X)[:,1]
        train_results = computePredictStats( pred_train, CV_train_Y)

        pred_test = clf.predict_proba(CV_test_X)[:,1]
        test_results = computePredictStats( pred_test, CV_test_Y)
       
        for stat in ('roc', 'accuracy', 'precision', 'recall', 'npp', 'specificity'):
            results.loc[testFold, ('train', stat)] = train_results[stat]
            results.loc[testFold, ('test', stat)] = test_results[stat]
        
        #results.loc[testFold, ('train', 'oob')] = clf.oob_score_
        
        columnToImportance = list(zip(CV_train_X.columns, clf.feature_importances_))
        columnToImportance = sorted(columnToImportance, key=lambda x: x[1], reverse=True)
        importanceDF = pd.DataFrame(data = columnToImportance)
            
    return results, importanceDF    

### run k-fold cv

In [84]:
k = 5
numFoldsToTest = 2

results, importance = kFoldCV(train_X, train_Y, k, numFoldsToTest)

#print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'oob'),  ('test', 'accuracy')]])
#print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'oob'),  ('test', 'accuracy')]].mean())
print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'accuracy'), ('test', 'accuracy')]])
print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'accuracy'), ('test', 'accuracy')]].mean())
#print(importance)

starting fit with 38 jobs.. total training time: 6.677391529083252
starting fit with 38 jobs.. total training time: 6.667193174362183
      train      test     train      test
        roc       roc  accuracy  accuracy
0  0.642529  0.609495  0.600230  0.578916
1  0.626275  0.630901  0.590447  0.594956
train  roc         0.634402
test   roc         0.620198
train  accuracy    0.595338
test   accuracy    0.586936
dtype: float64


In [76]:
print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'accuracy'), ('test', 'accuracy')]])
print(results[ [('train', 'roc'), ('test', 'roc'), ('train', 'accuracy'), ('test', 'accuracy')]].mean())
print(importance)

      train      test    train      test
        roc       roc accuracy  accuracy
0  0.761061  0.667766  0.68926  0.620130
1  0.723338  0.664235  0.65366  0.618091
train  roc         0.742199
test   roc         0.666001
train  accuracy    0.671460
test   accuracy    0.619111
dtype: float64
                   0         1
0       size_bytes_2  0.053831
1          pixelsx_2  0.051941
2          pixelsx_1  0.051854
3   size_per_pixel_2  0.051589
4       size_bytes_1  0.042519
5          pixelsy_2  0.040967
6         b_med_diff  0.039091
7          pixelsy_1  0.038525
8        h_mean_diff  0.035571
9         s_med_diff  0.035206
10  size_per_pixel_1  0.033904
11        v_med_diff  0.032000
12        h_var_diff  0.030899
13        v_std_diff  0.029114
14           r_med_2  0.028642
15          b_mean_2  0.028509
16       v_mean_diff  0.026880
17           r_std_1  0.026450
18          b_mean_1  0.020881
19           h_var_1  0.020002
20          r_mean_2  0.019903
21        r_std_diff  0.018

In [63]:
type(train_Y_list[0])

pandas.core.series.Series

In [112]:
len(train_X)

1997446

### full training

In [114]:
clf = RandomForestClassifier(n_estimators=100, min_samples_split=15, n_jobs=30)

start = time.time()

print('starting fit')
#excluding the patient_id column from the fit and prediction
clf.fit(train_X, train_Y)

end = time.time()

print('total training time: %s' % (end - start) )

#columnsList = list(itertools.product(('train', 'test'), ('roc', 'precision', 'recall', 'npp', 'specificity')))
#results = pd.DataFrame(index = range(numFoldsToTest), columns = pd.MultiIndex.from_tuples(columnsList))


starting fit
total training time: 37.411495208740234


### save model

In [115]:
##save model

start = time.clock()

with open('my_dumped_classifier.pkl', 'wb') as fid:
    pickle.dump(clf, fid) 

end = time.clock()
print('total saving time: %s' % (end - start) )

total saving time: 3.6660020000003897


### load model

In [28]:
start = time.clock()

# load it again
with open('my_dumped_classifier.pkl', 'rb') as fid:
    clf = pickle.load(fid)

end = time.clock()
print('total loading time: %s' % (end - start) )



total loading time: 0.007420999999339983


## test on test set

In [155]:
test_predictions = clf.predict_proba(test_X)[:,1]

 ## prepare submission
    

In [166]:
submission = pd.DataFrame(index = test_pairs.index)
submission['sameArtist'] = test_predictions
submission.to_csv('/data/notebook/notebooks/my_submission_01.csv', index=True)

In [158]:
submission = pd.DataFrame(index = test_pairs.index)
submission['sameArtist'] = test_predictions

In [162]:
submission[0:5]

Unnamed: 0_level_0,sameArtist
index,Unnamed: 1_level_1
0,0.146189
1,0.915134
2,0.062293
3,0.395387
4,0.117785


21916047