In [1]:
%matplotlib inline

In [2]:
%load_ext line_profiler

In [3]:
import os
import itertools
import random
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import multiprocessing
import time
import traceback

from PIL import Image
import skimage.data
import skimage.exposure
import skimage.color

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

import scipy.misc

## make pairs

In [4]:
def make_pairs(train_info):
    """Creates training data from the supplied training image information file"""
    artists = train_info.artist.unique()

    n = train_info.groupby('artist').size()
    n = (2*n**2).sum() 
    t = pd.DataFrame(np.zeros((n, 4)), columns=['artist1', 'image1', 'artist2', 'image2'])
    i = 0
    j = 0
    
    for artist in artists:

        #artist info is Ax2 matrix of artist, filename
        artistInfo = train_info[train_info.artist==artist][['artist', 'filename']].values
        
        use = train_info[train_info.artist != artist ].index.values
        np.random.shuffle(use)
        
        #nm = np.min([a.shape[0]**2, train_info[train_info.artist != m].shape[0] ])
        numExamples = np.min([len(artistInfo)**2, sum(train_info.artist != artist) ])
        use = use[0:numExamples]
        
        #diffArtistInfo a Bx2 matrix of artist, filename
        diffArtistInfo = train_info[train_info.artist!=artist][['artist', 'filename']].ix[use, :].values

        
        toAdd_SameArtist = pd.DataFrame(np.concatenate([  np.repeat(artistInfo[:, 0], len(artistInfo)).reshape((-1,1)), #artist
                                            np.repeat(artistInfo[:, 1],
                                            artistInfo.shape[0]).reshape((-1,1)),
                                            np.tile(artistInfo, (len(artistInfo), 1))],
                                         axis=1),
                          columns=['artist1', 'image1', 'artist2', 'image2'])
        toAdd_SameArtist = toAdd_SameArtist.loc[0:numExamples, :]
        
        toAdd_DiffArtist = pd.DataFrame(np.concatenate([np.tile(artistInfo,
                                                  (len(artistInfo), 1))[0:len(diffArtistInfo), :],
                                          diffArtistInfo], axis=1),
                          columns=['artist1', 'image1', 'artist2', 'image2'])
        toAdd_DiffArtist = toAdd_DiffArtist.loc[0:numExamples, :]
        
        #print(j, i, a2.shape[0], b2.shape[0])
        #print(b2)
        t.iloc[i:i+len(toAdd_SameArtist), :] = toAdd_SameArtist.values
        t.iloc[i+len(toAdd_SameArtist):i+len(toAdd_SameArtist)+len(toAdd_DiffArtist), :] = toAdd_DiffArtist.values
        
        i += len(toAdd_SameArtist) + len(toAdd_DiffArtist)
        j += 1
        if j%100==0:
            print('finished %s of %s artists'%(j, len(artists)))

    print('make pairs completed')
    t = t[~t.image2.isin([np.nan, 0])]
    return t[t.image1 > t.image2]

## Prep Image List

In [4]:
# def prepImageList(image_info, isTest):
#     """given the train_image_info or submission_info, returns a dataframe with a single column containing filenames of images"""
#     if isTest:
#         images = list(set(list(image_info.image1.unique()) + list(image_info.image2.unique())))
#         result = pd.DataFrame(np.array(images).reshape((-1, 1)), columns = ['filename'])
#     else:
#         result = pd.DataFrame(columns = ['filename'], data = image_info['filename'] )
    
#     return result

## Get Features Parent

In [4]:
def getFeaturesParent(isTest):
    """Creates features for training and test images. This function utilizes multiprocessing.
    Args:
        isTest: bool to fetch training or test data
    Returns:
        pandas DataFrame containing features
    """
    
    num_cores = multiprocessing.cpu_count()
    num_cores = 30
    argsList = []
    
    for jobNum in range(num_cores):
        argsList.append((isTest, jobNum, num_cores))
        

    print('Launching %s jobs' % (num_cores))
    startTime = time.time()
    
    pool = multiprocessing.Pool(num_cores)
    image_features_list = pool.starmap(getFeaturesWorker, argsList)
    pool.close()
    pool.join()
    
    image_features = pd.concat(image_features_list)
    
    endTime = time.time()
    
    print("collecting features complete, time taken = %.2f minutes" % ((endTime - startTime) / 60.0))
    return image_features

## Get Features Worker

In [6]:
im = skimage.data.imread(r'/data/training_data/train/73565.jpg')
hsvImage = skimage.color.rgb2hsv(im)  

#angles = hsvImage[:,:,0] * 2.0 * np.pi

n = len(angles)
        
sinSum = np.sin(angles).sum()
cosSum = np.cos(angles).sum()
h_mean = np.arctan(sinSum/cosSum)
R2 = np.power(sinSum, 2) + np.power(cosSum, 2)
R_bar = np.sqrt(R2)/n
h_var = 1 - R_bar
h_std = np.sqrt(-2*np.log(R_bar))

NameError: name 'angles' is not defined

In [93]:
im = skimage.data.imread(r'/data/training_data/train/56703.jpg')

In [106]:
pilim =  Image.open('/data/training_data/train/73565.jpg')

In [169]:
np.sin(hsvImage[:,:,0]*2.0*np.pi).sum()
np.cos(hsvImage[:,:,0]*2.0*np.pi).sum()

myHues = np.array([0, .5, .5, .5, 0, 0])
myAngles = myHues*2.0*np.pi

sinSum = np.sin(myAngles).sum()
cosSum = np.cos(myAngles).sum()

myAvgH = np.arctan(sinSum/cosSum)

angles = myAngles

In [12]:
for a,b in enumerate(train_info.index):
    print(a,b)

0 35050
1 18697
2 56709
3 21390
4 38483
5 62757
6 28155
7 72943
8 63737
9 40105


In [17]:
def getFeaturesWorker(isTest, jobNum, totalJobs):
    """Child function for computing image features, only to be called by getFeaturesParent
    Args:
        isTest: whether to compute features for test or training images
        jobNum: which job number this is
        totalJobs: total number of jobs
    Returns:
        pandas dataframe containing a data for a fraction of the training or test images
    """
    if isTest:
        mydir = r'/data/test_data/test'
        info = pd.read_csv(r'/data/test_data/submission_info.csv')
    else:
        mydir = r'/data/training_data/train'
        info = pd.read_csv(r'/data/training_data/train_info.csv')
    
    info = info.iloc[20000:]
    
    totalNumImages = len(info)
    
    chunkSize = np.int(totalNumImages/totalJobs)
    
    if jobNum == totalJobs - 1:
        startInd = jobNum * chunkSize
        endInd = totalNumImages
    else:
        startInd = jobNum * chunkSize
        endInd = (jobNum + 1) * chunkSize
        
    info = info.iloc[startInd:endInd]
    
    info['pixelsx'] = np.nan
    info['pixelsy'] = np.nan
    info['size_bytes'] = np.nan
    
    info['r_mean'] = np.nan
    info['r_med'] = np.nan
    info['r_std'] = np.nan
    
    info['g_mean'] = np.nan
    info['g_med'] = np.nan
    info['g_std'] = np.nan
    
    info['b_mean'] = np.nan
    info['b_med'] = np.nan
    info['b_std'] = np.nan
    
    info['h_mean'] = np.nan
    info['h_var'] = np.nan
    
    info['s_mean'] = np.nan
    info['s_std'] = np.nan
    info['s_med'] = np.nan
    
    info['v_mean'] = np.nan
    info['v_std'] = np.nan
    info['v_med'] = np.nan
    
    info['is_grayscale'] = np.nan
      
    print('Job %s, starting getting image info for images %s-%s' % (jobNum, startInd, endInd-1))
    startTime = time.clock()
    
    for ind, i in enumerate(info.index.values):
        try:       
            #im = Image.open(mydir+'/'+info.loc[i, 'filename'])
            #info.loc[i, 'pixelsx'], info.loc[i, 'pixelsy'] = im.size
            
            im = skimage.data.imread(mydir + '/' + info.loc[i, 'filename'])
            
            #if it is in RGBA, convert to RGB
            if (len(im.shape) == 3) and im.shape[2] == 4:
                print('%s is rgba' % info.loc[i, 'filename'])
                
            info.loc[i, 'pixelsx'] = im.shape[1]
            info.loc[i, 'pixelsy'] = im.shape[0]
            
            grayscale = (len(im.shape) == 2)
            
            if grayscale:
                info.loc[i, 'r_mean'] = im.mean()
                info.loc[i, 'g_mean'] = info.loc[i, 'b_mean'] = info.loc[i, 'r_mean']
                
                info.loc[i, 'r_med'] = np.median(im)
                info.loc[i, 'g_med'] = info.loc[i, 'b_med'] = info.loc[i, 'r_med']
                
                info.loc[i, 'r_std'] = im.std()
                info.loc[i, 'g_std'] = info.loc[i, 'b_std'] = info.loc[i, 'r_std']
                
                info.loc[i, 'is_grayscale' ] = 1
                
                info.loc[i, 'h_mean'] = 0
                info.loc[i, 'h_var'] = 0
                info.loc[i, 's_mean'] = 0
                info.loc[i, 's_std'] = 0
                info.loc[i, 's_med'] = 0
                info.loc[i, 'v_mean'] = info.loc[i, 'r_mean']/256.0
                info.loc[i, 'v_std'] = info.loc[i, 'r_std']/256.0
                info.loc[i, 'v_med'] = info.loc[i, 'r_med']/256.0
                
            else:
                info.loc[i, 'r_mean'] = im[:,:,0].mean()
                info.loc[i, 'g_mean'] = im[:,:,1].mean()
                info.loc[i, 'b_mean'] = im[:,:,2].mean()
                info.loc[i, 'r_med'] = np.median(im[:,:,0])
                info.loc[i, 'g_med'] = np.median(im[:,:,1])
                info.loc[i, 'b_med'] = np.median(im[:,:,2])
                info.loc[i, 'r_std'] = im[:,:,0].std()
                info.loc[i, 'g_std'] = im[:,:,1].std()
                info.loc[i, 'b_std'] = im[:,:,2].std()
                info.loc[i, 'is_grayscale' ] = 0
                
                # convert image to hue/saturation/value
                hsvImage = skimage.color.rgb2hsv(im)
                angles = hsvImage[:,:,0] * 2.0 * np.pi

                # average hue is converting the (0-1) hue value to unit vector coordinates
                # and finding the average direction
                sinSum = np.sin(angles).sum()
                cosSum = np.cos(angles).sum()
                info.loc[i, 'h_mean'] = np.arctan(sinSum/cosSum)
                                
                # use the variance formula for a circulator distribution
                R2 = np.power(sinSum, 2) + np.power(cosSum, 2)
                numPixels = info.loc[i, 'pixelsx'] * info.loc[i, 'pixelsy']
                R_bar = np.sqrt(R2)/numPixels
                info.loc[i, 'h_var'] = 1 - R_bar
                                
                info.loc[i, 's_mean'] = hsvImage[:,:,1].mean()
                info.loc[i, 's_std'] = np.median(hsvImage[:,:,1])
                info.loc[i, 's_med'] = hsvImage[:,:,1].std()
                info.loc[i, 'v_mean'] = hsvImage[:,:,2].mean()
                info.loc[i, 'v_std'] = np.median(hsvImage[:,:,2])
                info.loc[i, 'v_med'] = hsvImage[:,:,2].std()
                
            #im = cv2.imread(dir+'/'+info.loc[i, 'new_filename'])
            #info.loc[i, 'pixelsx'], info.loc[i, 'pixelsy'] = im.shape[0:2]
            info.loc[i, 'size_bytes'] = os.path.getsize(mydir+'/'+info.loc[i, 'filename']) 
            if (ind+1)%100==0:
                currentTime = time.clock()
                print('Job %s, finished %s of %s, total time = %.2f min' %
                     (jobNum, (ind+1), len(info), (currentTime - startTime)/60.0))
        except:
            print('job %s - error in %s' % (jobNum, mydir+'/'+info.loc[i, 'filename']))
            traceback.print_exc()
    
    currentTime = time.clock()
    print('- Job %s, finished getting image info, total time = %.2f min' % ( jobNum, (currentTime - startTime) / 60.0))
    
    return info

    #return info.rename(columns={'filename' : 'new_filename'})

### load image info

In [10]:
#load training info
train_info = pd.read_csv(r'/data/training_data/train_info.csv', index_col=0)
submission_info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col=0)

#shuffle and save info
#train_info = train_info.iloc[np.random.permutation(len(train_info))]
#submission_info = submission_info.iloc[np.random.permutation(len(submission_info))]

#train_info.to_csv(r'/data/training_data/train_info.csv')
#submission_info.to_csv(r'/data/test_data/submission_info.csv')

In [9]:
len(train_info)

NameError: name 'train_info' is not defined

### create submission image info from submission pairs

In [6]:
# submission image data is a bunch of images pairs, but we may want to work with a list of test images instead

#submission_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')
#images = list(set(list(submission_info.image1.unique()) + list(submission_info.image2.unique())))
#submission_info = pd.DataFrame(data=images, columns=['filename'])
#submission_info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col = 0)

### make training pairs

In [None]:
#make training pairs
#train_pairs = make_pairs(train_image_info)
#train_pairs[ 'sameArtist' ] = train_pairs[ 'artist1' ] == train_pairs[ 'artist2' ]

### save training pairs

In [None]:
#save as csv
#train_pairs.to_csv(r'/data/training_data/train_pairs.csv')

### load pairs

In [9]:
#load pairs
train_pairs = pd.read_csv(r'/data/training_data/train_pairs.csv', index_col = 0)
#submission_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')

### shuffle pairs, reduce number if necessary

In [10]:
train_pairs = train_pairs.iloc[np.random.permutation(len(train_pairs))]
train_pairs = train_pairs.iloc[0:100]

### compute features

In [18]:
print('Begin computing features')
startTime = time.time()

train_features = getFeaturesParent(False)

endTime = time.time()
print("Finished computing features, time taken = %.2f min" % ((endTime-startTime)/60.0) )

Begin computing features
Launching 30 jobs
Job 9, starting getting image info for images 17829-19809
Job 20, starting getting image info for images 39620-41600
Job 16, starting getting image info for images 31696-33676
Job 10, starting getting image info for images 19810-21790
Job 22, starting getting image info for images 43582-45562
Job 5, starting getting image info for images 9905-11885
Job 4, starting getting image info for images 7924-9904
Job 11, starting getting image info for images 21791-23771
Job 17, starting getting image info for images 33677-35657
Job 6, starting getting image info for images 11886-13866
Job 15, starting getting image info for images 29715-31695
Job 23, starting getting image info for images 45563-47543
Job 8, starting getting image info for images 15848-17828
Job 19, starting getting image info for images 37639-39619
Job 3, starting getting image info for images 5943-7923
Job 7, starting getting image info for images 13867-15847
Job 28, starting getting 



Job 18, finished 100 of 1981, total time = 1.08 min
103084.jpg is rgba
62261.jpg is rgba
job 10 - error in /data/training_data/train/62261.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (1044, 688, 4)


job 20 - error in /data/training_data/train/103084.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (6256, 4548, 4)


Job 2, finished 100 of 1981, total time = 1.22 min
101332.jpg is rgba
job 3 - error in /data/training_data/train/101332.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (686, 566, 4)


Job 26, finished 100 of 1981, total time = 1.32 min
Job 13, finished 100 of 1981, total time = 1.43 min
Job 17, finished 100 of 1981, total time = 1.45 min
Job 27, finished 100 of 1981, total time = 1.47 min
Job 12, finished 100 of 1981, total time = 1.48 min
Job 9, finished 100 of 1981, total time = 1.55 min
Job 0, finished 100 of 1981, total time = 1.60 min
Job 29, finished 100 of 1984, total time = 1.61 min
196.jpg is rgba
job 3 - error in /data/training_data/train/196.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (425, 387, 4)


Job 5, finished 100 of 1981, total time = 1.71 min
Job 28, finished 100 of 1981, total time = 1.72 min
Job 16, finished 100 of 1981, total time = 1.72 min
Job 1, finished 100 of 1981, total time = 1.73 min
Job 22, finished 100 of 1981, total time = 1.76 min
Job 7, finished 100 of 1981, total time = 1.75 min
Job 23, finished 100 of 1981, total time = 1.77 min
Job 10, finished 100 of 1981, total time = 1.79 min
Job 24, finished 100 of 1981, total time = 1.81 min
Job 21, finished 100 of 1981, total time = 1.83 min
Job 15, finished 100 of 1981, total time = 1.83 min
Job 6, finished 100 of 1981, total time = 1.87 min
Job 19, finished 100 of 1981, total time = 1.90 min
Job 11, finished 100 of 1981, total time = 1.94 min
Job 3, finished 100 of 1981, total time = 1.96 min
64966.jpg is rgba
job 2 - error in /data/training_data/train/64966.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (1498, 1231, 4)


Job 8, finished 100 of 1981, total time = 2.06 min
Job 20, finished 100 of 1981, total time = 2.12 min
Job 14, finished 100 of 1981, total time = 2.23 min
Job 28, finished 200 of 1981, total time = 2.88 min
Job 18, finished 200 of 1981, total time = 2.97 min
Job 13, finished 200 of 1981, total time = 2.97 min
Job 2, finished 200 of 1981, total time = 3.01 min
Job 26, finished 200 of 1981, total time = 3.03 min
Job 7, finished 200 of 1981, total time = 3.18 min
Job 12, finished 200 of 1981, total time = 3.21 min
Job 25, finished 100 of 1981, total time = 3.23 min
Job 6, finished 200 of 1981, total time = 3.35 min
Job 5, finished 200 of 1981, total time = 3.41 min
Job 9, finished 200 of 1981, total time = 3.45 min
Job 27, finished 200 of 1981, total time = 3.50 min
Job 0, finished 200 of 1981, total time = 3.49 min
Job 23, finished 200 of 1981, total time = 3.52 min
Job 17, finished 200 of 1981, total time = 3.53 min
Job 21, finished 200 of 1981, total time = 3.53 min
Job 22, finished 20

Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (851, 1206, 4)


Job 11, finished 200 of 1981, total time = 3.92 min
Job 15, finished 200 of 1981, total time = 3.96 min
87657.jpg is rgba
job 7 - error in /data/training_data/train/87657.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (1369, 1000, 4)


Job 14, finished 200 of 1981, total time = 4.05 min
Job 20, finished 200 of 1981, total time = 4.07 min
Job 1, finished 200 of 1981, total time = 4.12 min
Job 8, finished 200 of 1981, total time = 4.15 min
Job 16, finished 200 of 1981, total time = 4.17 min
Job 28, finished 300 of 1981, total time = 4.20 min
Job 29, finished 200 of 1984, total time = 4.24 min
Job 13, finished 300 of 1981, total time = 4.55 min
Job 26, finished 300 of 1981, total time = 4.57 min
Job 2, finished 300 of 1981, total time = 4.57 min
Job 9, finished 300 of 1981, total time = 4.74 min
Job 18, finished 300 of 1981, total time = 4.80 min
Job 7, finished 300 of 1981, total time = 4.81 min
Job 25, finished 200 of 1981, total time = 4.85 min
Job 27, finished 300 of 1981, total time = 4.95 min
Job 19, finished 200 of 1981, total time = 5.01 min
Job 17, finished 300 of 1981, total time = 5.02 min
Job 21, finished 300 of 1981, total time = 5.07 min
Job 5, finished 300 of 1981, total time = 5.12 min
Job 22, finished 3

Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 75, in getFeaturesWorker
    info.loc[i, 'pixelsx'] = im.shape[1]
IndexError: tuple index out of range


Job 7, finished 500 of 1981, total time = 8.29 min
Job 15, finished 400 of 1981, total time = 8.34 min
Job 3, finished 300 of 1981, total time = 8.45 min
Job 10, finished 400 of 1981, total time = 8.47 min
Job 28, finished 500 of 1981, total time = 8.52 min
Job 22, finished 400 of 1981, total time = 8.56 min
Job 13, finished 500 of 1981, total time = 8.56 min
Job 18, finished 500 of 1981, total time = 8.73 min
Job 21, finished 500 of 1981, total time = 8.75 min
Job 11, finished 400 of 1981, total time = 8.77 min
36930.jpg is rgba
job 2 - error in /data/training_data/train/36930.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (689, 499, 4)


Job 1, finished 500 of 1981, total time = 8.79 min




Job 6, finished 500 of 1981, total time = 8.86 min
Job 23, finished 400 of 1981, total time = 8.89 min
Job 2, finished 500 of 1981, total time = 8.90 min
Job 20, finished 400 of 1981, total time = 8.92 min
Job 14, finished 400 of 1981, total time = 8.92 min
Job 0, finished 500 of 1981, total time = 8.98 min
Job 17, finished 400 of 1981, total time = 9.04 min
Job 12, finished 500 of 1981, total time = 9.10 min
Job 19, finished 400 of 1981, total time = 9.20 min
Job 27, finished 500 of 1981, total time = 9.22 min
Job 29, finished 400 of 1984, total time = 9.27 min
Job 26, finished 600 of 1981, total time = 9.30 min
Job 24, finished 500 of 1981, total time = 9.38 min
Job 8, finished 400 of 1981, total time = 9.49 min
Job 7, finished 600 of 1981, total time = 9.57 min
46041.jpg is rgba
job 3 - error in /data/training_data/train/46041.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (515, 388, 4)


Job 9, finished 500 of 1981, total time = 9.87 min
Job 10, finished 500 of 1981, total time = 9.95 min




Job 22, finished 500 of 1981, total time = 10.03 min
Job 15, finished 500 of 1981, total time = 10.01 min
Job 13, finished 600 of 1981, total time = 10.10 min
Job 5, finished 500 of 1981, total time = 10.14 min
36157.jpg is rgba
job 2 - error in /data/training_data/train/36157.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (2665, 1995, 4)


Job 28, finished 600 of 1981, total time = 10.23 min
Job 23, finished 500 of 1981, total time = 10.24 min
Job 11, finished 500 of 1981, total time = 10.26 min
Job 17, finished 500 of 1981, total time = 10.33 min
Job 1, finished 600 of 1981, total time = 10.42 min
Job 18, finished 600 of 1981, total time = 10.44 min
Job 2, finished 600 of 1981, total time = 10.57 min
Job 27, finished 600 of 1981, total time = 10.64 min
Job 6, finished 600 of 1981, total time = 10.80 min
Job 21, finished 600 of 1981, total time = 10.85 min
Job 14, finished 500 of 1981, total time = 10.86 min
Job 29, finished 500 of 1984, total time = 10.88 min
Job 25, finished 400 of 1981, total time = 11.07 min
Job 20, finished 500 of 1981, total time = 11.16 min
Job 24, finished 600 of 1981, total time = 11.16 min
Job 0, finished 600 of 1981, total time = 11.26 min
Job 9, finished 600 of 1981, total time = 11.30 min
Job 19, finished 500 of 1981, total time = 11.34 min
Job 26, finished 700 of 1981, total time = 11.35 mi

Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (2683, 1779, 4)


Job 7, finished 700 of 1981, total time = 11.60 min
Job 28, finished 700 of 1981, total time = 11.65 min
Job 8, finished 500 of 1981, total time = 11.71 min
job 26 - error in /data/training_data/train/29675.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 75, in getFeaturesWorker
    info.loc[i, 'pixelsx'] = im.shape[1]
IndexError: tuple index out of range


Job 12, finished 600 of 1981, total time = 11.88 min
Job 15, finished 600 of 1981, total time = 11.94 min
Job 3, finished 400 of 1981, total time = 11.96 min
Job 1, finished 700 of 1981, total time = 12.04 min
Job 4, finished 100 of 1981, total time = 12.05 min
Job 17, finished 600 of 1981, total time = 12.24 min
Job 14, finished 600 of 1981, total time = 12.31 min
Job 2, finished 700 of 1981, total time = 12.36 min
Job 18, finished 700 of 1981, total time = 12.41 min
Job 20, finished 600 of 1981, total time = 12.43 min
Job 22, finished 600 of 1981, total time = 12.48 min
Job 13, finished 700 of 1981, total time = 12.48 min
Job 5, finished 600 of 1981, total time = 12.54 min
Job 21, finished 700 of 1981, total time = 12.59 min
Job 23, finished 600 of 1981, total time = 12.61 min
job 13 - error in /data/training_data/train/32721.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 75, in getFeaturesWorker
    info.loc[i, 'pixelsx'] = im.shape[1]
IndexError: tuple index out of range


Job 6, finished 700 of 1981, total time = 12.65 min
Job 11, finished 600 of 1981, total time = 12.68 min
Job 24, finished 700 of 1981, total time = 12.68 min
Job 29, finished 600 of 1984, total time = 12.79 min
Job 10, finished 600 of 1981, total time = 12.92 min
Job 0, finished 700 of 1981, total time = 12.92 min
Job 19, finished 600 of 1981, total time = 13.01 min
Job 25, finished 500 of 1981, total time = 13.02 min
Job 9, finished 700 of 1981, total time = 13.40 min
Job 8, finished 600 of 1981, total time = 13.45 min
Job 26, finished 800 of 1981, total time = 13.47 min
Job 17, finished 700 of 1981, total time = 13.53 min
Job 4, finished 200 of 1981, total time = 13.52 min
Job 15, finished 700 of 1981, total time = 13.51 min
Job 22, finished 700 of 1981, total time = 13.70 min
Job 7, finished 800 of 1981, total time = 13.71 min
Job 28, finished 800 of 1981, total time = 13.72 min
Job 2, finished 800 of 1981, total time = 13.75 min
Job 27, finished 700 of 1981, total time = 13.79 min


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (3525, 4268, 4)


Job 5, finished 700 of 1981, total time = 14.67 min
Job 24, finished 800 of 1981, total time = 14.77 min
Job 25, finished 600 of 1981, total time = 14.78 min
Job 21, finished 800 of 1981, total time = 14.78 min
Job 11, finished 700 of 1981, total time = 14.83 min
Job 7, finished 900 of 1981, total time = 14.85 min
Job 9, finished 800 of 1981, total time = 14.87 min




Job 3, finished 500 of 1981, total time = 14.94 min
Job 6, finished 800 of 1981, total time = 15.01 min
Job 27, finished 800 of 1981, total time = 15.06 min
Job 22, finished 800 of 1981, total time = 15.07 min
Job 28, finished 900 of 1981, total time = 15.17 min
Job 0, finished 800 of 1981, total time = 15.14 min
Job 15, finished 800 of 1981, total time = 15.17 min
Job 17, finished 800 of 1981, total time = 15.23 min
34022.jpg is rgba
job 18 - error in /data/training_data/train/34022.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (1000, 818, 4)


Job 19, finished 700 of 1981, total time = 15.37 min
Job 12, finished 800 of 1981, total time = 15.41 min
Job 2, finished 900 of 1981, total time = 15.48 min
Job 8, finished 700 of 1981, total time = 15.54 min
Job 4, finished 300 of 1981, total time = 15.60 min
Job 26, finished 900 of 1981, total time = 15.75 min




Job 13, finished 900 of 1981, total time = 15.94 min
Job 14, finished 800 of 1981, total time = 15.95 min
Job 1, finished 900 of 1981, total time = 15.97 min
Job 10, finished 800 of 1981, total time = 16.19 min
Job 24, finished 900 of 1981, total time = 16.30 min
Job 27, finished 900 of 1981, total time = 16.33 min
Job 18, finished 900 of 1981, total time = 16.39 min
Job 29, finished 800 of 1984, total time = 16.43 min
Job 20, finished 800 of 1981, total time = 16.58 min
Job 6, finished 900 of 1981, total time = 16.75 min
Job 28, finished 1000 of 1981, total time = 16.82 min
Job 3, finished 600 of 1981, total time = 16.82 min
Job 7, finished 1000 of 1981, total time = 16.90 min
77740.jpg is rgba
job 22 - error in /data/training_data/train/77740.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (624, 470, 4)


Job 22, finished 900 of 1981, total time = 16.95 min
Job 17, finished 900 of 1981, total time = 16.96 min
Job 23, finished 800 of 1981, total time = 17.05 min
Job 2, finished 1000 of 1981, total time = 17.24 min
95499.jpg is rgba
job 13 - error in /data/training_data/train/95499.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (893, 720, 4)


Job 0, finished 900 of 1981, total time = 17.35 min
Job 21, finished 900 of 1981, total time = 17.69 min
Job 9, finished 900 of 1981, total time = 17.79 min
Job 1, finished 1000 of 1981, total time = 17.80 min
Job 27, finished 1000 of 1981, total time = 18.00 min
Job 13, finished 1000 of 1981, total time = 18.03 min
Job 24, finished 1000 of 1981, total time = 18.07 min
Job 5, finished 800 of 1981, total time = 18.08 min
Job 15, finished 900 of 1981, total time = 18.10 min
Job 19, finished 800 of 1981, total time = 18.12 min
Job 18, finished 1000 of 1981, total time = 18.18 min
Job 12, finished 900 of 1981, total time = 18.25 min
Job 26, finished 1000 of 1981, total time = 18.35 min
Job 4, finished 400 of 1981, total time = 18.36 min
Job 28, finished 1100 of 1981, total time = 18.38 min
Job 14, finished 900 of 1981, total time = 18.39 min
Job 25, finished 700 of 1981, total time = 18.52 min
Job 3, finished 700 of 1981, total time = 18.54 min
Job 6, finished 1000 of 1981, total time = 18



Job 5, finished 900 of 1981, total time = 19.49 min
Job 29, finished 900 of 1984, total time = 19.58 min
Job 15, finished 1000 of 1981, total time = 19.73 min
Job 27, finished 1100 of 1981, total time = 19.84 min
Job 13, finished 1100 of 1981, total time = 19.85 min
Job 6, finished 1100 of 1981, total time = 19.92 min
Job 12, finished 1000 of 1981, total time = 19.92 min
Job 16, finished 400 of 1981, total time = 20.02 min
Job 21, finished 1100 of 1981, total time = 20.12 min
Job 0, finished 1100 of 1981, total time = 20.12 min
Job 18, finished 1100 of 1981, total time = 20.20 min
Job 19, finished 900 of 1981, total time = 20.26 min
Job 28, finished 1200 of 1981, total time = 20.27 min
Job 10, finished 1000 of 1981, total time = 20.30 min
Job 2, finished 1200 of 1981, total time = 20.37 min
Job 24, finished 1200 of 1981, total time = 20.48 min
Job 26, finished 1100 of 1981, total time = 20.51 min
Job 3, finished 800 of 1981, total time = 20.52 min
Job 8, finished 800 of 1981, total tim



Job 9, finished 1100 of 1981, total time = 21.24 min
Job 20, finished 1000 of 1981, total time = 21.28 min
Job 29, finished 1000 of 1984, total time = 21.44 min
Job 13, finished 1200 of 1981, total time = 21.52 min
Job 27, finished 1200 of 1981, total time = 21.55 min
Job 22, finished 1100 of 1981, total time = 21.56 min
Job 28, finished 1300 of 1981, total time = 21.60 min
Job 6, finished 1200 of 1981, total time = 21.67 min
Job 19, finished 1000 of 1981, total time = 21.77 min
Job 12, finished 1100 of 1981, total time = 21.79 min
Job 15, finished 1100 of 1981, total time = 21.78 min
Job 18, finished 1200 of 1981, total time = 21.93 min
83132.jpg is rgba
job 18 - error in /data/training_data/train/83132.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (949, 1409, 4)


Job 0, finished 1200 of 1981, total time = 21.95 min
Job 3, finished 900 of 1981, total time = 22.00 min
Job 11, finished 800 of 1981, total time = 22.04 min
Job 21, finished 1200 of 1981, total time = 22.04 min
Job 8, finished 900 of 1981, total time = 22.08 min
Job 16, finished 500 of 1981, total time = 22.09 min
Job 1, finished 1300 of 1981, total time = 22.22 min
Job 26, finished 1200 of 1981, total time = 22.29 min
Job 24, finished 1300 of 1981, total time = 22.34 min
Job 10, finished 1100 of 1981, total time = 22.35 min
Job 25, finished 900 of 1981, total time = 22.38 min
Job 9, finished 1200 of 1981, total time = 22.42 min
Job 5, finished 1100 of 1981, total time = 22.61 min
Job 7, finished 1300 of 1981, total time = 22.63 min
Job 4, finished 600 of 1981, total time = 22.74 min
42495.jpg is rgba
job 1 - error in /data/training_data/train/42495.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (2339, 2339, 4)


Job 23, finished 1100 of 1981, total time = 22.88 min
Job 13, finished 1300 of 1981, total time = 22.89 min
69110.jpg is rgba
job 16 - error in /data/training_data/train/69110.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (1808, 1173, 4)


Job 17, finished 1200 of 1981, total time = 22.91 min
Job 6, finished 1300 of 1981, total time = 22.95 min
Job 18, finished 1300 of 1981, total time = 23.08 min
Job 12, finished 1200 of 1981, total time = 23.11 min
Job 29, finished 1100 of 1984, total time = 23.15 min
Job 20, finished 1100 of 1981, total time = 23.20 min
Job 19, finished 1100 of 1981, total time = 23.24 min
Job 27, finished 1300 of 1981, total time = 23.33 min
Job 28, finished 1400 of 1981, total time = 23.41 min
Job 22, finished 1200 of 1981, total time = 23.53 min




Job 15, finished 1200 of 1981, total time = 23.62 min
Job 16, finished 600 of 1981, total time = 23.68 min
Job 10, finished 1200 of 1981, total time = 23.70 min
Job 26, finished 1300 of 1981, total time = 23.71 min
90205.jpg is rgba
job 13 - error in /data/training_data/train/90205.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (864, 1171, 4)


Job 3, finished 1000 of 1981, total time = 23.84 min
Job 25, finished 1000 of 1981, total time = 23.92 min
Job 1, finished 1400 of 1981, total time = 23.95 min
Job 0, finished 1300 of 1981, total time = 23.99 min
Job 11, finished 900 of 1981, total time = 24.02 min
Job 24, finished 1400 of 1981, total time = 24.12 min
Job 8, finished 1000 of 1981, total time = 24.33 min
Job 9, finished 1300 of 1981, total time = 24.36 min
Job 4, finished 700 of 1981, total time = 24.61 min
Job 21, finished 1300 of 1981, total time = 24.65 min
Job 5, finished 1200 of 1981, total time = 24.67 min
job 27 - error in /data/training_data/train/31842.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 75, in getFeaturesWorker
    info.loc[i, 'pixelsx'] = im.shape[1]
IndexError: tuple index out of range


Job 18, finished 1400 of 1981, total time = 24.85 min
Job 13, finished 1400 of 1981, total time = 24.88 min
Job 19, finished 1200 of 1981, total time = 24.96 min
Job 17, finished 1300 of 1981, total time = 25.06 min
Job 12, finished 1300 of 1981, total time = 25.08 min
Job 6, finished 1400 of 1981, total time = 25.09 min
Job 23, finished 1200 of 1981, total time = 25.14 min
Job 28, finished 1500 of 1981, total time = 25.20 min
Job 10, finished 1300 of 1981, total time = 25.27 min
Job 29, finished 1200 of 1984, total time = 25.30 min
Job 1, finished 1500 of 1981, total time = 25.46 min
Job 15, finished 1300 of 1981, total time = 25.76 min
Job 27, finished 1400 of 1981, total time = 26.11 min
Job 16, finished 700 of 1981, total time = 26.13 min
Job 26, finished 1400 of 1981, total time = 26.20 min
Job 11, finished 1000 of 1981, total time = 26.26 min
Job 22, finished 1300 of 1981, total time = 26.30 min
Job 5, finished 1300 of 1981, total time = 26.34 min
Job 24, finished 1500 of 1981, t

Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 153, in _prepare_colorarray
    return dtype.img_as_float(arr)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/util/dtype.py", line 301, in img_as_float
    return convert(image, np.float64, force_copy)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/util/dtype.py", line 251, in convert
    return image.astype(dtype)
MemoryError


job 20 - error in /data/training_data/train/73319.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 153, in _prepare_colorarray
    return dtype.img_as_float(arr)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/util/dtype.py", line 301, in img_as_float
    return convert(image, np.float64, force_copy)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/util/dtype.py", line 251, in convert
    return image.astype(dtype)
MemoryError


job 27 - error in /data/training_data/train/56214.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 153, in _prepare_colorarray
    return dtype.img_as_float(arr)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/util/dtype.py", line 301, in img_as_float
    return convert(image, np.float64, force_copy)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/util/dtype.py", line 251, in convert
    return image.astype(dtype)
MemoryError


job 21 - error in /data/training_data/train/38282.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 209, in rgb2hsv
    out[idx, 0] = (arr[idx, 1] - arr[idx, 2]) / delta[idx]
MemoryError


job 7 - error in /data/training_data/train/72255.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 194, in rgb2hsv
    out = np.empty_like(arr)
MemoryError


Job 6, finished 1500 of 1981, total time = 27.15 min
Job 19, finished 1300 of 1981, total time = 27.17 min
Job 15, finished 1400 of 1981, total time = 27.25 min
Job 13, finished 1500 of 1981, total time = 27.47 min
Job 7, finished 1400 of 1981, total time = 27.46 min
Job 17, finished 1400 of 1981, total time = 27.49 min
Job 10, finished 1400 of 1981, total time = 27.53 min
Job 28, finished 1600 of 1981, total time = 27.57 min
Job 29, finished 1300 of 1984, total time = 27.59 min
Job 18, finished 1500 of 1981, total time = 27.75 min
Job 23, finished 1300 of 1981, total time = 27.78 min
Job 1, finished 1600 of 1981, total time = 27.75 min
Job 4, finished 900 of 1981, total time = 27.97 min
Job 9, finished 1500 of 1981, total time = 28.00 min
job 1 - error in /data/training_data/train/94489.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 75, in getFeaturesWorker
    info.loc[i, 'pixelsx'] = im.shape[1]
IndexError: tuple index out of range


Job 24, finished 1600 of 1981, total time = 28.27 min
Job 11, finished 1100 of 1981, total time = 28.29 min
69691.jpg is rgba
job 11 - error in /data/training_data/train/69691.jpg


Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (475, 354, 4)


Job 27, finished 1500 of 1981, total time = 28.36 min
Job 0, finished 1500 of 1981, total time = 28.34 min
Job 20, finished 1300 of 1981, total time = 28.43 min
Job 16, finished 800 of 1981, total time = 28.44 min
Job 22, finished 1400 of 1981, total time = 28.52 min
Job 12, finished 1500 of 1981, total time = 28.68 min
Job 26, finished 1500 of 1981, total time = 28.71 min
Job 21, finished 1500 of 1981, total time = 28.71 min
Job 5, finished 1400 of 1981, total time = 28.78 min
Job 6, finished 1600 of 1981, total time = 28.77 min
Job 29, finished 1400 of 1984, total time = 28.80 min
Job 25, finished 1200 of 1981, total time = 29.00 min
Job 19, finished 1400 of 1981, total time = 29.11 min
Job 8, finished 1200 of 1981, total time = 29.13 min
Job 13, finished 1600 of 1981, total time = 29.16 min
Job 3, finished 1200 of 1981, total time = 29.20 min
Job 1, finished 1700 of 1981, total time = 29.25 min
Job 7, finished 1500 of 1981, total time = 29.35 min
Job 15, finished 1500 of 1981, total

Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 193, in rgb2hsv
    arr = _prepare_colorarray(rgb)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 151, in _prepare_colorarray
    raise ValueError(msg)
ValueError: the input array must be have a shape == (.., ..,[ ..,] 3)), got (638, 420, 4)


Job 23, finished 1400 of 1981, total time = 29.46 min
Job 18, finished 1600 of 1981, total time = 29.65 min
Job 28, finished 1700 of 1981, total time = 29.89 min
Job 9, finished 1600 of 1981, total time = 29.89 min
Job 4, finished 1000 of 1981, total time = 29.95 min
Job 24, finished 1700 of 1981, total time = 29.98 min
Job 27, finished 1600 of 1981, total time = 30.03 min
Job 2, finished 1300 of 1981, total time = 30.03 min
job 9 - error in /data/training_data/train/35489.jpg
job 15 - error in /data/training_data/train/17164.jpg
job 26 - error in /data/training_data/train/62002.jpg
job 12 - error in /data/training_data/train/36281.jpg
job 16 - error in /data/training_data/train/36297.jpg
job 25 - error in /data/training_data/train/16360.jpg


Traceback (most recent call last):
Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 110, in getFeaturesWorker
    info.loc[i, 'b_std'] = im[:,:,2].std()
Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 129, in getFeaturesWorker
    info.loc[i, 's_mean'] = hsvImage[:,:,1].mean()
Traceback (most recent call last):
  File "<ipython-input-17-65a1d6868ec7>", line 132, in getFeaturesWorker
    info.loc[i, 'v_mean'] = hsvImage[:,:,2].mean()
  File "/data/anaconda/lib/python3.5/site-packages/numpy/core/_methods.py", line 124, in _std
    keepdims=keepdims)
  File "<ipython-input-17-65a1d6868ec7>", line 114, in getFeaturesWorker
    hsvImage = skimage.color.rgb2hsv(im)
  File "/data/anaconda/lib/python3.5/site-packages/numpy/core/_methods.py", line 65, in _mean
    ret = umr_sum(arr, axis, dtype, out, keepdims)
  File "/data/anaconda/lib/python3.5/site-packages/numpy/core/_methods.py", line 105, in _var
    x = um.multiply(x, x, 

job 22 - error in /data/training_data/train/35301.jpg
job 28 - error in /data/training_data/train/66523.jpg
job 11 - error in /data/training_data/train/66893.jpg


  File "/data/anaconda/lib/python3.5/site-packages/numpy/core/_methods.py", line 65, in _mean
    ret = umr_sum(arr, axis, dtype, out, keepdims)


job 8 - error in /data/training_data/train/97957.jpg


KeyboardInterrupt: 

job 13 - error in /data/training_data/train/52880.jpg


Traceback (most recent call last):


job 19 - error in /data/training_data/train/13308.jpg
job 7 - error in /data/training_data/train/72159.jpg


  File "/data/anaconda/lib/python3.5/site-packages/skimage/color/colorconv.py", line 153, in _prepare_colorarray
    return dtype.img_as_float(arr)


job 4 - error in /data/training_data/train/13889.jpg
job 10 - error in /data/training_data/train/93213.jpg
job 23 - error in /data/training_data/train/99860.jpg


Traceback (most recent call last):


job 3 - error in /data/training_data/train/10839.jpg


Traceback (most recent call last):


job 1 - error in /data/training_data/train/75013.jpg


KeyboardInterrupt
Traceback (most recent call last):


In [15]:
train_features

Unnamed: 0.1,Unnamed: 0,filename,artist,title,style,genre,date,pixelsx,pixelsy,size_bytes,...,b_std,h_mean,h_var,s_mean,s_std,s_med,v_mean,v_std,v_med,is_grayscale
0,35050,62366.jpg,ccb8b07e7e3d837b2cd08d3edaf009cd,Changing Horses,Romanticism,genre painting,,1011.0,742.0,136962.0,...,53.921006,-1.488391,0.799015,0.264909,0.247748,0.180283,0.870779,0.921569,0.139063,0.0
1,18697,80367.jpg,e9badad9c193144603d4ac52be9546f8,Autumn Morning,Romanticism,landscape,,1280.0,918.0,487250.0,...,62.232832,-0.026695,0.412312,0.316256,0.253233,0.234043,0.661061,0.647059,0.189076,0.0
2,56709,18159.jpg,bfb541e54ad5c7320e8f80e2a2163e93,"Sangi Takamura, abalone fisherman",Ukiyo-e,marina,,758.0,533.0,93135.0,...,39.626444,0.325117,0.392233,0.329966,0.326531,0.151355,0.498762,0.560784,0.242108,0.0
3,21390,55150.jpg,50591a7061fb340d875723f38e00cc3b,Woman Sellillng Flowers,Impressionism,genre painting,1889,900.0,1161.0,1447197.0,...,51.706499,0.708365,0.191871,0.416934,0.410714,0.213235,0.510177,0.505882,0.233190,0.0
4,38483,103248.jpg,9517cab1f0dee5013c558c52b07b04ff,Auf Leben und Tod (Edelhirsche in der Brunft),Naturalism,animal painting,1897.0,4500.0,3025.0,1380192.0,...,71.120484,-0.000090,0.000403,0.000019,0.000000,0.001196,0.642949,0.639216,0.278910,0.0
5,62757,25027.jpg,92152649ef5cd5113e554523edd13ffd,Waiting,Expressionism,genre painting,1969.0,679.0,456.0,240192.0,...,85.435796,-0.648869,0.462996,0.129784,0.111111,0.113268,0.395361,0.239216,0.328899,0.0
6,28155,77659.jpg,2d72f2000c42051e7c350a39bdce9bc1,Easter and the Totem,Abstract Expressionism,figurative,1953.0,767.0,1113.0,153739.0,...,83.552636,1.282578,0.266919,0.267482,0.192771,0.229625,0.679572,0.854902,0.338417,0.0
7,72943,3035.jpg,bcc742e1dab75dec6fbdbe6fe50ba53b,Madonna and Child with St. John the Baptist,High Renaissance,religious painting,,420.0,500.0,21681.0,...,15.189537,0.680941,0.102017,0.635664,0.690909,0.253294,0.251679,0.145098,0.222829,0.0
8,63737,30536.jpg,2758fcef28414c3065d93ff791211566,Untitled (European Seaside View),Post-Impressionism,cityscape,,600.0,498.0,55212.0,...,22.843491,0.709501,0.003828,0.429892,0.429245,0.116067,0.719358,0.705882,0.128654,0.0
9,40105,3529.jpg,059d590e174c545474f0fedcc126ce4c,The Garden of San Miniato near Florence,Romanticism,landscape,,1000.0,680.0,185117.0,...,62.011870,1.002236,0.485331,0.313781,0.266355,0.192202,0.640369,0.678431,0.268052,0.0


### save features

In [None]:
#test_features_sorted.to_csv(r'/data/test_data/test_features.csv')
#test_features_sorted.to_csv(r'/data/test_data/test_features.csv')
#y = pd.read_csv(r'/data/test_data/test_features.csv', index_col=0)
#test_features_sorted = test_features_sorted.drop('Unnamed: 0',1)

In [16]:
#train_features.to_csv(r'/data/training_data/train_features_0_20000.csv')
#test_features_sorted.to_csv(r'/data/test_data/test_features.csv')
#y = pd.read_csv(r'/data/test_data/test_features.csv', index_col=0)
#test_features_sorted = test_features_sorted.drop('Unnamed: 0',1)

### load features

In [13]:
#load features
train_features = pd.read_csv(r'/data/training_data/train_features.csv', index_col = 0)

### additional processing on feature

In [14]:
# saved features are straight from the feature functions with no handling of nulls, etc
# these have to be addressed prior to training/predicting

rgb_features = ['r_mean', 'r_med', 'r_std', 'g_mean', 'g_med',
                'g_std', 'b_mean', 'b_med', 'b_std',]

size_features =  [ 'pixelsx',
       'pixelsy', 'size_bytes' ]

bw_features = [ 'bw_mean', 'bw_med', 'bw_std' ]

# color pictures get -1 in grayscale columns
for feature in rgb_features:
    train_features.loc[ train_features[feature].isnull(), feature] = -1

# grayscale pictures get -1 in color columns
for feature in bw_features:
    train_features.loc[ train_features[feature].isnull(), feature] = -1

# take out unnecessary columns
feature_names = ['pixelsx',
       'pixelsy', 'size_bytes', 'r_mean', 'r_med', 'r_std', 'g_mean', 'g_med',
       'g_std', 'b_mean', 'b_med', 'b_std', 'bw_mean', 'bw_med', 'bw_std']

train_features = train_features[ ['filename'] + feature_names ]

### Join training features to training pairs

In [15]:
#join pair data to image features
feature_names = ['pixelsx',
       'pixelsy', 'size_bytes', 'r_mean', 'r_med', 'r_std', 'g_mean', 'g_med',
       'g_std', 'b_mean', 'b_med', 'b_std', 'bw_mean', 'bw_med', 'bw_std']

col_dict_1 = {}
col_dict_2 = {}

for feature in feature_names:
    col_dict_1[feature] = '%s_1' % feature
    col_dict_2[feature] = '%s_2' % feature

train_pairs = train_pairs.merge(train_features,
                                left_on='image1', right_on='filename')
train_pairs.rename( columns = col_dict_1,
                      inplace=True)
train_pairs = train_pairs.merge(train_features,
                                left_on='image2', right_on='filename')
train_pairs.rename( columns = col_dict_2,
                      inplace=True)

### remove nulls

In [16]:
# we remove the nulls after the join, could also be done before
train_pairs = train_pairs[~train_pairs['pixelsx_1'].isnull()]
train_pairs = train_pairs[~train_pairs['pixelsx_2'].isnull()]

print(train_pairs.isnull().sum())

artist1         0
image1          0
artist2         0
image2          0
sameArtist      0
filename_x      0
pixelsx_1       0
pixelsy_1       0
size_bytes_1    0
r_mean_1        0
r_med_1         0
r_std_1         0
g_mean_1        0
g_med_1         0
g_std_1         0
b_mean_1        0
b_med_1         0
b_std_1         0
bw_mean_1       0
bw_med_1        0
bw_std_1        0
filename_y      0
pixelsx_2       0
pixelsy_2       0
size_bytes_2    0
r_mean_2        0
r_med_2         0
r_std_2         0
g_mean_2        0
g_med_2         0
g_std_2         0
b_mean_2        0
b_med_2         0
b_std_2         0
bw_mean_2       0
bw_med_2        0
bw_std_2        0
dtype: int64


### results helper

In [73]:
def computePredictStats(y_prob, y_true, threshold = 0.5):
    """ compute accuracy, precision, recall, negative precision, specificity, and auc roc
        Args:
            y_prob: array of floats from 0.0 - 1.0
            y_true: array of booleans
            threshold: true/false threshold value, between 0.0-1.0
        Returns:
            dict of classification metrics
    """
    # y_pred = np.array([True, True, False, False])
    # y_true = np.array([True, True, True, True])
    y_pred = y_prob > threshold
    
    total = len(y_prob)
    true_pos = sum( (y_pred == True) & (y_true == True) )
    true_neg = sum( (y_pred == False) & (y_true == False) )
    false_pos = sum( (y_pred == True) & (y_true == False) )
    false_neg = sum( (y_pred == False ) & (y_true == True) )
    
    accuracy = (true_pos + true_neg) / total
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    npp = true_neg / (true_neg + false_neg) #negative prediction value
    specificity = true_neg / (true_neg + false_pos)
    roc = roc_auc_score(y_true, y_prob)
    
    return { 'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'npp': npp,
            'specificity': specificity,
            'roc': roc,
            'true_pos': true_pos,
            'true_neg': true_neg,
            'false_pos': false_pos,
            'false_neg': false_neg,
           }

### split data into X and Y

In [None]:
# get list of X columns, they are ones with '_1' or '_2' in the name
allCols = train_pairs.columns
isX = list(map(lambda x: ('_1' in x) or ('_2' in x), allCols))
X_columns = allCols[isX]

# split X and Y data
train_X = train_pairs[X_columns]
train_Y = train_pairs['sameArtist']

### k-fold CV

In [85]:
def kFoldCV(train_X, train_Y, k, numFoldsToTest):
    """ Perform k-fold cross validation. Will modify train_pairs (shuffle rows)
        Args:
            train_pairs: Dataframe containing training image pairs with features. Should have no nulls.
            k: k, at least 2
            numFoldsToTest: how many folds to actually test, at most k
        Returns:
            dataframe with CV results
    """
    
    numFoldsToTest = min(k, numFoldsToTest)
    k = max(2, k)
    
    # shuffle rows
    train_pairs = train_pairs.iloc[np.random.permutation(len(train_pairs))]

    # get list of X columns, they are ones with '_1' or '_2' in the name
    #allCols = train_pairs.columns
    #isX = list(map(lambda x: ('_1' in x) or ('_2' in x), allCols))
    #X_columns = allCols[isX]
    
    # split X and Y data
    #CV_X = train_pairs[X_columns]
    #CV_Y = train_pairs['sameArtist']
    
    # define which indices belong to each fold
    foldLocsList = [] #list of Index objects, one for each fold
    
    foldSize = int(len(train_pairs)/5)
        
    for foldNum in range(k):
        if foldNum == k-1:
            foldLocsList.append( train_pairs.index[foldNum*foldSize : len(train_pairs)] )
        else:
            foldLocsList.append( train_pairs.index[foldNum*foldSize : (foldNum+1)*(foldSize) ] )
    
    # set up dataframe for collecting results
    columnsList = list(itertools.product(('train', 'test'), ('roc', 'precision', 'recall', 'npp', 'specificity')))
    results = pd.DataFrame(index = range(numFoldsToTest), columns = pd.MultiIndex.from_tuples(columnsList))
    
    # test each fold
    for testNum in range(numFoldsToTest):

        # indices of training data
        trainLocs = pd.Index([])
        for foldNum in range(k):
            if foldNum != testNum:
                trainLocs = trainLocs.append( foldLocsList[foldNum] )
        
        # indices of test data
        testLocs = foldLocsList[testNum]

        # set up Xs
        CV_train_X = CV_X.loc[trainLocs]
        CV_test_X = CV_X.loc[testLocs]

        # set up Ys
        CV_train_Y = CV_Y.loc[trainLocs]
        CV_test_Y = CV_Y.loc[testLocs]
        
        # fit model
        clf = RandomForestClassifier(n_estimators=50, min_samples_split=16, max_depth=12, n_jobs=6)

        start = time.time()

        print('starting fit')

        clf.fit(CV_train_X, CV_train_Y)

        end = time.time()
                               
        print('total training time: %s' % (end - start) )

        # get in-sample and out-of-sample results
                               
        pred_train = clf.predict_proba(CV_train_X)[:,1]
        train_results = computePredictStats( pred_train, CV_train_Y)

        pred_test = clf.predict_proba(CV_test_X)[:,1]
        test_results = computePredictStats( pred_test, CV_test_Y)
       
        for stat in ('roc', 'precision', 'recall', 'npp', 'specificity'):
            results.loc[testNum, ('train', stat)] = train_results[stat]
            results.loc[testNum, ('test', stat)] = test_results[stat]
        
    return results              

### run k-fold cv

In [86]:
k = 5
numFoldsToTest = 5
results = kFoldCV(train_pairs, k, numFoldsToTest)

print(results)

starting fit
total training time: 0.13500571250915527
starting fit
total training time: 0.13413262367248535
starting fit
total training time: 0.1343233585357666
starting fit
total training time: 0.13423728942871094
starting fit
total training time: 0.13405108451843262
      train                                                test            \
        roc precision    recall       npp specificity       roc precision   
0  0.967172    0.9375  0.833333     0.875    0.954545  0.181818      0.25   
1  0.995614  0.945946  0.921053  0.930233    0.952381  0.362637       0.2   
2   0.98548         1  0.944444  0.956522           1  0.444444  0.428571   
3  0.981203  0.972222  0.921053  0.931818     0.97619  0.450549  0.333333   
4  0.985026  0.962963    0.8125  0.886792    0.979167  0.461538       0.5   

                                   
     recall       npp specificity  
0  0.111111       0.5    0.727273  
1  0.285714       0.5    0.384615  
2  0.333333  0.538462    0.636364  
3  0.428571

### full training

In [18]:
clf = RandomForestClassifier(n_estimators=100, min_samples_split=50, max_depth=8 )

start = time.clock()

print('starting fit')
#excluding the patient_id column from the fit and prediction
clf.fit(train_X, train_Y)

end = time.clock()

print('total training time: %s' % (end - start) )

columnsList = list(itertools.product(('train', 'test'), ('roc', 'precision', 'recall', 'npp', 'specificity')))
results = pd.DataFrame(index = range(numFoldsToTest), columns = pd.MultiIndex.from_tuples(columnsList))


starting fit
total training time: 917.0945500000003


### save model

In [26]:
##save model

start = time.clock()

with open('my_dumped_classifier.pkl', 'wb') as fid:
    pickle.dump(clf, fid) 

end = time.clock()
print('total saving time: %s' % (end - start) )

total saving time: 0.012914999999338761


### load model

In [28]:
start = time.clock()

# load it again
with open('my_dumped_classifier.pkl', 'rb') as fid:
    clf = pickle.load(fid)

end = time.clock()
print('total loading time: %s' % (end - start) )



total loading time: 0.007420999999339983


In [53]:
#load test set definitions
test_pairs = pd.read_csv(r'/data/test_data/submission_info.csv', index_col= 0)

#get features
#raw_test_image_info = get_image_info(test_pairs, r'/data/test_data/test')

#get raw training data features
#raw_test_image_info['bytes_per_pixel'] =raw_test_image_info['size_bytes']/(raw_test_image_info['pixelsx']*raw_test_image_info['pixelsy'])
#raw_test_image_info['aspect_ratio'] = raw_test_image_info['pixelsx']/raw_test_image_info['pixelsy']

#save raw_train_image_info
#raw_test_image_info.to_csv(r'/data/test_data/raw_test_image_info.csv')

#load raw_train_image_info
#raw_test_image_info = pd.read_csv(r'/data/test_data/raw_test_image_info.csv', index_col = 0)

## join test features to test pairs

In [51]:
def joinFeaturesToPairs(image_info, image_pairs):
    #join pair data to image features
    trimmed_image_info = image_info[['new_filename',
                                     'pixelsx',
                                     'pixelsy',
                                     'bytes_per_pixel',
                                     'aspect_ratio']]
    image_pairs = image_pairs.merge(trimmed_image_info,
                              left_on='image1', right_on='new_filename')
    image_pairs.rename( columns = {'pixelsx': 'pixelsx_1',
                        'pixelsy': 'pixelsy_1',
                        'bytes_per_pixel' : 'bytes_per_pixel_1',
                        'aspect_ratio':'aspect_ratio_1'})
    image_pairs = image_pairs.merge(trimmed_image_info,
                                    left_on='image2', right_on='new_filename')
    train_pairs.rename( columns = {'pixelsx': 'pixelsx_2',
                        'pixelsy': 'pixelsy_2',
                        'bytes_per_pixel' : 'bytes_per_pixel_2',
                        'aspect_ratio':'aspect_ratio_2'})
    return train_pairs

test_pairs_final = joinFeaturesToPairs(raw_test_image_info, test_pairs)

## test on test set

In [24]:
test_predictions = clf.pred(test_pairs_final)

['mymodel.pkl',
 'mymodel.pkl_01.npy',
 'mymodel.pkl_02.npy',
 'mymodel.pkl_03.npy',
 'mymodel.pkl_04.npy',
 'mymodel.pkl_05.npy',
 'mymodel.pkl_06.npy',
 'mymodel.pkl_07.npy',
 'mymodel.pkl_08.npy',
 'mymodel.pkl_09.npy',
 'mymodel.pkl_10.npy',
 'mymodel.pkl_11.npy',
 'mymodel.pkl_12.npy',
 'mymodel.pkl_13.npy',
 'mymodel.pkl_14.npy',
 'mymodel.pkl_15.npy',
 'mymodel.pkl_16.npy',
 'mymodel.pkl_17.npy',
 'mymodel.pkl_18.npy',
 'mymodel.pkl_19.npy',
 'mymodel.pkl_20.npy',
 'mymodel.pkl_21.npy',
 'mymodel.pkl_22.npy',
 'mymodel.pkl_23.npy',
 'mymodel.pkl_24.npy',
 'mymodel.pkl_25.npy',
 'mymodel.pkl_26.npy',
 'mymodel.pkl_27.npy',
 'mymodel.pkl_28.npy',
 'mymodel.pkl_29.npy',
 'mymodel.pkl_30.npy',
 'mymodel.pkl_31.npy',
 'mymodel.pkl_32.npy',
 'mymodel.pkl_33.npy',
 'mymodel.pkl_34.npy',
 'mymodel.pkl_35.npy',
 'mymodel.pkl_36.npy',
 'mymodel.pkl_37.npy',
 'mymodel.pkl_38.npy',
 'mymodel.pkl_39.npy',
 'mymodel.pkl_40.npy',
 'mymodel.pkl_41.npy',
 'mymodel.pkl_42.npy',
 'mymodel.pkl_43.n

 ## prepare submission
    

In [None]:
submission = submission_info[['index']]
submission['sameArtist'] = y_pred
submission.to_csv('submission.csv', index=False)