In [1]:
%matplotlib inline



In [2]:
%load_ext line_profiler

In [3]:
import gc
import itertools
import multiprocessing
import os
import pickle
import random
import time
import traceback

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.misc
import skimage.color
import skimage.data
import skimage.exposure
import skimage.io
from PIL import Image
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB

## make pairs

In [4]:
def make_pairs(train_info):
    """Creates training pairs from the supplied training image information.

    For every artist two groups of rows are generated:
      * "same artist" rows: every image of the artist paired with every image
        of the same artist;
      * "different artist" rows: the artist's images paired with a random
        sample of images by other artists.
    Both groups are capped at the same number of rows per artist.

    Args:
        train_info: DataFrame with (at least) 'artist' and 'filename' columns.
    Returns:
        DataFrame with columns ['artist1', 'image1', 'artist2', 'image2'],
        restricted to rows where image1 > image2. That final filter removes
        self-pairs and keeps only one ordering of each pair of filenames.
    """
    columns = ['artist1', 'image1', 'artist2', 'image2']
    artists = train_info.artist.unique()

    # Per-artist row blocks are collected here and stacked once at the end,
    # rather than being written into a pre-sized float buffer.
    pieces = []

    for j, artist in enumerate(artists, start=1):
        isOther = train_info.artist != artist

        # artistInfo is an Ax2 matrix of artist, filename
        artistInfo = train_info.loc[~isOther, ['artist', 'filename']].values
        numImages = len(artistInfo)

        # never ask for more rows than either group can supply
        numExamples = min(numImages ** 2, int(isOther.sum()))

        # index labels of a random sample of other artists' images
        use = np.random.permutation(train_info.index[isOther].values)[:numExamples]

        # diffArtistInfo is a Bx2 matrix of artist, filename
        # (.loc replaces the removed .ix; `use` holds index labels)
        diffArtistInfo = train_info.loc[use, ['artist', 'filename']].values

        # the artist's images cycled numImages times: a1, a2, ..., a1, a2, ...
        tiled = np.tile(artistInfo, (numImages, 1))

        # cross product of the artist's images with themselves; positional
        # slicing keeps exactly numExamples rows (a label-based .loc slice
        # would be end-inclusive and keep one extra)
        sameArtist = np.concatenate(
            [np.repeat(artistInfo, numImages, axis=0), tiled], axis=1)[:numExamples]

        # the artist's images lined up against the sampled other-artist images
        diffArtist = np.concatenate(
            [tiled[:len(diffArtistInfo)], diffArtistInfo], axis=1)[:numExamples]

        pieces.append(sameArtist)
        pieces.append(diffArtist)

        if j % 100 == 0:
            print('finished %s of %s artists'%(j, len(artists)))

    print('make pairs completed')

    if pieces:
        rows = np.concatenate(pieces, axis=0)
    else:
        rows = np.empty((0, len(columns)), dtype=object)

    t = pd.DataFrame(rows, columns=columns)
    # drop rows without a usable second filename
    t = t[~t.image2.isin([np.nan, 0])]
    return t[t.image1 > t.image2]

## Prep Image List

In [4]:
# def prepImageList(image_info, isTest):
#     """given the train_image_info or submission_info, returns a dataframe with a single column containing filenames of images"""
#     if isTest:
#         images = list(set(list(image_info.image1.unique()) + list(image_info.image2.unique())))
#         result = pd.DataFrame(np.array(images).reshape((-1, 1)), columns = ['filename'])
#     else:
#         result = pd.DataFrame(columns = ['filename'], data = image_info['filename'] )
    
#     return result

## Get Features Parent

In [4]:
def getFeaturesParent(isTest):
    """Creates features for training and test images. This function utilizes multiprocessing.

    The image list is split into one chunk per worker process; each worker
    runs getFeaturesWorker on its chunk and the per-chunk frames are
    concatenated in job order.

    Args:
        isTest: bool to fetch training or test data
    Returns:
        pandas DataFrame containing features
    """
    # Leave a few cores free, but always run at least one worker:
    # multiprocessing.Pool raises ValueError for a size below 1, which is what
    # cpu_count() - 5 produces on machines with five or fewer cores.
    num_cores = max(1, multiprocessing.cpu_count() - 5)

    # every worker gets (isTest, its job number, total number of jobs)
    argsList = [(isTest, jobNum, num_cores) for jobNum in range(num_cores)]

    print('Launching %s jobs' % (num_cores))
    startTime = time.time()

    # the context manager releases the worker processes even if a job raises
    with multiprocessing.Pool(num_cores) as pool:
        image_features_list = pool.starmap(getFeaturesWorker, argsList)

    image_features = pd.concat(image_features_list)

    endTime = time.time()

    print("collecting features complete, time taken = %.2f minutes" % ((endTime - startTime) / 60.0))
    return image_features

## Get Features Worker

In [33]:
def getFeaturesWorker(isTest, jobNum, totalJobs):
    """Child function for computing image features, only to be called by getFeaturesParent.

    Reads the image info CSV, keeps this job's contiguous slice of rows and
    fills in per-image size and colour statistics. An image that fails to load
    or process is reported with a traceback and keeps NaN in every column that
    had not been computed yet.

    Args:
        isTest: whether to compute features for test or training images
        jobNum: which job number this is (0-based)
        totalJobs: total number of jobs
    Returns:
        pandas dataframe containing the info rows for this job's fraction of
        the training or test images, with the feature columns added
    """
    if isTest:
        mydir = r'/data/test_data/test'
        info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col = 0)
    else:
        mydir = r'/data/training_data/train'
        info = pd.read_csv(r'/data/training_data/train_info.csv', index_col = 0)

    totalNumImages = len(info)

    # plain integer division (the np.int alias no longer exists in NumPy)
    chunkSize = totalNumImages // totalJobs

    startInd = jobNum * chunkSize
    if jobNum == totalJobs - 1:
        # the last job also picks up the remainder rows
        endInd = totalNumImages
    else:
        endInd = startInd + chunkSize

    # work on an independent copy so the new columns are not written into a
    # slice of the full table
    info = info.iloc[startInd:endInd].copy()

    # feature columns start out as NaN and are filled in per image below
    featureColumns = ['pixelsx', 'pixelsy', 'size_bytes',
                      'r_mean', 'r_med', 'r_std',
                      'g_mean', 'g_med', 'g_std',
                      'b_mean', 'b_med', 'b_std',
                      'h_mean', 'h_var',
                      's_mean', 's_std', 's_med',
                      'v_mean', 'v_std', 'v_med',
                      'is_grayscale']
    for column in featureColumns:
        info[column] = np.nan

    print('Job %s, starting getting image info for images %s-%s' % (jobNum, startInd, endInd-1))
    # wall-clock timing, same clock as getFeaturesParent (time.clock was removed in Python 3.8)
    startTime = time.time()

    for ind, i in enumerate(info.index.values):
        # built with %-formatting so a missing filename cannot raise here or
        # inside the error handler below
        imagePath = '%s/%s' % (mydir, info.loc[i, 'filename'])
        try:
            im = skimage.io.imread(imagePath)

            info.loc[i, 'pixelsx'] = im.shape[1]
            info.loc[i, 'pixelsy'] = im.shape[0]

            grayscale = (len(im.shape) == 2)

            if grayscale:
                # a single channel stands in for all three colour channels
                grayMean = im.mean()
                grayMed = np.median(im)
                grayStd = im.std()

                info.loc[i, 'r_mean'] = info.loc[i, 'g_mean'] = info.loc[i, 'b_mean'] = grayMean
                info.loc[i, 'r_med'] = info.loc[i, 'g_med'] = info.loc[i, 'b_med'] = grayMed
                info.loc[i, 'r_std'] = info.loc[i, 'g_std'] = info.loc[i, 'b_std'] = grayStd

                info.loc[i, 'is_grayscale' ] = 1

                # no hue or saturation in a grayscale image
                info.loc[i, 'h_mean'] = 0
                info.loc[i, 'h_var'] = 0
                info.loc[i, 's_mean'] = 0
                info.loc[i, 's_std'] = 0
                info.loc[i, 's_med'] = 0
                # NOTE(review): scaled by 256 here, while the colour branch takes
                # value straight from rgb2hsv -- presumably 8-bit input; confirm
                # the two scalings are meant to agree.
                info.loc[i, 'v_mean'] = grayMean/256.0
                info.loc[i, 'v_std'] = grayStd/256.0
                info.loc[i, 'v_med'] = grayMed/256.0

            else:
                info.loc[i, 'r_mean'] = im[:,:,0].mean()
                info.loc[i, 'g_mean'] = im[:,:,1].mean()
                info.loc[i, 'b_mean'] = im[:,:,2].mean()
                info.loc[i, 'r_med'] = np.median(im[:,:,0])
                info.loc[i, 'g_med'] = np.median(im[:,:,1])
                info.loc[i, 'b_med'] = np.median(im[:,:,2])
                info.loc[i, 'r_std'] = im[:,:,0].std()
                info.loc[i, 'g_std'] = im[:,:,1].std()
                info.loc[i, 'b_std'] = im[:,:,2].std()
                info.loc[i, 'is_grayscale' ] = 0

                #if it is in RGBA, we don't handle it for now (HSV columns stay NaN)
                if (len(im.shape) == 3) and im.shape[2] == 4:
                    print('%s is rgba' % info.loc[i, 'filename'])
                else:
                    # convert image to hue/saturation/value
                    hsvImage = skimage.color.rgb2hsv(im)
                    angles = hsvImage[:,:,0] * 2.0 * np.pi

                    # average hue: treat each (0-1) hue as a unit vector and
                    # take the direction of the vector sum. arctan2 keeps the
                    # correct quadrant (result in (-pi, pi]) and does not
                    # divide by cosSum, which may be zero.
                    sinSum = np.sin(angles).sum()
                    cosSum = np.cos(angles).sum()
                    info.loc[i, 'h_mean'] = np.arctan2(sinSum, cosSum)

                    # use the variance formula for a circular distribution:
                    # 1 - (length of the mean resultant vector)
                    R2 = np.power(sinSum, 2) + np.power(cosSum, 2)
                    numPixels = info.loc[i, 'pixelsx'] * info.loc[i, 'pixelsy']
                    R_bar = np.sqrt(R2)/numPixels
                    info.loc[i, 'h_var'] = 1 - R_bar

                    saturation = hsvImage[:,:,1]
                    value = hsvImage[:,:,2]

                    # *_std holds the standard deviation and *_med the median
                    info.loc[i, 's_mean'] = saturation.mean()
                    info.loc[i, 's_std'] = saturation.std()
                    info.loc[i, 's_med'] = np.median(saturation)
                    info.loc[i, 'v_mean'] = value.mean()
                    info.loc[i, 'v_std'] = value.std()
                    info.loc[i, 'v_med'] = np.median(value)

            info.loc[i, 'size_bytes'] = os.path.getsize(imagePath)
            if (ind+1)%100==0:
                currentTime = time.time()
                print('Job %s, finished %s of %s, total time = %.2f min' %
                     (jobNum, (ind+1), len(info), (currentTime - startTime)/60.0))
        except Exception:
            # best effort: report the failing image and carry on with the rest
            print('job %s - error in %s' % (jobNum, imagePath))
            traceback.print_exc()

    currentTime = time.time()
    print('- Job %s, finished getting image info, total time = %.2f min' % ( jobNum, (currentTime - startTime) / 60.0))

    return info

    #return info.rename(columns={'filename' : 'new_filename'})

### load image info

In [14]:
# load the training image info and the submission image info tables;
# index_col=0 makes the first CSV column the DataFrame index
train_info = pd.read_csv(r'/data/training_data/train_info.csv', index_col=0)
submission_info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col=0)

#shuffle and save info
#train_info = train_info.iloc[np.random.permutation(len(train_info))]
#submission_info = submission_info.iloc[np.random.permutation(len(submission_info))]

#train_info.to_csv(r'/data/training_data/train_info.csv')
#submission_info.to_csv(r'/data/test_data/submission_info.csv')

### create submission image info from submission pairs

In [6]:
# submission image data is a bunch of images pairs, but we may want to work with a list of test images instead

#submission_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')
#images = list(set(list(submission_info.image1.unique()) + list(submission_info.image2.unique())))
#submission_info = pd.DataFrame(data=images, columns=['filename'])
#submission_info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col = 0)

### load test pairs

In [134]:
test_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')

### make training pairs

In [None]:
#make training pairs
#train_pairs = make_pairs(train_image_info)
#train_pairs[ 'sameArtist' ] = train_pairs[ 'artist1' ] == train_pairs[ 'artist2' ]

### save training pairs

In [None]:
#save as csv
#train_pairs.to_csv(r'/data/training_data/train_pairs.csv')

### load pairs

In [4]:
#load pairs
train_pairs = pd.read_csv(r'/data/training_data/train_pairs.csv', index_col = 0)
#submission_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')

In [5]:
len(train_pairs)

4737102

### shuffle pairs, reduce number if necessary

In [87]:
# draw one random ordering of the rows and keep (at most) its first 2,000,000 positions
train_pairs = train_pairs.iloc[np.random.permutation(len(train_pairs))[:2000000]]

### compute features

In [34]:
# compute the image features for the test images (isTest=True) and report
# the wall-clock time the whole run took
print('Begin computing features')
startTime = time.time()

test_features = getFeaturesParent(True)

endTime = time.time()
print("Finished computing features, time taken = %.2f min" % ((endTime-startTime)/60.0) )

Begin computing features
Launching 35 jobs
Job 3, starting getting image info for images 2040-2719
Job 24, starting getting image info for images 16320-16999
Job 18, starting getting image info for images 12240-12919
Job 12, starting getting image info for images 8160-8839
Job 7, starting getting image info for images 4760-5439
Job 16, starting getting image info for images 10880-11559
Job 23, starting getting image info for images 15640-16319
Job 13, starting getting image info for images 8840-9519
Job 10, starting getting image info for images 6800-7479
Job 6, starting getting image info for images 4080-4759
Job 14, starting getting image info for images 9520-10199
Job 11, starting getting image info for images 7480-8159
Job 0, starting getting image info for images 0-679
Job 33, starting getting image info for images 22440-23119
Job 15, starting getting image info for images 10200-10879
Job 1, starting getting image info for images 680-1359
Job 31, starting getting image info for im

Traceback (most recent call last):
  File "<ipython-input-33-badd30fc032f>", line 69, in getFeaturesWorker
    info.loc[i, 'pixelsx'] = im.shape[1]
IndexError: tuple index out of range


32071.jpg is rgba
Job 11, finished 100 of 680, total time = 1.59 min
Job 22, finished 100 of 680, total time = 1.68 min
Job 32, finished 100 of 680, total time = 1.70 min
Job 34, finished 100 of 697, total time = 1.80 min
Job 29, finished 100 of 680, total time = 1.82 min
Job 20, finished 100 of 680, total time = 1.82 min
Job 6, finished 100 of 680, total time = 1.83 min
Job 8, finished 100 of 680, total time = 1.90 min
Job 31, finished 100 of 680, total time = 1.97 min
Job 23, finished 100 of 680, total time = 1.99 min
Job 13, finished 100 of 680, total time = 2.00 min
Job 12, finished 100 of 680, total time = 2.02 min
Job 3, finished 100 of 680, total time = 2.04 min
Job 4, finished 100 of 680, total time = 2.05 min
Job 14, finished 100 of 680, total time = 2.09 min
Job 26, finished 100 of 680, total time = 2.11 min
Job 30, finished 100 of 680, total time = 2.14 min
Job 27, finished 100 of 680, total time = 2.17 min
Job 9, finished 100 of 680, total time = 2.18 min
Job 1, finished 10

Traceback (most recent call last):
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_plugins/pil_plugin.py", line 53, in pil_to_ndarray
    im.getdata()[0]
  File "/data/anaconda/lib/python3.5/site-packages/PIL/Image.py", line 1151, in getdata
    self.load()
  File "/data/anaconda/lib/python3.5/site-packages/PIL/ImageFile.py", line 218, in load
    "(%d bytes not processed)" % len(b))
OSError: image file is truncated (79 bytes not processed)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-33-badd30fc032f>", line 67, in getFeaturesWorker
    im = skimage.data.imread(mydir + '/' + info.loc[i, 'filename'])
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_io.py", line 61, in imread
    img = call_plugin('imread', fname, plugin=plugin, **plugin_args)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/manage_plugins.py", line 211, in call_plugin
    return func(*args, **kwar

Job 33, finished 100 of 680, total time = 2.34 min
Job 21, finished 100 of 680, total time = 2.37 min
Job 7, finished 100 of 680, total time = 2.41 min
Job 24, finished 100 of 680, total time = 2.43 min
Job 17, finished 100 of 680, total time = 2.47 min
Job 19, finished 100 of 680, total time = 2.50 min
91447.jpg is rgba
Job 10, finished 100 of 680, total time = 2.77 min
Job 25, finished 100 of 680, total time = 2.91 min
Job 5, finished 100 of 680, total time = 3.08 min
Job 18, finished 100 of 680, total time = 3.10 min
Job 11, finished 200 of 680, total time = 3.17 min
Job 22, finished 200 of 680, total time = 3.42 min
Job 3, finished 200 of 680, total time = 3.67 min
Job 14, finished 200 of 680, total time = 3.73 min
Job 16, finished 100 of 680, total time = 3.75 min
Job 6, finished 200 of 680, total time = 3.82 min
Job 31, finished 200 of 680, total time = 3.84 min
Job 12, finished 200 of 680, total time = 3.89 min
Job 0, finished 100 of 680, total time = 3.89 min
Job 32, finished 2



Job 20, finished 200 of 680, total time = 4.52 min
Job 9, finished 200 of 680, total time = 4.55 min
Job 15, finished 200 of 680, total time = 4.66 min
Job 19, finished 200 of 680, total time = 4.70 min
Job 28, finished 200 of 680, total time = 4.77 min
Job 33, finished 200 of 680, total time = 4.78 min
Job 17, finished 200 of 680, total time = 4.79 min
Job 5, finished 200 of 680, total time = 4.84 min
Job 26, finished 200 of 680, total time = 4.86 min
Job 18, finished 200 of 680, total time = 4.88 min
Job 7, finished 200 of 680, total time = 4.98 min
Job 22, finished 300 of 680, total time = 5.13 min
Job 21, finished 200 of 680, total time = 5.14 min
Job 12, finished 300 of 680, total time = 5.29 min
Job 11, finished 300 of 680, total time = 5.38 min
Job 1, finished 300 of 680, total time = 5.46 min
Job 14, finished 300 of 680, total time = 5.52 min
25416.jpg is rgba
Job 23, finished 300 of 680, total time = 5.62 min
89073.jpg is rgba
Job 8, finished 300 of 680, total time = 5.79 min


Traceback (most recent call last):
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_plugins/pil_plugin.py", line 53, in pil_to_ndarray
    im.getdata()[0]
  File "/data/anaconda/lib/python3.5/site-packages/PIL/Image.py", line 1151, in getdata
    self.load()
  File "/data/anaconda/lib/python3.5/site-packages/PIL/ImageFile.py", line 218, in load
    "(%d bytes not processed)" % len(b))
OSError: image file is truncated (0 bytes not processed)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-33-badd30fc032f>", line 67, in getFeaturesWorker
    im = skimage.data.imread(mydir + '/' + info.loc[i, 'filename'])
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_io.py", line 61, in imread
    img = call_plugin('imread', fname, plugin=plugin, **plugin_args)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/manage_plugins.py", line 211, in call_plugin
    return func(*args, **kwarg

Job 21, finished 500 of 680, total time = 11.96 min
Job 12, finished 600 of 680, total time = 11.97 min
- Job 14, finished getting image info, total time = 12.09 min
Job 34, finished 500 of 697, total time = 12.12 min
Job 32, finished 600 of 680, total time = 12.13 min
Job 33, finished 600 of 680, total time = 12.18 min
- Job 27, finished getting image info, total time = 12.18 min
Job 28, finished 500 of 680, total time = 12.27 min
Job 24, finished 500 of 680, total time = 12.30 min
Job 1, finished 600 of 680, total time = 12.36 min
Job 6, finished 600 of 680, total time = 12.43 min
Job 10, finished 600 of 680, total time = 12.50 min
Job 5, finished 600 of 680, total time = 12.54 min
- Job 22, finished getting image info, total time = 12.61 min
Job 8, finished 600 of 680, total time = 12.63 min
93262.jpg is rgba
Job 30, finished 600 of 680, total time = 12.94 min
Job 4, finished 600 of 680, total time = 12.92 min
- Job 11, finished getting image info, total time = 12.93 min
Job 0, fini

Traceback (most recent call last):
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_plugins/pil_plugin.py", line 53, in pil_to_ndarray
    im.getdata()[0]
  File "/data/anaconda/lib/python3.5/site-packages/PIL/Image.py", line 1151, in getdata
    self.load()
  File "/data/anaconda/lib/python3.5/site-packages/PIL/ImageFile.py", line 218, in load
    "(%d bytes not processed)" % len(b))
OSError: image file is truncated (57 bytes not processed)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-33-badd30fc032f>", line 67, in getFeaturesWorker
    im = skimage.data.imread(mydir + '/' + info.loc[i, 'filename'])
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/_io.py", line 61, in imread
    img = call_plugin('imread', fname, plugin=plugin, **plugin_args)
  File "/data/anaconda/lib/python3.5/site-packages/skimage/io/manage_plugins.py", line 211, in call_plugin
    return func(*args, **kwar

Job 13, finished 600 of 680, total time = 13.55 min
Job 18, finished 600 of 680, total time = 13.59 min
Job 20, finished 600 of 680, total time = 13.58 min
Job 24, finished 600 of 680, total time = 13.74 min
Job 25, finished 500 of 680, total time = 13.77 min
- Job 33, finished getting image info, total time = 13.82 min
69423.jpg is rgba
- Job 4, finished getting image info, total time = 13.85 min
- Job 1, finished getting image info, total time = 13.85 min
Job 28, finished 600 of 680, total time = 13.94 min
30623.jpg is rgba
- Job 5, finished getting image info, total time = 14.00 min
- Job 8, finished getting image info, total time = 14.01 min
- Job 6, finished getting image info, total time = 14.04 min
Job 19, finished 600 of 680, total time = 14.11 min
- Job 30, finished getting image info, total time = 14.14 min
- Job 15, finished getting image info, total time = 14.20 min
- Job 9, finished getting image info, total time = 14.20 min
- Job 32, finished getting image info, total tim

### save features

In [38]:
#test_features.to_csv(r'/data/test_data/test_features.csv')
#test_features_sorted.to_csv(r'/data/test_data/test_features.csv')
#y = pd.read_csv(r'/data/test_data/test_features.csv', index_col=0)
#test_features_sorted = test_features_sorted.drop('Unnamed: 0',1)

In [43]:
#train_features.to_csv(r'/data/training_data/train_features_40000_end.csv')
#test_features_sorted.to_csv(r'/data/test_data/test_features.csv')
#y = pd.read_csv(r'/data/test_data/test_features.csv', index_col=0)
#test_features_sorted = test_features_sorted.drop('Unnamed: 0',1)

### load training features

In [67]:
#load features
train_features_0 = pd.read_csv(r'/data/training_data/train_features_0_20000.csv', index_col = 0)
train_features_1 = pd.read_csv(r'/data/training_data/train_features_20000_40000.csv', index_col = 0)
train_features_2 = pd.read_csv(r'/data/training_data/train_features_40000_end.csv', index_col = 0)

In [68]:
train_features = pd.concat([train_features_0, train_features_1, train_features_2])

### load test features

In [6]:
test_features = pd.read_csv(r'/data/test_data/test_features.csv', index_col = 0)

In [49]:
# Index.drop returns a new Index instead of modifying in place, so the result
# has to be kept; otherwise `cols` still contains 'is_grayscale'
cols = test_features.columns.drop('is_grayscale')

cols

Index(['filename', 'pixelsx', 'pixelsy', 'size_bytes', 'r_mean', 'r_med',
       'r_std', 'g_mean', 'g_med', 'g_std', 'b_mean', 'b_med', 'b_std',
       'h_mean', 'h_var', 's_mean', 's_std', 's_med', 'v_mean', 'v_std',
       'v_med'],
      dtype='object')

In [53]:
# preview of mean-imputing the selected columns (the result is displayed, not stored);
# numeric_only skips text columns such as 'filename' when taking the means
test_features[cols].fillna(test_features[cols].mean(numeric_only=True))

In [9]:
len(test_features.columns)

63

### additional processing on feature - remove extra columns

In [56]:
# saved features are straight from the feature functions with no handling of nulls, etc
# these have to be addressed prior to training/predicting

#rgb_features = ['r_mean', 'r_med', 'r_std', 'g_mean', 'g_med',
#                'g_std', 'b_mean', 'b_med', 'b_std',]

#size_features =  [ 'pixelsx',
#       'pixelsy', 'size_bytes' ]


# take out unnecessary columns
def removeColumns(features):
    """Return `features` restricted to 'filename' plus the image feature columns.

    The input frame is left untouched, so the result has to be assigned, e.g.
    ``train_features = removeColumns(train_features)``.

    Args:
        features: DataFrame containing 'filename' and every column named in
            feature_names below (extra columns are allowed and are dropped)
    Returns:
        a new DataFrame with the columns ['filename'] + feature_names, in that order
    Raises:
        KeyError: if any of the expected columns is missing
    """
    feature_names = ['pixelsx', 'pixelsy', 'size_bytes',
                     'r_mean', 'r_med', 'r_std',
                     'g_mean', 'g_med', 'g_std',
                     'b_mean', 'b_med', 'b_std',
                     'h_mean', 'h_var',
                     's_mean', 's_med', 's_std',
                     'v_mean', 'v_med', 'v_std',
                     'is_grayscale']

    # rebinding the local name (as before) changed nothing for the caller;
    # hand the reduced frame back instead
    return features[['filename'] + feature_names].copy()
    

In [96]:
removeColumns(train_features)

In [98]:
train_features.columns

Index(['filename', 'artist', 'title', 'style', 'genre', 'date', 'pixelsx',
       'pixelsy', 'size_bytes', 'r_mean', 'r_med', 'r_std', 'g_mean', 'g_med',
       'g_std', 'b_mean', 'b_med', 'b_std', 'h_mean', 'h_var', 's_mean',
       's_std', 's_med', 'v_mean', 'v_std', 'v_med', 'is_grayscale',
       'aspect_ratio', 'size_per_pixel'],
      dtype='object')

### feature processing - add image features

In [58]:
def addImageFeatures(features):
    """Adds derived size columns to `features` (modifies in place, returns nothing).

    New columns:
        aspect_ratio: pixelsx divided by pixelsy
        size_per_pixel: size_bytes divided by pixelsx, then by pixelsy
    """
    width = features['pixelsx']
    height = features['pixelsy']

    features['aspect_ratio'] = width / height
    features['size_per_pixel'] = (features['size_bytes'] / width) / height

In [71]:
addImageFeatures(train_features)

In [59]:
addImageFeatures(test_features)

In [72]:
train_features.columns

Index(['filename', 'artist', 'title', 'style', 'genre', 'date', 'pixelsx',
       'pixelsy', 'size_bytes', 'r_mean', 'r_med', 'r_std', 'g_mean', 'g_med',
       'g_std', 'b_mean', 'b_med', 'b_std', 'h_mean', 'h_var', 's_mean',
       's_std', 's_med', 'v_mean', 'v_std', 'v_med', 'is_grayscale',
       'aspect_ratio', 'size_per_pixel'],
      dtype='object')

### Join features to pairs

In [88]:
#join pair data to image features

def joinPairsToFeatures(pairs, features):
    """Attach the per-image features of both images to every pair.

    `features` is merged in twice on its 'filename' column: first against
    'image1', then against 'image2'. After each merge the feature columns
    listed below get a '_1' (first image) or '_2' (second image) suffix.
    Any other column of `features` keeps its name after the first merge and
    therefore receives pandas' default '_x' / '_y' suffixes in the second.

    Args:
        pairs: DataFrame with 'image1' and 'image2' filename columns
        features: DataFrame with a 'filename' column plus the feature columns
    Returns:
        a new DataFrame holding the merged rows; `pairs` itself is not modified
    """
    feature_base_names = ('pixelsx', 'pixelsy', 'size_bytes', 'aspect_ratio', 'size_per_pixel',
                          'r_mean', 'r_med', 'r_std',
                          'g_mean', 'g_med', 'g_std',
                          'b_mean', 'b_med', 'b_std',
                          'h_mean', 'h_var',
                          's_mean', 's_med', 's_std',
                          'v_mean', 'v_med', 'v_std',
                          'is_grayscale')

    first_image_names = {name: name + '_1' for name in feature_base_names}
    second_image_names = {name: name + '_2' for name in feature_base_names}

    with_first = pairs.merge(features, left_on='image1', right_on='filename')
    with_first = with_first.rename(columns=first_image_names)

    with_both = with_first.merge(features, left_on='image2', right_on='filename')
    return with_both.rename(columns=second_image_names)

In [89]:
train_pairs = joinPairsToFeatures(train_pairs, train_features)

In [91]:
train_pairs.columns

Index(['artist1', 'image1', 'artist2', 'image2', 'sameArtist', 'filename_x',
       'artist_x', 'title_x', 'style_x', 'genre_x', 'date_x', 'pixelsx_1',
       'pixelsy_1', 'size_bytes_1', 'r_mean_1', 'r_med_1', 'r_std_1',
       'g_mean_1', 'g_med_1', 'g_std_1', 'b_mean_1', 'b_med_1', 'b_std_1',
       'h_mean_1', 'h_var_1', 's_mean_1', 's_std_1', 's_med_1', 'v_mean_1',
       'v_std_1', 'v_med_1', 'is_grayscale_1', 'aspect_ratio_1',
       'size_per_pixel_1', 'filename_y', 'artist_y', 'title_y', 'style_y',
       'genre_y', 'date_y', 'pixelsx_2', 'pixelsy_2', 'size_bytes_2',
       'r_mean_2', 'r_med_2', 'r_std_2', 'g_mean_2', 'g_med_2', 'g_std_2',
       'b_mean_2', 'b_med_2', 'b_std_2', 'h_mean_2', 'h_var_2', 's_mean_2',
       's_std_2', 's_med_2', 'v_mean_2', 'v_std_2', 'v_med_2',
       'is_grayscale_2', 'aspect_ratio_2', 'size_per_pixel_2'],
      dtype='object')

In [136]:
test_pairs = joinPairsToFeatures(test_pairs, test_features)

In [140]:
test_pairs = test_pairs.sort_values(by='index')

In [145]:
test_pairs.set_index('index', inplace = True)

In [149]:
test_pairs[0:10]

Index(['image1', 'image2', 'filename_x', 'pixelsx_1', 'pixelsy_1',
       'size_bytes_1', 'r_mean_1', 'r_med_1', 'r_std_1', 'g_mean_1', 'g_med_1',
       'g_std_1', 'b_mean_1', 'b_med_1', 'b_std_1', 'h_mean_1', 'h_var_1',
       's_mean_1', 's_std_1', 's_med_1', 'v_mean_1', 'v_std_1', 'v_med_1',
       'is_grayscale_1', 'aspect_ratio_1', 'size_per_pixel_1', 'filename_y',
       'pixelsx_2', 'pixelsy_2', 'size_bytes_2', 'r_mean_2', 'r_med_2',
       'r_std_2', 'g_mean_2', 'g_med_2', 'g_std_2', 'b_mean_2', 'b_med_2',
       'b_std_2', 'h_mean_2', 'h_var_2', 's_mean_2', 's_std_2', 's_med_2',
       'v_mean_2', 'v_std_2', 'v_med_2', 'is_grayscale_2', 'aspect_ratio_2',
       'size_per_pixel_2'],
      dtype='object')

In [92]:
test_pairs.columns

Index(['index', 'image1', 'image2', 'filename_x', 'pixelsx_1', 'pixelsy_1',
       'size_bytes_1', 'r_mean_1', 'r_med_1', 'r_std_1', 'g_mean_1', 'g_med_1',
       'g_std_1', 'b_mean_1', 'b_med_1', 'b_std_1', 'h_mean_1', 'h_var_1',
       's_mean_1', 's_std_1', 's_med_1', 'v_mean_1', 'v_std_1', 'v_med_1',
       'is_grayscale_1', 'aspect_ratio_1', 'size_per_pixel_1', 'filename_y',
       'pixelsx_2', 'pixelsy_2', 'size_bytes_2', 'r_mean_2', 'r_med_2',
       'r_std_2', 'g_mean_2', 'g_med_2', 'g_std_2', 'b_mean_2', 'b_med_2',
       'b_std_2', 'h_mean_2', 'h_var_2', 's_mean_2', 's_std_2', 's_med_2',
       'v_mean_2', 'v_std_2', 'v_med_2', 'is_grayscale_2', 'aspect_ratio_2',
       'size_per_pixel_2'],
      dtype='object')

### remove nulls in training

In [93]:
# rows missing any of these features for either image are dropped
# (done here after the join; it could equally be done before)
train_pairs = train_pairs.dropna(subset=['pixelsx_1', 'pixelsx_2',
                                         'size_bytes_1', 'size_bytes_2',
                                         'h_mean_1', 'h_mean_2'])

# remaining null count per column, then the grand total
print(train_pairs.isnull().sum())
print(train_pairs.isnull().sum().sum())

artist1                  0
image1                   0
artist2                  0
image2                   0
sameArtist               0
filename_x               0
artist_x                 0
title_x                 38
style_x              11237
genre_x              10732
date_x              487163
pixelsx_1                0
pixelsy_1                0
size_bytes_1             0
r_mean_1                 0
r_med_1                  0
r_std_1                  0
g_mean_1                 0
g_med_1                  0
g_std_1                  0
b_mean_1                 0
b_med_1                  0
b_std_1                  0
h_mean_1                 0
h_var_1                  0
s_mean_1                 0
s_std_1                  0
s_med_1                  0
v_mean_1                 0
v_std_1                  0
                     ...  
size_per_pixel_1         0
filename_y               0
artist_y                 0
title_y                220
style_y              15034
genre_y              17095
d

### check no nulls in submission

In [150]:
print(test_pairs.isnull().sum())
print(test_pairs.isnull().sum().sum())

image1              0
image2              0
filename_x          0
pixelsx_1           0
pixelsy_1           0
size_bytes_1        0
r_mean_1            0
r_med_1             0
r_std_1             0
g_mean_1            0
g_med_1             0
g_std_1             0
b_mean_1            0
b_med_1             0
b_std_1             0
h_mean_1            0
h_var_1             0
s_mean_1            0
s_std_1             0
s_med_1             0
v_mean_1            0
v_std_1             0
v_med_1             0
is_grayscale_1      0
aspect_ratio_1      0
size_per_pixel_1    0
filename_y          0
pixelsx_2           0
pixelsy_2           0
size_bytes_2        0
r_mean_2            0
r_med_2             0
r_std_2             0
g_mean_2            0
g_med_2             0
g_std_2             0
b_mean_2            0
b_med_2             0
b_std_2             0
h_mean_2            0
h_var_2             0
s_mean_2            0
s_std_2             0
s_med_2             0
v_mean_2            0
v_std_2   

### feature processing - add diff features to pairs matrix

In [108]:
## try training on diffs instead of absolute values

def addPairFeatures(pairs):
    """ Add absolute-difference colour features to a pairs dataframe, in place.

        For every base colour statistic (e.g. 'r_mean') a new column '<base>_diff' is
        written, holding |<base>_1 - <base>_2| for each row.

        Args:
            pairs: DataFrame that already contains the '<base>_1' and '<base>_2' columns
                   for every base feature listed below. It is modified in place.
        Returns:
            None
        Raises:
            KeyError: if one of the required '<base>_1' / '<base>_2' columns is missing
    """
    diff_feature_base = [
                     'r_mean', 'r_med', 'r_std',
                     'g_mean', 'g_med', 'g_std',
                     'b_mean', 'b_med', 'b_std',
                     'h_mean', 'h_var',
                     's_mean', 's_med', 's_std',
                     'v_mean', 'v_med', 'v_std', ]

    for diff_feature in diff_feature_base:
        first_image = pairs[diff_feature + '_1']
        second_image = pairs[diff_feature + '_2']
        # the sign of the difference carries no information (pair order is arbitrary),
        # so only its magnitude is stored
        pairs[diff_feature + '_diff'] = (first_image - second_image).abs()

In [None]:
# writes the '<feature>_diff' columns into test_pairs in place
addPairFeatures(test_pairs)

In [None]:
# writes the '<feature>_diff' columns into train_pairs in place
addPairFeatures(train_pairs)

### results helper

In [99]:
def computePredictStats(y_prob, y_true, threshold = 0.5):
    """ compute accuracy, precision, recall, negative precision, specificity, and auc roc
        Args:
            y_prob: array of floats from 0.0 - 1.0
            y_true: array of booleans
            threshold: true/false threshold value, between 0.0-1.0; a row is predicted
                       True when its probability is strictly greater than the threshold
        Returns:
            dict of classification metrics with keys 'accuracy', 'precision', 'recall',
            'npp' (negative predictive value), 'specificity', 'roc', plus the raw
            confusion-matrix counts 'true_pos', 'true_neg', 'false_pos', 'false_neg'.
            A ratio whose denominator is zero (e.g. precision when nothing is predicted
            True) is reported as NaN.
    """
    def _ratio(numerator, denominator):
        # an empty denominator makes the metric undefined; report NaN rather than
        # raising ZeroDivisionError or emitting a numpy divide warning
        return numerator / denominator if denominator else float('nan')

    # work on plain arrays so the comparisons are positional and vectorised
    y_prob = np.asarray(y_prob)
    y_true = np.asarray(y_true)

    y_pred = y_prob > threshold
    actual_pos = (y_true == True)
    actual_neg = (y_true == False)

    total = len(y_prob)
    # count_nonzero runs in C; the builtin sum() would iterate row by row in Python
    true_pos = int(np.count_nonzero(y_pred & actual_pos))
    true_neg = int(np.count_nonzero(~y_pred & actual_neg))
    false_pos = int(np.count_nonzero(y_pred & actual_neg))
    false_neg = int(np.count_nonzero(~y_pred & actual_pos))

    accuracy = _ratio(true_pos + true_neg, total)
    precision = _ratio(true_pos, true_pos + false_pos)
    recall = _ratio(true_pos, true_pos + false_neg)
    npp = _ratio(true_neg, true_neg + false_neg) #negative prediction value
    specificity = _ratio(true_neg, true_neg + false_pos)
    roc = roc_auc_score(y_true, y_prob)

    return { 'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'npp': npp,
            'specificity': specificity,
            'roc': roc,
            'true_pos': true_pos,
            'true_neg': true_neg,
            'false_pos': false_pos,
            'false_neg': false_neg,
           }

### split data into X and Y

In [100]:
# list every column of train_pairs before choosing which ones become model features
train_pairs.columns

Index(['artist1', 'image1', 'artist2', 'image2', 'sameArtist', 'filename_x',
       'artist_x', 'title_x', 'style_x', 'genre_x', 'date_x', 'pixelsx_1',
       'pixelsy_1', 'size_bytes_1', 'r_mean_1', 'r_med_1', 'r_std_1',
       'g_mean_1', 'g_med_1', 'g_std_1', 'b_mean_1', 'b_med_1', 'b_std_1',
       'h_mean_1', 'h_var_1', 's_mean_1', 's_std_1', 's_med_1', 'v_mean_1',
       'v_std_1', 'v_med_1', 'is_grayscale_1', 'aspect_ratio_1',
       'size_per_pixel_1', 'filename_y', 'artist_y', 'title_y', 'style_y',
       'genre_y', 'date_y', 'pixelsx_2', 'pixelsy_2', 'size_bytes_2',
       'r_mean_2', 'r_med_2', 'r_std_2', 'g_mean_2', 'g_med_2', 'g_std_2',
       'b_mean_2', 'b_med_2', 'b_std_2', 'h_mean_2', 'h_var_2', 's_mean_2',
       's_std_2', 's_med_2', 'v_mean_2', 'v_std_2', 'v_med_2',
       'is_grayscale_2', 'aspect_ratio_2', 'size_per_pixel_2'],
      dtype='object')

In [106]:
def featureHelper(feature_name):
    """ Decide whether a column is used as a model feature.
        Args:
            feature_name: column name to test
        Returns:
            True if the name contains one of the accepted markers, otherwise False
    """
    # 'aspect_ratio' and '_diff' are intentionally not in this list (currently disabled)
    accepted_markers = ('pixel', 'size_bytes', 'size_per_pixel')
    return any(marker in feature_name for marker in accepted_markers)

In [107]:
# feature (X) columns are exactly the train_pairs columns that featureHelper accepts
allCols = train_pairs.columns
isX = [featureHelper(column_name) for column_name in allCols]
X_columns = allCols[isX]

# features on one side, the sameArtist label on the other
train_X = train_pairs[X_columns]
train_Y = train_pairs['sameArtist']

In [151]:
# apply the same featureHelper selection to the test pairs
allCols = test_pairs.columns
isX = [featureHelper(column_name) for column_name in allCols]
X_columns = allCols[isX]

test_X = test_pairs[X_columns]

In [153]:
# feature columns selected for training
train_X.columns

Index(['pixelsx_1', 'pixelsy_1', 'size_bytes_1', 'size_per_pixel_1',
       'pixelsx_2', 'pixelsy_2', 'size_bytes_2', 'size_per_pixel_2'],
      dtype='object')

In [152]:
# feature columns selected for the test set
test_X.columns

Index(['pixelsx_1', 'pixelsy_1', 'size_bytes_1', 'size_per_pixel_1',
       'pixelsx_2', 'pixelsy_2', 'size_bytes_2', 'size_per_pixel_2'],
      dtype='object')

In [154]:
# total number of missing values in test_X (summed per column, then across columns)
pd.isnull(test_X).sum().sum()

0

### k-fold CV

In [110]:
def kFoldCV(train_X, train_Y, k, numFoldsToTest):
    """ Perform k-fold cross validation with a random forest classifier.

        Rows are NOT shuffled here: the folds are contiguous slices of train_X taken in
        its existing row order (the last fold also absorbs the remainder rows), so
        shuffle beforehand if the row order is meaningful. Rows are selected by index
        label via .loc, so train_Y must carry the same index labels as train_X.

        Args:
            train_X: Dataframe of feature columns, one row per training pair. Should have no nulls.
            train_Y: Series of labels, indexed like train_X
            k: number of folds; values below 2 are raised to 2
            numFoldsToTest: how many folds to actually test, at most k
        Returns:
            dataframe with one row per tested fold and ('train'|'test', stat) columns,
            where stat is one of 'roc', 'precision', 'recall', 'npp', 'specificity'
    """

    # clamp k first so numFoldsToTest is limited by the fold count that is really used
    k = max(2, k)
    numFoldsToTest = min(k, numFoldsToTest)

    # positional [start, stop) bounds of each fold; the fold size depends on k
    # (a fixed fifth of the data would give uneven or empty folds for any k other than 5)
    numRows = len(train_X)
    foldSize = numRows // k
    foldBounds = []
    for foldNum in range(k):
        foldStart = foldNum * foldSize
        foldStop = numRows if foldNum == k-1 else foldStart + foldSize
        foldBounds.append( (foldStart, foldStop) )

    # set up dataframe for collecting results
    statNames = ('roc', 'precision', 'recall', 'npp', 'specificity')
    columnsList = list(itertools.product(('train', 'test'), statNames))
    results = pd.DataFrame(index = range(numFoldsToTest), columns = pd.MultiIndex.from_tuples(columnsList))

    rowLabels = train_X.index

    # test each fold
    for testNum in range(numFoldsToTest):

        # the held-out fold is one contiguous slice; training rows are everything around it
        testStart, testStop = foldBounds[testNum]
        testLocs = rowLabels[testStart:testStop]
        trainLocs = rowLabels[:testStart].append( rowLabels[testStop:] )

        # set up Xs
        CV_train_X = train_X.loc[trainLocs]
        CV_test_X = train_X.loc[testLocs]

        # set up Ys
        CV_train_Y = train_Y.loc[trainLocs]
        CV_test_Y = train_Y.loc[testLocs]

        # fit model
        clf = RandomForestClassifier(n_estimators=50, min_samples_split = 15, n_jobs=30)

        start = time.time()

        print('starting fit.. ', end='')

        clf.fit(CV_train_X, CV_train_Y)

        end = time.time()

        print('total training time: %s' % (end - start) )

        # get in-sample and out-of-sample results
        # (column 1 of predict_proba is passed on as the positive-class probability)
        pred_train = clf.predict_proba(CV_train_X)[:,1]
        train_results = computePredictStats( pred_train, CV_train_Y)

        pred_test = clf.predict_proba(CV_test_X)[:,1]
        test_results = computePredictStats( pred_test, CV_test_Y)

        for stat in statNames:
            results.loc[testNum, ('train', stat)] = train_results[stat]
            results.loc[testNum, ('test', stat)] = test_results[stat]

    return results

### run k-fold cv

In [111]:
# fold configuration: split into k folds but only evaluate the first numFoldsToTest of them
k = 5
numFoldsToTest = 1
# NOTE(review): temp_X / temp_Y (copies of the first 600000 rows) are never used in this cell;
# the kFoldCV call below receives the full train_X / train_Y. Confirm which was intended.
temp_X = train_X.iloc[0:600000].copy()
temp_Y = train_Y.iloc[0:600000].copy()

results = kFoldCV(train_X, train_Y, k, numFoldsToTest)

# per-fold ROC AUC: in-sample ('train') next to out-of-sample ('test')
print(results[ [('train', 'roc'), ('test', 'roc')]])

# the same two columns averaged over the tested folds
print(results[ [('train', 'roc'), ('test', 'roc')]].mean())

starting fit.. total training time: 18.400565147399902
      train      test
        roc       roc
0  0.991157  0.865105
train  roc    0.991157
test   roc    0.865105
dtype: float64


In [112]:
# number of rows in the training feature matrix
len(train_X)

1997446

3971668     True
6516366    False
3162886     True
26492       True
6509114    False
3792170     True
7686620    False
816199     False
4731108    False
8577883     True
4310747     True
5744312    False
1543219    False
7599995     True
8887572     True
5856454     True
4697968     True
998308     False
2887861     True
4249195     True
6263214     True
8816829    False
583428     False
6884377    False
1787909    False
2662779    False
203534     False
1898273     True
2803991     True
1482595    False
           ...  
7523755    False
545249     False
7916729    False
7937393     True
8508417    False
7129101    False
7636851     True
9248563    False
4118804    False
6997358     True
6012088     True
9271512     True
8162212    False
5599927    False
4773303    False
6911696    False
3946205    False
6211267    False
3803618     True
6175038     True
3483517    False
4194113     True
5135775     True
5555872     True
4707122     True
9050879     True
282671     False
7906140    Fal

### full training

In [114]:
# final model: a 100-tree random forest fitted on all of train_X / train_Y
clf = RandomForestClassifier(n_estimators=100, min_samples_split=15, n_jobs=30)

start = time.time()

print('starting fit')
# train_X already holds only the columns selected by featureHelper, so it is passed as-is
clf.fit(train_X, train_Y)

end = time.time()

print('total training time: %s' % (end - start) )

#columnsList = list(itertools.product(('train', 'test'), ('roc', 'precision', 'recall', 'npp', 'specificity')))
#results = pd.DataFrame(index = range(numFoldsToTest), columns = pd.MultiIndex.from_tuples(columnsList))


starting fit
total training time: 37.411495208740234


### save model

In [115]:
##save model

# time.clock() was removed in Python 3.8 (and reported CPU time, not elapsed time, on Unix);
# perf_counter() is the documented replacement for measuring an interval
start = time.perf_counter()

# serialise the fitted classifier so it can be reloaded without retraining
with open('my_dumped_classifier.pkl', 'wb') as fid:
    pickle.dump(clf, fid)

end = time.perf_counter()
print('total saving time: %s' % (end - start) )

total saving time: 3.6660020000003897


### load model

In [28]:
# time.clock() was removed in Python 3.8 (and reported CPU time, not elapsed time, on Unix);
# perf_counter() is the documented replacement for measuring an interval
start = time.perf_counter()

# load it again
# NOTE: unpickling can execute arbitrary code - only load a pickle file this notebook wrote itself
with open('my_dumped_classifier.pkl', 'rb') as fid:
    clf = pickle.load(fid)

end = time.perf_counter()
print('total loading time: %s' % (end - start) )



total loading time: 0.007420999999339983


## test on test set

In [155]:
# keep column 1 of predict_proba for every row of test_X
# NOTE(review): presumably that column is P(sameArtist == True), i.e. clf.classes_[1] is True - confirm
test_predictions = clf.predict_proba(test_X)[:,1]

 ## prepare submission
    

In [166]:
# one predicted value per test pair, keyed by the test_pairs index, exported together with that index
submission = pd.DataFrame({'sameArtist': test_predictions}, index = test_pairs.index)
submission.to_csv('/data/notebook/notebooks/my_submission_01.csv', index=True)

In [158]:
# build the submission frame only (nothing is written to disk here)
submission = pd.DataFrame({'sameArtist': test_predictions}, index = test_pairs.index)

In [162]:
# preview the first five rows of the submission
submission.head(5)

Unnamed: 0_level_0,sameArtist
index,Unnamed: 1_level_1
0,0.146189
1,0.915134
2,0.062293
3,0.395387
4,0.117785


21916047