In [1]:
%matplotlib inline



In [2]:
%load_ext line_profiler

In [3]:
import os
import itertools
import random
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import multiprocessing
import time
import traceback

from PIL import Image
import skimage.data
import skimage.exposure
import skimage.color

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

import scipy.misc

## make pairs

In [4]:
def make_pairs(train_info):
    """Creates training data from the supplied training image information file"""
    artists = train_info.artist.unique()

    n = train_info.groupby('artist').size()
    n = (2*n**2).sum() 
    t = pd.DataFrame(np.zeros((n, 4)), columns=['artist1', 'image1', 'artist2', 'image2'])
    i = 0
    j = 0
    
    for artist in artists:

        #artist info is Ax2 matrix of artist, filename
        artistInfo = train_info[train_info.artist==artist][['artist', 'filename']].values
        
        use = train_info[train_info.artist != artist ].index.values
        np.random.shuffle(use)
        
        #nm = np.min([a.shape[0]**2, train_info[train_info.artist != m].shape[0] ])
        numExamples = np.min([len(artistInfo)**2, sum(train_info.artist != artist) ])
        use = use[0:numExamples]
        
        #diffArtistInfo a Bx2 matrix of artist, filename
        diffArtistInfo = train_info[train_info.artist!=artist][['artist', 'filename']].ix[use, :].values

        
        toAdd_SameArtist = pd.DataFrame(np.concatenate([  np.repeat(artistInfo[:, 0], len(artistInfo)).reshape((-1,1)), #artist
                                            np.repeat(artistInfo[:, 1],
                                            artistInfo.shape[0]).reshape((-1,1)),
                                            np.tile(artistInfo, (len(artistInfo), 1))],
                                         axis=1),
                          columns=['artist1', 'image1', 'artist2', 'image2'])
        toAdd_SameArtist = toAdd_SameArtist.loc[0:numExamples, :]
        
        toAdd_DiffArtist = pd.DataFrame(np.concatenate([np.tile(artistInfo,
                                                  (len(artistInfo), 1))[0:len(diffArtistInfo), :],
                                          diffArtistInfo], axis=1),
                          columns=['artist1', 'image1', 'artist2', 'image2'])
        toAdd_DiffArtist = toAdd_DiffArtist.loc[0:numExamples, :]
        
        #print(j, i, a2.shape[0], b2.shape[0])
        #print(b2)
        t.iloc[i:i+len(toAdd_SameArtist), :] = toAdd_SameArtist.values
        t.iloc[i+len(toAdd_SameArtist):i+len(toAdd_SameArtist)+len(toAdd_DiffArtist), :] = toAdd_DiffArtist.values
        
        i += len(toAdd_SameArtist) + len(toAdd_DiffArtist)
        j += 1
        if j%100==0:
            print('finished %s of %s artists'%(j, len(artists)))

    print('make pairs completed')
    t = t[~t.image2.isin([np.nan, 0])]
    return t[t.image1 > t.image2]

## Prep Image List

In [4]:
# def prepImageList(image_info, isTest):
#     """given the train_image_info or submission_info, returns a dataframe with a single column containing filenames of images"""
#     if isTest:
#         images = list(set(list(image_info.image1.unique()) + list(image_info.image2.unique())))
#         result = pd.DataFrame(np.array(images).reshape((-1, 1)), columns = ['filename'])
#     else:
#         result = pd.DataFrame(columns = ['filename'], data = image_info['filename'] )
    
#     return result

## Get Features Parent

In [5]:
def getFeaturesParent(isTest):
    """Creates features for training and test images. This function utilizes multiprocessing.
    Args:
        isTest: bool to fetch training or test data
    Returns:
        pandas DataFrame containing features
    """
    
    num_cores = multiprocessing.cpu_count() - 5
    
    argsList = []
    
    for jobNum in range(num_cores):
        argsList.append((isTest, jobNum, num_cores))
        

    print('Launching %s jobs' % (num_cores))
    startTime = time.time()
    
    pool = multiprocessing.Pool(num_cores)
    image_features_list = pool.starmap(getFeaturesWorker, argsList)
    pool.close()
    pool.join()
    
    image_features = pd.concat(image_features_list)
    
    endTime = time.time()
    
    print("collecting features complete, time taken = %.2f minutes" % ((endTime - startTime) / 60.0))
    return image_features

## Get Features Worker

In [6]:
def getFeaturesWorker(isTest, jobNum, totalJobs):
    """Child function for computing image features, only to be called by getFeaturesParent
    Args:
        isTest: whether to compute features for test or training images
        jobNum: which job number this is
        totalJobs: total number of jobs
    Returns:
        pandas dataframe containing a data for a fraction of the training or test images
    """
    if isTest:
        mydir = r'/data/test_data/test'
        info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col = 0)
    else:
        mydir = r'/data/training_data/train'
        info = pd.read_csv(r'/data/training_data/train_info.csv', index_col = 0)
    
    info = info.iloc[40000:]
    
    totalNumImages = len(info)
    
    chunkSize = np.int(totalNumImages/totalJobs)
    
    if jobNum == totalJobs - 1:
        startInd = jobNum * chunkSize
        endInd = totalNumImages
    else:
        startInd = jobNum * chunkSize
        endInd = (jobNum + 1) * chunkSize
        
    info = info.iloc[startInd:endInd]
    
    info['pixelsx'] = np.nan
    info['pixelsy'] = np.nan
    info['size_bytes'] = np.nan
    
    info['r_mean'] = np.nan
    info['r_med'] = np.nan
    info['r_std'] = np.nan
    
    info['g_mean'] = np.nan
    info['g_med'] = np.nan
    info['g_std'] = np.nan
    
    info['b_mean'] = np.nan
    info['b_med'] = np.nan
    info['b_std'] = np.nan
    
    info['h_mean'] = np.nan
    info['h_var'] = np.nan
    
    info['s_mean'] = np.nan
    info['s_std'] = np.nan
    info['s_med'] = np.nan
    
    info['v_mean'] = np.nan
    info['v_std'] = np.nan
    info['v_med'] = np.nan
    
    info['is_grayscale'] = np.nan
      
    print('Job %s, starting getting image info for images %s-%s' % (jobNum, startInd, endInd-1))
    startTime = time.clock()
    
    for ind, i in enumerate(info.index.values):
        try:       
            #im = Image.open(mydir+'/'+info.loc[i, 'filename'])
            #info.loc[i, 'pixelsx'], info.loc[i, 'pixelsy'] = im.size
            
            im = skimage.data.imread(mydir + '/' + info.loc[i, 'filename'])
                
            info.loc[i, 'pixelsx'] = im.shape[1]
            info.loc[i, 'pixelsy'] = im.shape[0]
            
            grayscale = (len(im.shape) == 2)
            
            if grayscale:
                info.loc[i, 'r_mean'] = im.mean()
                info.loc[i, 'g_mean'] = info.loc[i, 'b_mean'] = info.loc[i, 'r_mean']
                
                info.loc[i, 'r_med'] = np.median(im)
                info.loc[i, 'g_med'] = info.loc[i, 'b_med'] = info.loc[i, 'r_med']
                
                info.loc[i, 'r_std'] = im.std()
                info.loc[i, 'g_std'] = info.loc[i, 'b_std'] = info.loc[i, 'r_std']
                
                info.loc[i, 'is_grayscale' ] = 1
                
                info.loc[i, 'h_mean'] = 0
                info.loc[i, 'h_var'] = 0
                info.loc[i, 's_mean'] = 0
                info.loc[i, 's_std'] = 0
                info.loc[i, 's_med'] = 0
                info.loc[i, 'v_mean'] = info.loc[i, 'r_mean']/256.0
                info.loc[i, 'v_std'] = info.loc[i, 'r_std']/256.0
                info.loc[i, 'v_med'] = info.loc[i, 'r_med']/256.0
                
            else:
                info.loc[i, 'r_mean'] = im[:,:,0].mean()
                info.loc[i, 'g_mean'] = im[:,:,1].mean()
                info.loc[i, 'b_mean'] = im[:,:,2].mean()
                info.loc[i, 'r_med'] = np.median(im[:,:,0])
                info.loc[i, 'g_med'] = np.median(im[:,:,1])
                info.loc[i, 'b_med'] = np.median(im[:,:,2])
                info.loc[i, 'r_std'] = im[:,:,0].std()
                info.loc[i, 'g_std'] = im[:,:,1].std()
                info.loc[i, 'b_std'] = im[:,:,2].std()
                info.loc[i, 'is_grayscale' ] = 0
                
                #if it is in RGBA, we don't handle it for now
                if (len(im.shape) == 3) and im.shape[2] == 4:
                    print('%s is rgba' % info.loc[i, 'filename'])
                else:     
                    # convert image to hue/saturation/value
                    hsvImage = skimage.color.rgb2hsv(im)
                    angles = hsvImage[:,:,0] * 2.0 * np.pi

                    # average hue is converting the (0-1) hue value to unit vector coordinates
                    # and finding the average direction
                    sinSum = np.sin(angles).sum()
                    cosSum = np.cos(angles).sum()
                    info.loc[i, 'h_mean'] = np.arctan(sinSum/cosSum)

                    # use the variance formula for a circulator distribution
                    R2 = np.power(sinSum, 2) + np.power(cosSum, 2)
                    numPixels = info.loc[i, 'pixelsx'] * info.loc[i, 'pixelsy']
                    R_bar = np.sqrt(R2)/numPixels
                    info.loc[i, 'h_var'] = 1 - R_bar

                    info.loc[i, 's_mean'] = hsvImage[:,:,1].mean()
                    info.loc[i, 's_std'] = np.median(hsvImage[:,:,1])
                    info.loc[i, 's_med'] = hsvImage[:,:,1].std()
                    info.loc[i, 'v_mean'] = hsvImage[:,:,2].mean()
                    info.loc[i, 'v_std'] = np.median(hsvImage[:,:,2])
                    info.loc[i, 'v_med'] = hsvImage[:,:,2].std()
                
            #im = cv2.imread(dir+'/'+info.loc[i, 'new_filename'])
            #info.loc[i, 'pixelsx'], info.loc[i, 'pixelsy'] = im.shape[0:2]
            info.loc[i, 'size_bytes'] = os.path.getsize(mydir+'/'+info.loc[i, 'filename']) 
            if (ind+1)%100==0:
                currentTime = time.clock()
                print('Job %s, finished %s of %s, total time = %.2f min' %
                     (jobNum, (ind+1), len(info), (currentTime - startTime)/60.0))
        except:
            print('job %s - error in %s' % (jobNum, mydir+'/'+info.loc[i, 'filename']))
            traceback.print_exc()
    
    currentTime = time.clock()
    print('- Job %s, finished getting image info, total time = %.2f min' % ( jobNum, (currentTime - startTime) / 60.0))
    
    return info

    #return info.rename(columns={'filename' : 'new_filename'})

### load image info

In [7]:
#load training info
train_info = pd.read_csv(r'/data/training_data/train_info.csv', index_col=0)
submission_info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col=0)

#shuffle and save info
#train_info = train_info.iloc[np.random.permutation(len(train_info))]
#submission_info = submission_info.iloc[np.random.permutation(len(submission_info))]

#train_info.to_csv(r'/data/training_data/train_info.csv')
#submission_info.to_csv(r'/data/test_data/submission_info.csv')

In [9]:
len(train_info)

NameError: name 'train_info' is not defined

### create submission image info from submission pairs

In [6]:
# submission image data is a bunch of images pairs, but we may want to work with a list of test images instead

#submission_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')
#images = list(set(list(submission_info.image1.unique()) + list(submission_info.image2.unique())))
#submission_info = pd.DataFrame(data=images, columns=['filename'])
#submission_info = pd.read_csv(r'/data/test_data/submission_info.csv', index_col = 0)

### make training pairs

In [None]:
#make training pairs
#train_pairs = make_pairs(train_image_info)
#train_pairs[ 'sameArtist' ] = train_pairs[ 'artist1' ] == train_pairs[ 'artist2' ]

### save training pairs

In [None]:
#save as csv
#train_pairs.to_csv(r'/data/training_data/train_pairs.csv')

### load pairs

In [8]:
#load pairs
train_pairs = pd.read_csv(r'/data/training_data/train_pairs.csv', index_col = 0)
#submission_pairs = pd.read_csv(r'/data/test_data/submission_pairs.csv')

### shuffle pairs, reduce number if necessary

In [10]:
train_pairs = train_pairs.iloc[np.random.permutation(len(train_pairs))]
train_pairs = train_pairs.iloc[0:100]

### compute features

In [39]:
print('Begin computing features')
startTime = time.time()

train_features = getFeaturesParent(False)

endTime = time.time()
print("Finished computing features, time taken = %.2f min" % ((endTime-startTime)/60.0) )

Begin computing features
Launching 35 jobs
Job 32, starting getting image info for images 36032-37157
Job 18, starting getting image info for images 20268-21393
Job 21, starting getting image info for images 23646-24771
Job 25, starting getting image info for images 28150-29275
Job 34, starting getting image info for images 38284-39432
Job 11, starting getting image info for images 12386-13511
Job 0, starting getting image info for images 0-1125
Job 16, starting getting image info for images 18016-19141
Job 30, starting getting image info for images 33780-34905
Job 23, starting getting image info for images 25898-27023
Job 28, starting getting image info for images 31528-32653
Job 31, starting getting image info for images 34906-36031
Job 26, starting getting image info for images 29276-30401
Job 19, starting getting image info for images 21394-22519
Job 29, starting getting image info for images 32654-33779
Job 13, starting getting image info for images 14638-15763
Job 12, starting ge



Job 33, finished 300 of 1126, total time = 7.46 min
Job 17, finished 300 of 1126, total time = 7.48 min
Job 5, finished 400 of 1126, total time = 7.48 min
Job 3, finished 300 of 1126, total time = 7.48 min
Job 31, finished 300 of 1126, total time = 7.57 min
Job 21, finished 400 of 1126, total time = 7.78 min
47419.jpg is rgba
Job 20, finished 400 of 1126, total time = 7.83 min
90205.jpg is rgba
Job 15, finished 400 of 1126, total time = 7.84 min
Job 29, finished 400 of 1126, total time = 7.87 min
Job 10, finished 400 of 1126, total time = 7.97 min
Job 27, finished 400 of 1126, total time = 7.98 min
54305.jpg is rgba
Job 13, finished 400 of 1126, total time = 8.02 min
Job 32, finished 400 of 1126, total time = 8.04 min
Job 1, finished 400 of 1126, total time = 8.04 min
Job 24, finished 400 of 1126, total time = 8.08 min
Job 9, finished 400 of 1126, total time = 8.19 min
Job 26, finished 400 of 1126, total time = 8.24 min
Job 19, finished 400 of 1126, total time = 8.37 min
Job 18, finish

Traceback (most recent call last):
  File "<ipython-input-38-2a43c68ab209>", line 71, in getFeaturesWorker
    info.loc[i, 'pixelsx'] = im.shape[1]
IndexError: tuple index out of range


Job 20, finished 600 of 1126, total time = 12.55 min
Job 27, finished 600 of 1126, total time = 12.67 min
Job 26, finished 600 of 1126, total time = 12.69 min
Job 23, finished 600 of 1126, total time = 12.79 min
Job 31, finished 500 of 1126, total time = 12.84 min
Job 25, finished 700 of 1126, total time = 12.83 min
Job 16, finished 600 of 1126, total time = 12.96 min
Job 8, finished 700 of 1126, total time = 12.97 min
Job 30, finished 600 of 1126, total time = 13.21 min
Job 11, finished 600 of 1126, total time = 13.23 min
69691.jpg is rgba
Job 3, finished 600 of 1126, total time = 13.41 min
Job 15, finished 700 of 1126, total time = 13.42 min
Job 18, finished 600 of 1126, total time = 13.45 min
Job 5, finished 700 of 1126, total time = 13.60 min
Job 6, finished 600 of 1126, total time = 13.61 min
Job 21, finished 700 of 1126, total time = 13.71 min
Job 29, finished 700 of 1126, total time = 13.73 min
Job 34, finished 600 of 1149, total time = 13.84 min
Job 4, finished 800 of 1126, tot

Traceback (most recent call last):
  File "<ipython-input-38-2a43c68ab209>", line 71, in getFeaturesWorker
    info.loc[i, 'pixelsx'] = im.shape[1]
IndexError: tuple index out of range


Job 30, finished 800 of 1126, total time = 17.13 min
Job 14, finished 700 of 1126, total time = 17.17 min
Job 31, finished 700 of 1126, total time = 17.23 min
Job 18, finished 800 of 1126, total time = 17.35 min
Job 23, finished 800 of 1126, total time = 17.37 min
Job 20, finished 800 of 1126, total time = 17.37 min
Job 15, finished 900 of 1126, total time = 17.46 min
Job 8, finished 900 of 1126, total time = 17.52 min
Job 21, finished 900 of 1126, total time = 17.61 min
Job 4, finished 1000 of 1126, total time = 17.63 min
Job 17, finished 800 of 1126, total time = 17.84 min
Job 29, finished 900 of 1126, total time = 17.85 min
Job 6, finished 800 of 1126, total time = 17.92 min
Job 12, finished 800 of 1126, total time = 17.94 min
34022.jpg is rgba
Job 33, finished 800 of 1126, total time = 17.97 min
Job 19, finished 900 of 1126, total time = 17.99 min
Job 11, finished 800 of 1126, total time = 18.01 min
Job 13, finished 800 of 1126, total time = 18.03 min
Job 0, finished 900 of 1126, t

Traceback (most recent call last):
  File "<ipython-input-38-2a43c68ab209>", line 71, in getFeaturesWorker
    info.loc[i, 'pixelsx'] = im.shape[1]
IndexError: tuple index out of range


Job 28, finished 1100 of 1126, total time = 21.09 min
Job 0, finished 1100 of 1126, total time = 21.15 min
Job 22, finished 900 of 1126, total time = 21.24 min
Job 30, finished 1000 of 1126, total time = 21.27 min
Job 18, finished 1000 of 1126, total time = 21.26 min
Job 14, finished 900 of 1126, total time = 21.29 min
- Job 28, finished getting image info, total time = 21.32 min
Job 20, finished 1000 of 1126, total time = 21.34 min
- Job 7, finished getting image info, total time = 21.36 min
Job 16, finished 1000 of 1126, total time = 21.39 min
Job 23, finished 1000 of 1126, total time = 21.49 min
Job 27, finished 1100 of 1126, total time = 21.50 min
Job 8, finished 1100 of 1126, total time = 21.51 min
Job 24, finished 1100 of 1126, total time = 21.60 min
Job 15, finished 1100 of 1126, total time = 21.60 min
- Job 0, finished getting image info, total time = 21.72 min
Job 26, finished 1000 of 1126, total time = 21.82 min
- Job 8, finished getting image info, total time = 21.84 min
- J

Traceback (most recent call last):
  File "<ipython-input-38-2a43c68ab209>", line 71, in getFeaturesWorker
    info.loc[i, 'pixelsx'] = im.shape[1]
IndexError: tuple index out of range


- Job 9, finished getting image info, total time = 22.75 min
Job 20, finished 1100 of 1126, total time = 22.78 min
Job 13, finished 1100 of 1126, total time = 22.85 min
- Job 16, finished getting image info, total time = 22.86 min
Job 6, finished 1000 of 1126, total time = 22.86 min
Job 23, finished 1100 of 1126, total time = 22.89 min
- Job 13, finished getting image info, total time = 23.01 min
- Job 32, finished getting image info, total time = 23.06 min
Job 22, finished 1000 of 1126, total time = 23.05 min
- Job 20, finished getting image info, total time = 23.04 min
Job 26, finished 1100 of 1126, total time = 23.09 min
Job 18, finished 1100 of 1126, total time = 23.10 min
Job 17, finished 1100 of 1126, total time = 23.12 min
Job 14, finished 1000 of 1126, total time = 23.19 min
Job 30, finished 1100 of 1126, total time = 23.23 min
- Job 23, finished getting image info, total time = 23.25 min
- Job 26, finished getting image info, total time = 23.26 min
Job 33, finished 1100 of 112

In [42]:
len(train_features)

39433

### save features

In [None]:
#test_features_sorted.to_csv(r'/data/test_data/test_features.csv')
#test_features_sorted.to_csv(r'/data/test_data/test_features.csv')
#y = pd.read_csv(r'/data/test_data/test_features.csv', index_col=0)
#test_features_sorted = test_features_sorted.drop('Unnamed: 0',1)

In [43]:
train_features.to_csv(r'/data/training_data/train_features_40000_end.csv')
#test_features_sorted.to_csv(r'/data/test_data/test_features.csv')
#y = pd.read_csv(r'/data/test_data/test_features.csv', index_col=0)
#test_features_sorted = test_features_sorted.drop('Unnamed: 0',1)

### load features

In [9]:
#load features
train_features_0 = pd.read_csv(r'/data/training_data/train_features_0_20000.csv', index_col = 0)
train_features_1 = pd.read_csv(r'/data/training_data/train_features_20000_40000.csv', index_col = 0)
train_features_2 = pd.read_csv(r'/data/training_data/train_features_40000_end.csv', index_col = 0)

In [13]:
train_features = pd.concat([train_features_0, train_features_1, train_features_2])

In [16]:
pd.isnull(train_features).sum()

filename            0
artist              0
title              13
style             765
genre             893
date            20255
pixelsx             7
pixelsy             7
size_bytes         21
r_mean              7
r_med               7
r_std               7
g_mean              7
g_med               7
g_std               7
b_mean              7
b_med               7
b_std               7
h_mean             52
h_var              52
s_mean             52
s_std              52
s_med              52
v_mean             52
v_std              52
v_med              52
is_grayscale        7
dtype: int64

### additional processing on feature

In [19]:
# saved features are straight from the feature functions with no handling of nulls, etc
# these have to be addressed prior to training/predicting

rgb_features = ['r_mean', 'r_med', 'r_std', 'g_mean', 'g_med',
                'g_std', 'b_mean', 'b_med', 'b_std',]

size_features =  [ 'pixelsx',
       'pixelsy', 'size_bytes' ]


# take out unnecessary columns
feature_names = ['pixelsx', 'pixelsy', 'size_bytes', 
                 'r_mean', 'r_med', 'r_std',
                 'g_mean', 'g_med', 'g_std',
                 'b_mean', 'b_med', 'b_std',
                 'h_mean', 'h_var',
                 's_mean', 's_med', 's_std',
                 'v_mean', 'v_med', 'v_std',
                 'is_grayscale']

train_features = train_features[ ['filename'] + feature_names ]

### Join training features to training pairs

In [20]:
#join pair data to image features
image_feature_names = ['pixelsx', 'pixelsy', 'size_bytes', 
                 'r_mean', 'r_med', 'r_std',
                 'g_mean', 'g_med', 'g_std',
                 'b_mean', 'b_med', 'b_std',
                 'h_mean', 'h_var',
                 's_mean', 's_med', 's_std',
                 'v_mean', 'v_med', 'v_std', 
                 'is_grayscale' ]

col_dict_1 = {}
col_dict_2 = {}

for feature in feature_names:
    col_dict_1[feature] = '%s_1' % feature
    col_dict_2[feature] = '%s_2' % feature

train_pairs = train_pairs.merge(train_features,
                                left_on='image1', right_on='filename')
train_pairs.rename( columns = col_dict_1,
                      inplace=True)
train_pairs = train_pairs.merge(train_features,
                                left_on='image2', right_on='filename')
train_pairs.rename( columns = col_dict_2,
                      inplace=True)

### remove nulls

In [22]:
# we remove the nulls after the join, could also be done before
train_pairs = train_pairs[~train_pairs['pixelsx_1'].isnull()]
train_pairs = train_pairs[~train_pairs['pixelsx_2'].isnull()]
train_pairs = train_pairs[~train_pairs['size_bytes_1'].isnull()]
train_pairs = train_pairs[~train_pairs['size_bytes_2'].isnull()]
train_pairs = train_pairs[~train_pairs['h_mean_1'].isnull()]
train_pairs = train_pairs[~train_pairs['h_mean_2'].isnull()]


print(train_pairs.isnull().sum())

artist1           0
image1            0
artist2           0
image2            0
sameArtist        0
filename_x        0
pixelsx_1         0
pixelsy_1         0
size_bytes_1      0
r_mean_1          0
r_med_1           0
r_std_1           0
g_mean_1          0
g_med_1           0
g_std_1           0
b_mean_1          0
b_med_1           0
b_std_1           0
h_mean_1          0
h_var_1           0
s_mean_1          0
s_med_1           0
s_std_1           0
v_mean_1          0
v_med_1           0
v_std_1           0
is_grayscale_1    0
filename_y        0
pixelsx_2         0
pixelsy_2         0
size_bytes_2      0
r_mean_2          0
r_med_2           0
r_std_2           0
g_mean_2          0
g_med_2           0
g_std_2           0
b_mean_2          0
b_med_2           0
b_std_2           0
h_mean_2          0
h_var_2           0
s_mean_2          0
s_med_2           0
s_std_2           0
v_mean_2          0
v_med_2           0
v_std_2           0
is_grayscale_2    0
dtype: int64


### results helper

In [23]:
def computePredictStats(y_prob, y_true, threshold = 0.5):
    """ compute accuracy, precision, recall, negative precision, specificity, and auc roc
        Args:
            y_prob: array of floats from 0.0 - 1.0
            y_true: array of booleans
            threshold: true/false threshold value, between 0.0-1.0
        Returns:
            dict of classification metrics
    """
    # y_pred = np.array([True, True, False, False])
    # y_true = np.array([True, True, True, True])
    y_pred = y_prob > threshold
    
    total = len(y_prob)
    true_pos = sum( (y_pred == True) & (y_true == True) )
    true_neg = sum( (y_pred == False) & (y_true == False) )
    false_pos = sum( (y_pred == True) & (y_true == False) )
    false_neg = sum( (y_pred == False ) & (y_true == True) )
    
    accuracy = (true_pos + true_neg) / total
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    npp = true_neg / (true_neg + false_neg) #negative prediction value
    specificity = true_neg / (true_neg + false_pos)
    roc = roc_auc_score(y_true, y_prob)
    
    return { 'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'npp': npp,
            'specificity': specificity,
            'roc': roc,
            'true_pos': true_pos,
            'true_neg': true_neg,
            'false_pos': false_pos,
            'false_neg': false_neg,
           }

### split data into X and Y

In [64]:
train_pairs = train_pairs.iloc[np.random.permutation(len(train_pairs))]

# get list of X columns, they are ones with '_1' or '_2' in the name
allCols = train_pairs.columns
isX = list(map(lambda x: ('_1' in x) or ('_2' in x), allCols))
X_columns = allCols[isX]

# split X and Y data
train_X = train_pairs[X_columns]
train_Y = train_pairs['sameArtist']

### k-fold CV

In [68]:
def kFoldCV(train_X, train_Y, k, numFoldsToTest):
    """ Perform k-fold cross validation. Will modify train_pairs (shuffle rows)
        Args:
            train_pairs: Dataframe containing training image pairs with features. Should have no nulls.
            k: k, at least 2
            numFoldsToTest: how many folds to actually test, at most k
        Returns:
            dataframe with CV results
    """

    numFoldsToTest = min(k, numFoldsToTest)
    k = max(2, k)
    
    # shuffle rows
    #train_pairs = train_pairs.iloc[np.random.permutation(len(train_pairs))]

    # get list of X columns, they are ones with '_1' or '_2' in the name
    #allCols = train_pairs.columns
    #isX = list(map(lambda x: ('_1' in x) or ('_2' in x), allCols))
    #X_columns = allCols[isX]
    
    # split X and Y data
    #CV_X = train_pairs[X_columns]
    #CV_Y = train_pairs['sameArtist']
    
    # define which indices belong to each fold
    foldLocsList = [] #list of Index objects, one for each fold
    
    foldSize = int(len(train_X)/5)
        
    for foldNum in range(k):
        if foldNum == k-1:
            foldLocsList.append( train_X.index[foldNum*foldSize : len(train_X)] )
        else:
            foldLocsList.append( train_X.index[foldNum*foldSize : (foldNum+1)*(foldSize) ] )
    
    # set up dataframe for collecting results
    columnsList = list(itertools.product(('train', 'test'), ('roc', 'precision', 'recall', 'npp', 'specificity')))
    results = pd.DataFrame(index = range(numFoldsToTest), columns = pd.MultiIndex.from_tuples(columnsList))
    
    # test each fold
    for testNum in range(numFoldsToTest):

        # indices of training data
        trainLocs = pd.Index([])
        for foldNum in range(k):
            if foldNum != testNum:
                trainLocs = trainLocs.append( foldLocsList[foldNum] )
        
        # indices of test data
        testLocs = foldLocsList[testNum]

        print(trainLocs)
        # set up Xs
        CV_train_X = train_X.loc[trainLocs]
        CV_test_X = train_X.loc[testLocs]

        # set up Ys
        CV_train_Y = train_Y.loc[trainLocs]
        CV_test_Y = train_Y.loc[testLocs]
        
        # fit model
        clf = RandomForestClassifier(n_estimators=60, min_samples_split=5, max_depth=10, n_jobs=1)

        start = time.time()

        print('starting fit')

        clf.fit(CV_train_X, CV_train_Y)

        end = time.time()
                               
        print('total training time: %s' % (end - start) )

        # get in-sample and out-of-sample results
                               
        pred_train = clf.predict_proba(CV_train_X)[:,1]
        train_results = computePredictStats( pred_train, CV_train_Y)

        pred_test = clf.predict_proba(CV_test_X)[:,1]
        test_results = computePredictStats( pred_test, CV_test_Y)
       
        for stat in ('roc', 'precision', 'recall', 'npp', 'specificity'):
            results.loc[testNum, ('train', stat)] = train_results[stat]
            results.loc[testNum, ('test', stat)] = test_results[stat]
        
    return results              

### run k-fold cv

In [None]:
k = 5
numFoldsToTest = 1
temp_X = train_X.iloc[0:3000000].copy()
temp_Y = train_Y.iloc[0:3000000].copy()

results = kFoldCV(temp_X, temp_Y, k, numFoldsToTest)

print(results)

Int64Index([2979102, 4628207,  759591, 2974870, 5156457, 7081573, 4926355,
            4024620, 4631563, 6404482,
            ...
            7744472, 7694105, 4488323, 4104087, 8942494, 4501288, 3584866,
            6234599, 7529968, 1365379],
           dtype='int64', length=2400000)
starting fit


### full training

In [18]:
clf = RandomForestClassifier(n_estimators=100, min_samples_split=50, max_depth=8 )

start = time.clock()

print('starting fit')
#excluding the patient_id column from the fit and prediction
clf.fit(train_X, train_Y)

end = time.clock()

print('total training time: %s' % (end - start) )

columnsList = list(itertools.product(('train', 'test'), ('roc', 'precision', 'recall', 'npp', 'specificity')))
results = pd.DataFrame(index = range(numFoldsToTest), columns = pd.MultiIndex.from_tuples(columnsList))


starting fit
total training time: 917.0945500000003


### save model

In [26]:
##save model

start = time.clock()

with open('my_dumped_classifier.pkl', 'wb') as fid:
    pickle.dump(clf, fid) 

end = time.clock()
print('total saving time: %s' % (end - start) )

total saving time: 0.012914999999338761


### load model

In [28]:
start = time.clock()

# load it again
with open('my_dumped_classifier.pkl', 'rb') as fid:
    clf = pickle.load(fid)

end = time.clock()
print('total loading time: %s' % (end - start) )



total loading time: 0.007420999999339983


In [53]:
#load test set definitions
test_pairs = pd.read_csv(r'/data/test_data/submission_info.csv', index_col= 0)

#get features
#raw_test_image_info = get_image_info(test_pairs, r'/data/test_data/test')

#get raw training data features
#raw_test_image_info['bytes_per_pixel'] =raw_test_image_info['size_bytes']/(raw_test_image_info['pixelsx']*raw_test_image_info['pixelsy'])
#raw_test_image_info['aspect_ratio'] = raw_test_image_info['pixelsx']/raw_test_image_info['pixelsy']

#save raw_train_image_info
#raw_test_image_info.to_csv(r'/data/test_data/raw_test_image_info.csv')

#load raw_train_image_info
#raw_test_image_info = pd.read_csv(r'/data/test_data/raw_test_image_info.csv', index_col = 0)

## join test features to test pairs

In [51]:
def joinFeaturesToPairs(image_info, image_pairs):
    #join pair data to image features
    trimmed_image_info = image_info[['new_filename',
                                     'pixelsx',
                                     'pixelsy',
                                     'bytes_per_pixel',
                                     'aspect_ratio']]
    image_pairs = image_pairs.merge(trimmed_image_info,
                              left_on='image1', right_on='new_filename')
    image_pairs.rename( columns = {'pixelsx': 'pixelsx_1',
                        'pixelsy': 'pixelsy_1',
                        'bytes_per_pixel' : 'bytes_per_pixel_1',
                        'aspect_ratio':'aspect_ratio_1'})
    image_pairs = image_pairs.merge(trimmed_image_info,
                                    left_on='image2', right_on='new_filename')
    train_pairs.rename( columns = {'pixelsx': 'pixelsx_2',
                        'pixelsy': 'pixelsy_2',
                        'bytes_per_pixel' : 'bytes_per_pixel_2',
                        'aspect_ratio':'aspect_ratio_2'})
    return train_pairs

test_pairs_final = joinFeaturesToPairs(raw_test_image_info, test_pairs)

## test on test set

In [24]:
test_predictions = clf.pred(test_pairs_final)

['mymodel.pkl',
 'mymodel.pkl_01.npy',
 'mymodel.pkl_02.npy',
 'mymodel.pkl_03.npy',
 'mymodel.pkl_04.npy',
 'mymodel.pkl_05.npy',
 'mymodel.pkl_06.npy',
 'mymodel.pkl_07.npy',
 'mymodel.pkl_08.npy',
 'mymodel.pkl_09.npy',
 'mymodel.pkl_10.npy',
 'mymodel.pkl_11.npy',
 'mymodel.pkl_12.npy',
 'mymodel.pkl_13.npy',
 'mymodel.pkl_14.npy',
 'mymodel.pkl_15.npy',
 'mymodel.pkl_16.npy',
 'mymodel.pkl_17.npy',
 'mymodel.pkl_18.npy',
 'mymodel.pkl_19.npy',
 'mymodel.pkl_20.npy',
 'mymodel.pkl_21.npy',
 'mymodel.pkl_22.npy',
 'mymodel.pkl_23.npy',
 'mymodel.pkl_24.npy',
 'mymodel.pkl_25.npy',
 'mymodel.pkl_26.npy',
 'mymodel.pkl_27.npy',
 'mymodel.pkl_28.npy',
 'mymodel.pkl_29.npy',
 'mymodel.pkl_30.npy',
 'mymodel.pkl_31.npy',
 'mymodel.pkl_32.npy',
 'mymodel.pkl_33.npy',
 'mymodel.pkl_34.npy',
 'mymodel.pkl_35.npy',
 'mymodel.pkl_36.npy',
 'mymodel.pkl_37.npy',
 'mymodel.pkl_38.npy',
 'mymodel.pkl_39.npy',
 'mymodel.pkl_40.npy',
 'mymodel.pkl_41.npy',
 'mymodel.pkl_42.npy',
 'mymodel.pkl_43.n

 ## prepare submission
    

In [None]:
submission = submission_info[['index']]
submission['sameArtist'] = y_pred
submission.to_csv('submission.csv', index=False)