In [1]:
import os
import random
import numpy as np
import pandas as pd
from PIL import Image
#from matplotlib import pyplot as plt

#import networkx as nx
#from networkx.utils import cuthill_mckee_ordering

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score  

## make pairs

In [2]:
def make_pairs(train_info):
    """Build same-artist and different-artist image pairs.

    For every artist two groups of rows are produced:

    * pairs of two images by that artist, and
    * pairs of one image by that artist with a randomly drawn image by any
      other artist.

    Each group is capped at ``numExamples`` rows, where ``numExamples`` is the
    smaller of (number of the artist's images squared) and (number of images
    by other artists).

    Parameters
    ----------
    train_info : pandas.DataFrame
        Must contain the columns ``artist`` and ``filename``.

    Returns
    -------
    pandas.DataFrame
        Columns ``artist1``, ``image1``, ``artist2``, ``image2``. Rows whose
        ``image2`` is NaN or 0 are dropped, and only rows with
        ``image1 > image2`` are kept, which removes self-pairs and one of the
        two orderings of every pair.

    Notes
    -----
    The different-artist partners are drawn with ``np.random``; seed it before
    calling if the output has to be reproducible.
    """
    columns = ['artist1', 'image1', 'artist2', 'image2']
    artists = train_info.artist.unique()

    # Seed with an empty block so the final concatenate also works when
    # train_info has no rows at all.
    blocks = [np.empty((0, 4), dtype=object)]

    for j, artist in enumerate(artists, start=1):
        isArtist = (train_info.artist == artist).values

        # artistInfo is an Ax2 matrix of (artist, filename)
        artistInfo = train_info.loc[isArtist, ['artist', 'filename']].values
        # otherInfo holds the same two columns for every other artist
        otherInfo = train_info.loc[~isArtist, ['artist', 'filename']].values

        numOwn = len(artistInfo)
        numExamples = min(numOwn ** 2, len(otherInfo))

        # Draw partners by position rather than by index label, so duplicate
        # index labels in train_info cannot inflate the selection.
        picks = np.random.permutation(len(otherInfo))[:numExamples]
        diffArtistInfo = otherInfo[picks]

        # ownGrid cycles through the artist's images A times; pairing it with
        # a row-wise repeat enumerates every ordered (image1, image2) combo.
        ownGrid = np.tile(artistInfo, (numOwn, 1))
        sameArtist = np.concatenate(
            [np.repeat(artistInfo, numOwn, axis=0), ownGrid], axis=1)[:numExamples]
        diffArtist = np.concatenate(
            [ownGrid[:numExamples], diffArtistInfo], axis=1)

        blocks.append(sameArtist)
        blocks.append(diffArtist)

        if j % 100 == 0:
            print('finished %s of %s artists'%(j, len(artists)))

    print('make pairs completed')
    t = pd.DataFrame(np.concatenate(blocks, axis=0), columns=columns)
    t = t[~t.image2.isin([np.nan, 0])]
    return t[t.image1 > t.image2]

## Get Image Info

In [90]:
def get_image_info(test_info, mydir):
    """Attach pixel dimensions and file size to a table of image files.

    Parameters
    ----------
    test_info : pandas.DataFrame
        When ``mydir`` is ``'/data/test_data/test'`` this is read as a pair
        table with ``image1`` and ``image2`` columns, and a new one-column
        frame of the distinct filenames is built from it. For any other
        ``mydir`` it must already contain a ``filename`` column and is used
        as is (a copy is taken; the caller's frame is not modified).
    mydir : str
        Directory that contains the image files.

    Returns
    -------
    pandas.DataFrame
        The image table with float columns ``pixelsx``, ``pixelsy`` and
        ``size_bytes`` added and ``filename`` renamed to ``new_filename``.
        Images that could not be read keep NaN in the columns that were not
        filled; their path is printed.
    """
    if mydir == r'/data/test_data/test':
        images = list(set(list(test_info.image1.unique()) + list(test_info.image2.unique())))
        info = pd.DataFrame(np.array(images).reshape((-1, 1)), columns = ['filename'])
    else:
        # Work on a copy so the new columns do not leak into the caller's frame.
        info = test_info.copy()

    numImages = len(info)
    # Float buffers filled by position: one column assignment at the end
    # instead of three label-based writes per image.
    pixelsx = np.full(numImages, np.nan)
    pixelsy = np.full(numImages, np.nan)
    size_bytes = np.full(numImages, np.nan)

    for ind, filename in enumerate(info['filename'].values):
        path = '%s/%s' % (mydir, filename)
        try:
            # The context manager closes the file handle right after the
            # size is read.
            with Image.open(path) as im:
                pixelsx[ind], pixelsy[ind] = im.size
            size_bytes[ind] = os.path.getsize(path)
            if ind%10000==0:
                print('finished %s of %s'%(ind, numImages))
        except Exception:
            # Best effort: report the unreadable file and carry on, but let
            # KeyboardInterrupt / SystemExit propagate.
            print(path)

    info['pixelsx'] = pixelsx
    info['pixelsy'] = pixelsy
    info['size_bytes'] = size_bytes

    return info.rename(columns={'filename' : 'new_filename'})

## Prep Data

In [4]:
#x_train, y_train, x_cv, y_cv = prep_data([train_info, None], 'cv')    
#x_test, y_test = prep_data([None, submission_info], 'test')    
def prep_data(input_info, split):
    """Turn image metadata into feature matrices for the pair classifier.

    Parameters
    ----------
    input_info : sequence of length 2
        ``input_info[0]`` is the per-image training table (needs ``artist``
        and ``filename``); it is only read when ``split == 'cv'``.
        ``input_info[1]`` is a table of image pairs (needs ``image1`` and
        ``image2``); it is only read when ``split == 'test'``.
    split : {'cv', 'test'}
        ``'cv'`` puts a random 80% of the artists into the training part and
        the remaining 20% into the held-out part, and builds pairs for each
        with ``make_pairs``. ``'test'`` featurises the supplied pairs.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)`` for ``'cv'`` and
        ``(x_test, y_test)`` for ``'test'``. The x arrays have ten columns:
        pixelsx, pixelsy, size_bytes, bytes_per_pixel and aspect_ratio of
        image1 followed by the same five for image2. ``y_test`` is ``None``
        when the pair table carries no ``artist1`` column.

    Raises
    ------
    ValueError
        If ``split`` is neither ``'cv'`` nor ``'test'``.
    """
    feature_names = ['pixelsx', 'pixelsy', 'size_bytes', 'bytes_per_pixel', 'aspect_ratio']
    orig_info = input_info[0]
    data = input_info[1]

    if split == 'cv':
        info = get_image_info(orig_info, r'/data/training_data/train')
    elif split == 'test':
        info = get_image_info(data, r'/data/test_data/test')
    else:
        raise ValueError("split must be 'cv' or 'test', got %r" % (split,))

    info['bytes_per_pixel'] = 1.0*info['size_bytes']/(info['pixelsx']*info['pixelsy'])
    info['aspect_ratio'] = 1.0*info['pixelsx']/info['pixelsy']

    if split == 'cv':
        # The artist list has to come from `info`, which only exists after
        # get_image_info has run.
        artists = np.random.permutation(info.artist.unique())
        cut = int(0.8*len(artists))
        train_artists = artists[:cut]
        test_artists = artists[cut:]

        # get_image_info renames 'filename' to 'new_filename', while
        # make_pairs reads 'filename'; hand it the name it expects.
        pair_source = info[['artist', 'new_filename']].rename(columns={'new_filename': 'filename'})
        train = make_pairs(pair_source[pair_source.artist.isin(train_artists)])
        test = make_pairs(pair_source[pair_source.artist.isin(test_artists)])

        # assign() returns new frames, so the filtered results of make_pairs
        # are never written to in place.
        data = pd.concat([train.assign(in_train=True), test.assign(in_train=False)])
        data['sameArtist'] = data['artist1'] == data['artist2']
    else:
        # Copy so the helper columns do not end up in the caller's frame.
        data = data.copy()
        data['in_train'] = False
        if 'artist1' in data.columns:
            data['sameArtist'] = data['artist1'] == data['artist2']

    # Join the per-image features twice: once for image1, once for image2.
    # The second join hits a name clash, which is what produces the _x
    # (image1) and _y (image2) suffixes used below.
    image_features = info[['new_filename'] + feature_names]
    data2 = data
    for image_col in ('image1', 'image2'):
        data2 = pd.merge(data2, image_features, how='left',
                         left_on=image_col, right_on='new_filename')
        data2 = data2.drop(columns='new_filename')

    x_columns = [name + suffix for suffix in ('_x', '_y') for name in feature_names]
    in_train = data2.in_train == True
    in_test = data2.in_train == False

    x_train = data2[in_train][x_columns].values
    x_test = data2[in_test][x_columns].values

    if 'artist1' in data.columns:
        y_train = data2[in_train]['sameArtist'].values
        y_test = data2[in_test]['sameArtist'].values
    else:
        y_train = None
        y_test = None

    if split == 'cv':
        return x_train, y_train, x_test, y_test
    return x_test, y_test

## Train classifier

In [5]:
def train_classifier(x_train, y_train, x_cv, y_cv, n_estimators=100, train_stride=5, n_chunks=4):
    """Fit a random forest on a subsample and score another feature matrix.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels. Only every ``train_stride``-th row is
        used for fitting.
    x_cv : numpy.ndarray
        Feature matrix to predict on.
    y_cv : array-like or None
        Labels for ``x_cv``. When given, the ROC AUC of the predictions is
        printed.
    n_estimators : int, optional
        Number of trees in the forest (default 100).
    train_stride : int, optional
        Step of the training subsample (default 5).
    n_chunks : int, optional
        Number of interleaved slices ``x_cv`` is predicted in (default 4).

    Returns
    -------
    (numpy.ndarray, RandomForestClassifier)
        The second column of ``predict_proba`` for every row of ``x_cv``,
        and the fitted classifier.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators)

    #clf = xgb.XGBClassifier(n_estimators=10000, learning_rate=0.025, max_depth=4)
    print('starting fit')
    clf.fit(x_train[::train_stride], y_train[::train_stride])
    print('starting pred')

    y_pred = np.zeros(x_cv.shape[0])
    for i in range(n_chunks):
        chunk = x_cv[i::n_chunks]
        # With fewer rows than chunks some slices are empty; skip them
        # instead of handing an empty array to predict_proba.
        if len(chunk) == 0:
            continue
        y_pred[i::n_chunks] = clf.predict_proba(chunk)[:, 1]

    if y_cv is not None:
        print(roc_auc_score(y_cv, y_pred))

    return y_pred, clf

## Main

In [6]:
def make_submission():
    """Train on the training images and write ``submission.csv``.

    Reads ``train_info.csv`` and ``submission_info.csv`` from ``/data``,
    builds features with ``prep_data``, fits the classifier with
    ``train_classifier`` and writes one ``sameArtist`` score per row of
    ``submission_info`` (keyed by its ``index`` column) to ``submission.csv``
    in the working directory.
    """
    train_info = pd.read_csv(r'/data/training_data/train_info.csv')
    submission_info = pd.read_csv(r'/data/test_data/submission_info.csv')
    print('prepping training and cv data')
    # NOTE(review): x_cv / y_cv are returned here but never scored below;
    # the classifier is evaluated on the submission pairs only.
    x_train, y_train, x_cv, y_cv = prep_data([train_info, None], 'cv')
    print('prepping test data')
    x_test, y_test = prep_data([None, submission_info], 'test')

    print('starting classifier')
    y_pred, clf = train_classifier(x_train, y_train, x_test, y_test)

    # Take an explicit copy: adding a column to the bare column selection is
    # chained assignment and triggers SettingWithCopyWarning.
    submission = submission_info[['index']].copy()
    submission['sameArtist'] = y_pred
    submission.to_csv('submission.csv', index=False)

## main

In [50]:
# Raw metadata for the training images and for the submission pairs.
train_image_info = pd.read_csv(r'/data/training_data/train_info.csv')
submission_info = pd.read_csv(r'/data/test_data/submission_info.csv')

# Training pairs are read from a cached CSV. The cache was produced once with:
#   train_pairs = make_pairs(train_info)
#   train_pairs[ 'sameArtist' ] = train_pairs[ 'artist1' ]== train_pairs[ 'artist2' ]
#   train_pairs.to_csv(r'/data/training_data/train_pairs.csv')
train_pairs = pd.read_csv(r'/data/training_data/train_pairs.csv', index_col = 0)

# Per-image features are cached the same way. The cache was produced with:
#   raw_train_image_info = get_image_info(train_image_info, r'/data/training_data/train')
#   raw_train_image_info['bytes_per_pixel'] =CV_all_image_info['size_bytes']/(CV_all_image_info['pixelsx']*CV_all_image_info['pixelsy'])
#   raw_train_image_info['aspect_ratio'] = CV_all_image_info['pixelsx']/CV_all_image_info['pixelsy']
#   raw_train_image_info.to_csv(r'/data/training_data/raw_train_image_info.csv')
raw_train_image_info = pd.read_csv(r'/data/training_data/raw_train_image_info.csv', index_col = 0)

# Keep only the join key and the features that go into the model.
raw_train_trimmed_image_info = raw_train_image_info[
    ['new_filename', 'pixelsx', 'pixelsy', 'bytes_per_pixel', 'aspect_ratio']
]

# Attach the features of image1 and tag them with the suffix _1 ...
train_pairs = (
    train_pairs
    .merge(raw_train_trimmed_image_info, left_on='image1', right_on='new_filename')
    .rename(columns={name: name + '_1'
                     for name in ('pixelsx', 'pixelsy', 'bytes_per_pixel', 'aspect_ratio')})
)

# ... then do the same for image2 with the suffix _2.
train_pairs = (
    train_pairs
    .merge(raw_train_trimmed_image_info, left_on='image2', right_on='new_filename')
    .rename(columns={name: name + '_2'
                     for name in ('pixelsx', 'pixelsy', 'bytes_per_pixel', 'aspect_ratio')})
)



finished 0 of 79433
finished 500 of 79433
finished 1000 of 79433
finished 1500 of 79433
finished 2000 of 79433
finished 2500 of 79433
finished 3000 of 79433
finished 3500 of 79433
finished 4000 of 79433
finished 4500 of 79433
finished 5000 of 79433
finished 5500 of 79433




finished 6000 of 79433
finished 6500 of 79433
finished 7000 of 79433
finished 7500 of 79433
finished 8000 of 79433
finished 8500 of 79433
finished 9000 of 79433
finished 9500 of 79433
finished 10000 of 79433
finished 10500 of 79433
finished 11000 of 79433
finished 11500 of 79433
finished 12000 of 79433
finished 12500 of 79433
finished 13000 of 79433
finished 13500 of 79433
finished 14000 of 79433
finished 14500 of 79433
finished 15000 of 79433
finished 15500 of 79433
finished 16000 of 79433
finished 16500 of 79433
finished 17000 of 79433
finished 17500 of 79433




finished 18000 of 79433
finished 18500 of 79433
finished 19000 of 79433
finished 19500 of 79433
finished 20000 of 79433
finished 20500 of 79433




finished 21000 of 79433




finished 21500 of 79433
finished 22000 of 79433
finished 22500 of 79433
finished 23000 of 79433




finished 23500 of 79433
finished 24000 of 79433
finished 24500 of 79433
finished 25000 of 79433
finished 25500 of 79433
finished 26000 of 79433




finished 26500 of 79433




finished 27000 of 79433
finished 27500 of 79433
finished 28000 of 79433
finished 28500 of 79433
finished 29000 of 79433
finished 29500 of 79433
finished 30000 of 79433
finished 30500 of 79433
finished 31000 of 79433
finished 31500 of 79433
finished 32000 of 79433
finished 32500 of 79433
/data/training_data/train/42359.jpg
finished 33000 of 79433
finished 33500 of 79433




finished 34000 of 79433
finished 34500 of 79433
finished 35000 of 79433




finished 35500 of 79433




finished 36000 of 79433
finished 36500 of 79433
finished 37000 of 79433
finished 37500 of 79433
finished 38000 of 79433
finished 38500 of 79433
finished 39000 of 79433
finished 39500 of 79433
finished 40000 of 79433
finished 40500 of 79433
finished 41000 of 79433
finished 41500 of 79433
finished 42000 of 79433
finished 42500 of 79433
finished 43000 of 79433
finished 43500 of 79433
finished 44000 of 79433
finished 44500 of 79433
finished 45000 of 79433
finished 45500 of 79433
finished 46000 of 79433
finished 46500 of 79433
finished 47000 of 79433
finished 47500 of 79433
finished 48000 of 79433
finished 48500 of 79433
finished 49000 of 79433
finished 49500 of 79433
finished 50000 of 79433
finished 50500 of 79433
finished 51000 of 79433
finished 51500 of 79433




finished 52000 of 79433
finished 52500 of 79433
finished 53000 of 79433
finished 53500 of 79433
finished 54000 of 79433
finished 54500 of 79433
finished 55000 of 79433
finished 55500 of 79433
finished 56000 of 79433
finished 56500 of 79433
finished 57000 of 79433
finished 57500 of 79433
finished 58000 of 79433
finished 58500 of 79433
finished 59000 of 79433
finished 59500 of 79433
finished 60000 of 79433
finished 60500 of 79433
finished 61000 of 79433
finished 61500 of 79433
finished 62000 of 79433
finished 62500 of 79433
finished 63000 of 79433
finished 63500 of 79433




finished 64000 of 79433
finished 64500 of 79433
finished 65000 of 79433
finished 65500 of 79433
finished 66000 of 79433
finished 66500 of 79433
finished 67000 of 79433
finished 67500 of 79433
finished 68000 of 79433
finished 68500 of 79433
finished 69000 of 79433
finished 69500 of 79433
finished 70000 of 79433
finished 70500 of 79433
finished 71000 of 79433
finished 71500 of 79433
finished 72000 of 79433
finished 72500 of 79433
finished 73000 of 79433
finished 73500 of 79433
finished 74000 of 79433
finished 74500 of 79433
finished 75000 of 79433
finished 75500 of 79433
finished 76000 of 79433
finished 76500 of 79433
finished 77000 of 79433
finished 77500 of 79433
finished 78000 of 79433
finished 78500 of 79433
finished 79000 of 79433


## 80/20 CV

In [103]:




### 80/20 hold-out split of the training pairs

# Feature columns: the four per-image features of image 1, then of image 2.
base_columns = ['pixelsx', 'pixelsy', 'bytes_per_pixel', 'aspect_ratio']
X_columns = [col + suffix for suffix in ('_1', '_2') for col in base_columns]

# Put the rows into random order before cutting.
train_pairs = train_pairs.iloc[np.random.permutation(len(train_pairs))]

# One fold is a fifth of the rows: the first four folds are used for
# training, everything after them (including any remainder) for testing.
foldSize = len(train_pairs) // 5
CV_pairs_train = train_pairs.iloc[:4*foldSize]
CV_pairs_test = train_pairs.iloc[4*foldSize:]

# Feature matrices
CV_train_X = CV_pairs_train[X_columns]
CV_test_X = CV_pairs_test[X_columns]

# Targets
CV_train_Y = CV_pairs_train['sameArtist']
CV_test_Y = CV_pairs_test['sameArtist']



Index(['pixelsx_1', 'pixelsy_1', 'bytes_per_pixel_1', 'aspect_ratio_1',
       'pixelsx_2', 'pixelsy_2', 'bytes_per_pixel_2', 'aspect_ratio_2'],
      dtype='object')

## scratch

['pixelsx_x',
 'pixelsy_x',
 'bytes_per_pixel_x',
 'aspect_ratio_x',
 'pixelsx_y',
 'pixelsy_y',
 'bytes_per_pixel_y',
 'aspect_ratio_y']

In [None]:
### perform 5-fold cross validation

# shuffle rows
# train_pairs = train_pairs.iloc[np.random.permutation(len(train_pairs))]

# for cv_num in range(0,5):
#     chunkSize = len(train_pairs)//5
#     if cv_num > 0:
#         test_start = cv_num*chunkSize
#         test_end = (cv_num+1)*chunkSize
#         train_start
#         train_end
#split training examples into 5 groups


#split training examples into CV_Train_X and CV_Train_Y, also create CV_Test