# PART 2: Compute bag distance

## Summary

    Each business is viewed as a bag of features.
    For each bag, we compute the Chamfer distance from it to every bag in the training set. 
       
Ref:  Equation (8) in <a href="http://158.109.8.37/files/Amo2013.pdf#Page=14">Amores' Survey paper</a>    

## Distance Between Two Bags

Each instance (one image feature) is a 2048-dimensional vector.

Let $A,B$ be bags of instances. The Chamfer distance of $A$ and $B$ is defined as:

$$ D(A,B) = \frac{1}{|A|}\sum_{a\in A}\min_{b\in B}||a-b|| +\frac{1}{|B|}\sum_{b\in B}\min_{a\in A}||a-b|| $$

In [1]:
from sklearn.metrics.pairwise import euclidean_distances
def dist_bags(A,B):
    """Return the Chamfer distance between two bags of instances.

    Every instance of ``A`` is matched to its nearest instance of ``B``
    (Euclidean distance) and those nearest distances are averaged; the same
    is done from ``B`` to ``A``; the two averages are added.

    Parameters
    ----------
    A, B : 2-D arrays (anything exposing ``.shape``), one instance per row.
        Each bag must contain at least one instance.

    Returns
    -------
    Scalar ``mean_a min_b ||a-b|| + mean_b min_a ||a-b||``.

    Raises
    ------
    ValueError
        If either bag has no instances: the minimum over an empty set of
        distances (and the division by the bag size) is undefined.
    """
    n_a = A.shape[0]
    n_b = B.shape[0]
    # Fail early with a clear message instead of an opaque error from the
    # distance computation or from np.min on a zero-size array.
    if n_a == 0 or n_b == 0:
        raise ValueError(
            "dist_bags needs two non-empty bags, got %d and %d instances"
            % (n_a, n_b))
    # dists[i, j] is the Euclidean distance between A[i] and B[j].
    dists = euclidean_distances(A,B)
    nearest_from_A = np.min(dists, axis=1)   # one value per instance of A
    nearest_from_B = np.min(dists, axis=0)   # one value per instance of B
    partA = np.sum(nearest_from_A)/n_a
    partB = np.sum(nearest_from_B)/n_b
    return partA+partB

## Compute mutual bag distances for bags in the training set ##

#### Load image features

In [2]:
# Directory holding the competition CSV files and the extracted HDF5 features.
data_root = '/home/ncchen/Kaggle-Yelp/input/'

import numpy as np
import pandas as pd 
import h5py
import time

# One row per training photo, with the business it belongs to.
train_photo_to_biz = pd.read_csv(data_root+'train_photo_to_biz_ids.csv')

# Business labels; rows with any missing value are dropped.
train_labels = pd.read_csv(data_root+'train.csv').dropna()
# "1 2 5" -> (1, 2, 5): a sorted tuple of integer label ids.
train_labels['labels'] = train_labels['labels'].apply(lambda x: tuple(sorted(int(t) for t in x.split())))
train_labels = train_labels.set_index('business_id')
trainbiz_ids = train_labels.index.unique()
# Single-argument print keeps the output identical on Python 2 and Python 3.
print("Number of business:  %d (4 business with missing labels are dropped)\n" % len(trainbiz_ids))

## Load image features
# The context manager closes the HDF5 file even if one of the reads fails.
with h5py.File(data_root+'train_image_Resfeatures.h5','r') as f:
    train_image_ids = np.copy(f['photo_id'])
    train_image_features = np.copy(f['feature'])

print("Image features:  %s" % (train_image_features.shape,))

Number of business:  1996 (4 business with missing labels are dropped)

Image features:  (234842, 2048)


#### For each business,  collect its bag of instance features #

In [3]:
# Build train_bag: business id -> 2-D array holding the feature rows of all
# photos of that business (its "bag of instances").
#
# NOTE(review): rows of train_photo_to_biz are assumed to line up one-to-one
# with rows of train_image_features (train_image_ids is never consulted) --
# confirm that the CSV and the HDF5 file share the same photo order.

# Group the photo table once instead of scanning it again for every business.
# .indices maps each business id to the row positions of its photos.
photo_rows_by_biz = train_photo_to_biz.groupby('business_id').indices
no_photo_rows = np.empty(0, dtype=np.intp)

train_bag={}
for biz in trainbiz_ids:
    # A business with no photo gets an empty bag, as a non-matching mask would give.
    image_index = photo_rows_by_biz.get(biz, no_photo_rows)
    train_bag[biz] = train_image_features[image_index]

#### For every bag in the training set, compute its distance to every bag in the training set

In [4]:
# Symmetric table of Chamfer distances between every pair of training bags.
# Rows and columns are both indexed by training business id.
train_bags_df = pd.DataFrame(columns=trainbiz_ids, index = trainbiz_ids)
import time
t1= time.time()
n_train = len(trainbiz_ids)
count = 0 
for i in range(n_train):
    row = trainbiz_ids[i]
    # .at[row, col] writes straight into the frame.  The chained form
    # df.loc[row][col] = value assigns into an intermediate object and is
    # not guaranteed to modify the frame.
    train_bags_df.at[row, row] = 0
    # The distance is symmetric: compute the upper triangle only and mirror it.
    for j in range(i+1, n_train):
        col = trainbiz_ids[j]
        pair_dist = dist_bags(train_bag[row],train_bag[col])
        train_bags_df.at[row, col] = pair_dist
        train_bags_df.at[col, row] = pair_dist
    count = i+1
    if count%200==0 or count==n_train:
        print('Bags processed: %d, Time passed: %s seconds' % (count, "{0:.0f}".format(time.time()-t1)))

Bags processed: 200, Time passed: 429 seconds
Bags processed: 400, Time passed: 789 seconds
Bags processed: 600, Time passed: 1096 seconds
Bags processed: 800, Time passed: 1366 seconds
Bags processed: 1000, Time passed: 1598 seconds
Bags processed: 1200, Time passed: 1788 seconds
Bags processed: 1400, Time passed: 1936 seconds
Bags processed: 1600, Time passed: 2037 seconds
Bags processed: 1800, Time passed: 2101 seconds
Bags processed: 1996, Time passed: 2119 seconds


#### Print the results.

In [7]:
# Preview the first five rows of the training distance table.
train_bags_df.iloc[:5]

Unnamed: 0,1000,1001,100,1006,1010,101,1011,1012,1014,1015,...,982,985,988,989,99,991,993,997,998,999
1000,0.0,43.635,38.7868,38.1584,37.5219,37.0114,37.3985,39.2988,40.6927,35.9434,...,38.9271,40.0304,42.0761,37.0212,36.2376,37.591,38.6389,40.5294,36.8481,37.5152
1001,43.635,0.0,42.2867,39.9622,40.8257,40.9876,38.9786,40.4867,43.1006,37.7113,...,40.1981,41.4028,43.7319,40.2264,40.7268,40.0207,37.7635,40.3887,38.345,40.087
100,38.7868,42.2867,0.0,36.5512,39.437,35.3241,36.6776,36.6791,38.341,35.854,...,38.5173,37.3074,41.1436,36.8412,35.9523,35.3644,38.4662,40.3305,36.5669,37.421
1006,38.1584,39.9622,36.5512,0.0,37.8729,35.9911,36.1379,37.8704,39.3629,35.7869,...,37.8669,36.0441,42.4736,35.6409,35.493,35.8046,37.2314,39.9787,36.3664,35.6067
1010,37.5219,40.8257,39.437,37.8729,0.0,37.2431,35.6252,37.494,40.498,36.3187,...,39.9483,38.9902,41.3535,35.6709,36.258,34.8482,37.9472,40.1095,35.956,38.3824


In [8]:
# Write the training distance table to CSV, keeping the business-id index column.
train_distance_csv = data_root+"train_bag_distance_ResFeatures.csv"
with open(train_distance_csv,'w') as csv_file:
    train_bags_df.to_csv(csv_file, index=True)

## Compute bag distances for bags in the test set ##

#### Load image features from both training and test sets

In [5]:
# Directory holding the competition CSV files and the extracted HDF5 features.
data_root = '/home/ncchen/Kaggle-Yelp/input/'

import numpy as np
import pandas as pd 
import h5py
import time


# One row per training photo, with the business it belongs to.
train_photo_to_biz = pd.read_csv(data_root+'train_photo_to_biz_ids.csv')
# Business labels; rows with any missing value are dropped.
train_labels = pd.read_csv(data_root+'train.csv').dropna()
# "1 2 5" -> (1, 2, 5): a sorted tuple of integer label ids.
train_labels['labels'] = train_labels['labels'].apply(lambda x: tuple(sorted(int(t) for t in x.split())))
train_labels = train_labels.set_index('business_id')
trainbiz_ids = train_labels.index.unique()
## Load train-image features
# The context manager closes the HDF5 file even if one of the reads fails.
with h5py.File(data_root+'train_image_Resfeatures.h5','r') as f:
    train_image_ids = np.copy(f['photo_id'])
    train_image_features = np.copy(f['feature'])

# One row per (test photo, business) pair.
test_photo_to_biz = pd.read_csv(data_root+'test_photo_to_biz.csv')
testbiz_ids = test_photo_to_biz['business_id'].unique()

## Load test-image features
with h5py.File(data_root+'test_image_Resfeatures.h5','r') as f:
    test_image_ids = list(np.copy(f['photo_id']))
    test_image_features = np.copy(f['feature'])
test_image_ids = [int(name.split('/')[-1][:-4]) for name in test_image_ids]  #remove the full path and the str ".jpg"
# Single-argument print keeps the output identical on Python 2 and Python 3.
print("Number of test business:  %d" % len(testbiz_ids))

Number of test business:  10000


#### For each bag in the test set, compute its distance to each bag in the training set.

In [6]:
# Table of Chamfer distances from every test bag (rows) to every training
# bag (columns).
test_bags_df = pd.DataFrame(columns=trainbiz_ids, index = testbiz_ids)
import time
t1= time.time()

# photo id -> row position in test_image_features, built once so each lookup
# is O(1) instead of a linear list.index() scan.  setdefault keeps the first
# position of a repeated id, which is what list.index() returns.
# An unknown photo id now raises KeyError (list.index raised ValueError).
photo_row = {}
for position, photo_id in enumerate(test_image_ids):
    photo_row.setdefault(photo_id, position)

# business id -> its photo ids in file order, from a single pass over the
# table instead of one boolean-mask scan per business.
photos_by_biz = {}
for biz, photo_ids in test_photo_to_biz.groupby('business_id', sort=False)['photo_id']:
    photos_by_biz[biz] = photo_ids.tolist()

n_test = len(testbiz_ids)
count = 0 
for i in range(n_test):
    row = testbiz_ids[i]
    image_ids = photos_by_biz.get(row, [])
    image_index = [photo_row[x] for x in image_ids]
    test_bag= test_image_features[image_index]
    for col in trainbiz_ids:
        # .at[row, col] writes straight into the frame; the chained form
        # df.loc[row][col] = value is not guaranteed to modify it.
        test_bags_df.at[row, col] = dist_bags(test_bag,train_bag[col])
    count = i+1
    if count%1000==0 or count==n_test:
        print('Bags processed: %d, Time passed: %s seconds' % (count, "{0:.0f}".format(time.time()-t1)))

Bags processed: 1000, Time passed: 2196 seconds
Bags processed: 2000, Time passed: 4827 seconds
Bags processed: 3000, Time passed: 7389 seconds
Bags processed: 4000, Time passed: 10240 seconds
Bags processed: 5000, Time passed: 13080 seconds
Bags processed: 6000, Time passed: 15804 seconds
Bags processed: 7000, Time passed: 18579 seconds
Bags processed: 8000, Time passed: 21315 seconds
Bags processed: 9000, Time passed: 24086 seconds
Bags processed: 10000, Time passed: 26968 seconds


In [7]:
# Preview the first five rows of the test-to-train distance table.
test_bags_df.iloc[:5]

Unnamed: 0,1000,1001,100,1006,1010,101,1011,1012,1014,1015,...,982,985,988,989,99,991,993,997,998,999
003sg,37.3192,38.6761,36.4388,36.1272,36.1035,35.667,34.8266,36.0612,38.3432,34.8344,...,37.4554,36.5954,39.7357,35.3395,35.0505,35.0547,35.6809,38.2772,35.0722,36.0168
00er5,37.4025,38.959,36.4132,36.6614,35.6592,35.859,35.2402,36.2395,38.1755,35.1803,...,37.3321,37.4789,39.1052,35.7292,35.4402,35.2169,35.9398,38.3872,35.7211,36.4379
00kad,38.305,38.5915,37.3864,36.9925,36.6515,36.5316,35.9556,37.7083,39.9014,35.8804,...,37.726,37.6514,40.6317,36.2247,36.0693,36.1183,36.601,38.8725,36.5032,37.1432
00mc6,40.5048,42.7541,39.5418,40.1939,39.0241,38.9807,38.2188,40.1663,42.5303,39.2206,...,40.3932,41.4542,42.5933,38.8546,38.4171,39.7271,40.1835,41.1038,38.3295,40.259
00q7x,39.7721,42.271,39.0635,38.3774,37.6874,37.4987,37.6337,38.843,40.1378,36.9648,...,39.1877,38.9009,41.8175,37.9562,36.998,37.3905,37.9149,40.5239,38.234,38.0665


In [8]:
# Write the test-to-train distance table to CSV, keeping the business-id index column.
test_distance_csv = data_root+"test_bag_distance_ResFeatures.csv"
with open(test_distance_csv,'w') as csv_file:
    test_bags_df.to_csv(csv_file, index=True)