# PART 2: Compute a feature vector for each business

## Summary:<br>

     This Kaggle competition is a Multiple instance learning (MIL) problem:      
     Each training example (a business) has multiple instances (photos).          
     We'll use the SimpleMI algorithm briefly mentioned in 
     https://en.wikipedia.org/wiki/Multiple_instance_learning
     
     In part 1, we've obtained a 2048-dim feature vector for each image.
     In part 2, for each business, we will compute the mean feature vector among images that belong to it.
     In this way, each business is represented by a single feature vector, i.e., the mean feature vector of its images.

## Process businesses in the training set

In [4]:
data_root = '/Volumes/My Passport/yelp/'

import numpy as np
import pandas as pd 
import h5py
import time

train_photo_to_biz = pd.read_csv(data_root+'train_photo_to_biz_ids.csv')
train_labels = pd.read_csv(data_root+'train.csv').dropna()
train_labels['labels'] = train_labels['labels'].apply(lambda x: tuple(sorted(int(t) for t in x.split())))
train_labels.set_index('business_id', inplace=True)
biz_ids = train_labels.index.unique()
print "Number of business: ", len(biz_ids) ,   "(4 business with missing labels are dropped)"

## Load image features
f = h5py.File(data_root+'train_image_HOGfeatures.h5','r')
train_image_features = np.copy(f['feature'])
f.close()


t= time.time()
## For each business, compute a feature vector 
df = pd.DataFrame(columns=['business','label','feature vector'])
index = 0
for biz in biz_ids:  
    
    label = train_labels.loc[biz]['labels']
    image_index = train_photo_to_biz[train_photo_to_biz['business_id']==biz].index.tolist()
    folder = data_root+'train_photo_folders/'  
    
    features = train_image_features[image_index]
    mean_feature =list(np.mean(features,axis=0))

    df.loc[index] = [biz, label, mean_feature]
    index+=1
    if index%1000==0:
        print "Buisness processed: ", index, "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

with open(data_root+"train_biz_HOGfeatures.csv",'w') as f:  
    df.to_csv(f, index=False)

Number of business:  1996 (4 business with missing labels are dropped)
Buisness processed:  1000 Time passed:  5.0 sec


In [5]:
# Check file content
train_business = pd.read_csv(data_root+'train_biz_HOGfeatures.csv')
print train_business.shape
train_business[0:5]

(1996, 3)


Unnamed: 0,business,label,feature vector
0,1000,"(1, 2, 3, 4, 5, 6, 7)","[0.14005625, 0.085037857, 0.09016072, 0.126218..."
1,1001,"(0, 1, 6, 8)","[0.12032899, 0.050164513, 0.077049561, 0.11339..."
2,100,"(1, 2, 4, 5, 6, 7)","[0.14770095, 0.077750131, 0.087755665, 0.13897..."
3,1006,"(1, 2, 4, 5, 6)","[0.16893548, 0.078272745, 0.12111282, 0.085566..."
4,1010,"(0, 6, 8)","[0.17430022, 0.070603535, 0.11224926, 0.115906..."


## Process businesses in the test set

In [6]:
# Root directory holding the competition CSV files and the extracted-feature HDF5 files.
# This repeats the data root and imports of the training-set cell above.
# NOTE(review): presumably duplicated so the test-set section can be run on its own
# from a fresh kernel -- confirm before removing.
data_root = '/Volumes/My Passport/yelp/'

import numpy as np
import pandas as pd 
import h5py
import time

In [7]:
test_photo_to_biz = pd.read_csv(data_root+'test_photo_to_biz.csv')
biz_ids = test_photo_to_biz['business_id'].unique()

## Load image features
f = h5py.File(data_root+'test_image_HOGfeatures.h5','r')
image_filenames = list(np.copy(f['photo_id']))
image_filenames = [name.split('/')[-1][:-4] for name in image_filenames]  #remove the full path and the str ".jpg"
image_features = np.copy(f['feature'])
f.close()
print "Number of business: ", len(biz_ids)

df = pd.DataFrame(columns=['business','feature vector'])
index = 0
t = time.time()

for biz in biz_ids:     
    
    image_ids = test_photo_to_biz[test_photo_to_biz['business_id']==biz]['photo_id'].tolist()  
    image_index = [image_filenames.index(str(x)) for x in image_ids]
     
    folder = data_root+'test_photo_folders/'            
    features = image_features[image_index]
    mean_feature =list(np.mean(features,axis=0))

    df.loc[index] = [biz, mean_feature]
    index+=1
    if index%1000==0:
        print "Buisness processed: ", index, "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

with open(data_root+"test_biz_HOGfeatures.csv",'w') as f:  
    df.to_csv(f, index=False)

Number of business:  10000
Buisness processed:  1000 Time passed:  136.4 sec
Buisness processed:  2000 Time passed:  380.0 sec
Buisness processed:  3000 Time passed:  620.8 sec
Buisness processed:  4000 Time passed:  906.5 sec
Buisness processed:  5000 Time passed:  1185.3 sec
Buisness processed:  6000 Time passed:  1448.0 sec
Buisness processed:  7000 Time passed:  1718.8 sec
Buisness processed:  8000 Time passed:  1986.8 sec
Buisness processed:  9000 Time passed:  2258.4 sec
Buisness processed:  10000 Time passed:  2546.5 sec


In [8]:
# Check file content
test_business = pd.read_csv(data_root+'test_biz_HOGfeatures.csv')
print test_business.shape
test_business[0:5]

(10000, 2)


Unnamed: 0,business,feature vector
0,003sg,"[0.14029846, 0.097122177, 0.10904062, 0.155347..."
1,00er5,"[0.13802128, 0.08558739, 0.10501912, 0.1498876..."
2,00kad,"[0.14562644, 0.10004855, 0.12065777, 0.1286887..."
3,00mc6,"[0.14964491, 0.12405226, 0.1319675, 0.1532997,..."
4,00q7x,"[0.11541133, 0.10103348, 0.13063087, 0.1571736..."
