# PART 2: Compute a feature vector for each business

Summary:<br>

     This Kaggle competition is a Multiple instance learning (MIL) problem:      
     Each training example (a business) has multiple instances (photos).          
     We'll use the SimpleMI algorithm briefly mentioned in 
     https://en.wikipedia.org/wiki/Multiple_instance_learning
     
     In part 1, we've obtained a 4096-dim feature vector for each image.
     In part 2, for each business, we compute the mean feature vector over the images that belong to it
     (the code below also stores the per-dimension standard deviation alongside it).
     In this way, each business is represented by a single feature vector, i.e., the mean feature vector.

## Process business in the training set

In [1]:
data_root = '/media/sf_Yelp/input/'

import numpy as np
import pandas as pd 
import h5py
import time

train_photo_to_biz = pd.read_csv(data_root+'train_photo_to_biz_ids.csv')
train_labels = pd.read_csv(data_root+'train.csv').dropna()
train_labels['labels'] = train_labels['labels'].apply(lambda x: tuple(sorted(int(t) for t in x.split())))
train_labels.set_index('business_id', inplace=True)
biz_ids = train_labels.index.unique()
print "Number of business: ", len(biz_ids) ,   "(4 business with missing labels are dropped)"

## Load image features
f = h5py.File(data_root+'train_image_fc7features.h5','r')
train_image_features = np.copy(f['feature'])
f.close()


t= time.time()
## For each business, compute a feature vector 
df = pd.DataFrame(columns=['business','label','mean vector','std vector'])
index = 0
for biz in biz_ids:  
    
    label = train_labels.loc[biz]['labels']
    image_index = train_photo_to_biz[train_photo_to_biz['business_id']==biz].index.tolist()
    folder = data_root+'train_photo_folders/'  
    
    features = train_image_features[image_index]
    mean_feature =list(np.mean(features,axis=0))
    std_feature =list(np.std(features,axis=0))

    df.loc[index] = [biz, label, mean_feature, std_feature]
    index+=1
    if index%1000==0:
        print "Buisness processed: ", index, "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

with open(data_root+"train_biz_fc7features_extra.csv",'w') as f:  
    df.to_csv(f, index=False)


Number of business:  1996 (4 business with missing labels are dropped)
Buisness processed:  1000 Time passed:  5.4 sec


In [5]:
# Check file content
train_business = pd.read_csv(data_root+'train_biz_fc7features_extra.csv')
print train_business.shape
train_business[0:5]

(1996, 4)


Unnamed: 0,business,label,mean vector,std vector
0,1000.0,"(1, 2, 3, 4, 5, 6, 7)","[0.20032024, 0.44084537, 0.23249489, 0.3600976...","[0.71970487, 1.0737038, 0.72123754, 0.82178557..."
1,1001.0,"(0, 1, 6, 8)","[0.0013769998, 0.59398097, 0.55060995, 0.18394...","[0.0038947437, 1.3847774, 1.5573601, 0.5202878..."
2,100.0,"(1, 2, 4, 5, 6, 7)","[0.11435749, 0.033177156, 0.12572332, 0.539482...","[0.46961057, 0.23131144, 0.4261772, 1.0798392,..."
3,1006.0,"(1, 2, 4, 5, 6)","[0.075851507, 0.052600037, 0.059594199, 0.7067...","[0.34759533, 0.24104358, 0.18853067, 1.4747, 1..."
4,1010.0,"(0, 6, 8)","[0.39024171, 0.28424361, 0.0, 0.1655795, 0.460...","[0.88030589, 0.89885724, 0.0, 0.51703572, 0.71..."


## Process business in the test set

In [3]:
# Same setup as the training section: the directory holding the input CSV/HDF5
# files (the output CSVs are written there as well) and the required imports.
# NOTE(review): repeated here presumably so this section can run on its own.
data_root = '/media/sf_Yelp/input/'

import numpy as np
import pandas as pd 
import h5py
import time

In [4]:
test_photo_to_biz = pd.read_csv(data_root+'test_photo_to_biz_ids.csv')
biz_ids = test_photo_to_biz['business_id'].unique()

## Load image features
f = h5py.File(data_root+'test_image_fc7features.h5','r')
image_filenames = list(np.copy(f['photo_id']))
image_filenames = [name.split('/')[-1][:-4] for name in image_filenames]  #remove the full path and the str ".jpg"
image_features = np.copy(f['feature'])
f.close()
print "Number of business: ", len(biz_ids)

df = pd.DataFrame(columns=['business','mean vector','std vector'])
index = 0
t = time.time()

for biz in biz_ids:     
    
    image_ids = test_photo_to_biz[test_photo_to_biz['business_id']==biz]['photo_id'].tolist()  
    image_index = [image_filenames.index(str(x)) for x in image_ids]
     
    folder = data_root+'test_photo_folders/'            
    features = image_features[image_index]
    mean_feature =list(np.mean(features,axis=0))
    std_feature =list(np.std(features,axis=0))
    
    df.loc[index] = [biz, mean_feature, std_feature]
    index+=1
    if index%1000==0:
        print "Buisness processed: ", index, "Time passed: ", "{0:.1f}".format(time.time()-t), "sec"

with open(data_root+"test_biz_fc7features_extra.csv",'w') as f:  
    df.to_csv(f, index=False)

MemoryError: 

In [None]:
# Check file content
test_business = pd.read_csv(data_root+'test_biz_fc7features_extra.csv')
print test_business.shape
test_business[0:5]