In [1]:
# Will use pre-trained MXNet model for detecting cats.
# See https://gluon-cv.mxnet.io/build/examples_detection/demo_yolo.html#sphx-glr-build-examples-detection-demo-yolo-py
from gluoncv import model_zoo, data, utils
import mxnet
import numpy

# Utils used to extract images from tars
import tarfile
import os

# Used to display progress message
from tqdm import tqdm_notebook

# Used to read dataset
import pandas
import pyarrow.parquet as parquet

# Where the downloaded data are
input_path = './'
# Where to store results
output_path = './'

In [2]:
# Using the fastest model for detection
net = model_zoo.get_model('yolo3_darknet53_coco', pretrained=True)

In [3]:
# Detected class for cats
cat_class = net.classes.index('cat')

# Batches speedup inference significantly
batchSize = 37 * 2
numImages = 352758

# Iterate over images in tar, etract, predict, persist results
with tarfile.open(input_path + '/imagesThumbnails/testThumbnails.tar') as thumbnails:
    with open(output_path + '/catability.csv', 'w') as out:
        for batch in tqdm_notebook(range(int(numImages / batchSize))):
            # Accumulate batch results (mxnet has lazy computation model - 
            # actuall computation occure when you access the results)
            classes = list()
            scores = list()
            names = list()
            for i in range(batchSize):
                
                # Extract the image
                image = thumbnails.next()
                imagePath = path + image.name
                thumbnails.extract(image, path)
                
                # Load and pre-process
                x, _ = data.transforms.presets.yolo.load_test(imagePath, short=120)

                # Predict
                class_IDs, scores_v, _ = net(x)

                # Accumulate
                names.append(image.name)
                classes.append(class_IDs)
                scores.append(scores_v)
                
                # Cleanup
                os.remove(path + image.name)

            # Process accumulated batch results
            for class_IDs, scores, name in zip(classes,scores,names):
                cat_indexes = numpy.where(class_IDs.flatten()[0].asnumpy() == cat_class)[0]
                cat_index = cat_indexes[0] if len(cat_indexes) > 0 else 0
                if cat_index > 0:
                    out.write(name[0:32] + ',' + str(scores.flatten()[0].asnumpy()[cat_index]) + '\n')

HBox(children=(IntProgress(value=0, max=4767), HTML(value='')))




In [4]:
# Re-read catability predicts
catability = pandas.read_csv(output_path + './catability.csv', names = ['image', 'catability']).set_index('image')
catability.head(10)

Unnamed: 0_level_0,catability
image,Unnamed: 1_level_1
0000e46b41cdfa07fc87bf34434df10c,0.103371
00029b34539ce122831447a0d3d0d107,0.314821
000358b0f77efac8c47c3db6e9769546,0.01525
00070fe888e515cb3203fd58355e2ee0,0.069347
000ad9c574a24041122ccc2837750b8c,0.012116
0010166023be855faca9ca445f19323b,0.031435
00116c2631040f31b10fc9629d375efa,0.013658
0014c0dee4e6863c43d9fd9ce314526f,0.040489
001520e1ce876b064f6d1ae679fc6114,0.014823
001ae87e92221ed882d8d3fc04568e6d,0.253328


In [5]:
# Build a dictionary for faster access
catability_map = catability.to_dict('index')

In [6]:
# Read the test data
test = parquet.read_table(input_path + '/imagesTest', \
                          columns = ['instanceId_userId', 'instanceId_objectId','ImageId']).to_pandas()
test.head(10)

  labels, = index.labels


Unnamed: 0,instanceId_userId,instanceId_objectId,ImageId
0,602,25708174,[d215ecd5da0a088db89b97db3b6e811a]
1,602,24393746,[79eacdd5e803ad80d1c62de4047d74ad]
2,803,25400331,[771591947cf0a72be0702fe53e83fe11]
3,803,36083673,[86a85b7f9a4f528e87dececd0b400fac]
4,803,33713302,[244ed11c75d01ab4c22ece5b6ab72f39]
5,3065,22946340,[269d9e263e34733e92944380a12b23ac]
6,3065,38325772,[056a349e7c71694baab197b79fe9b7d3]
7,3065,36426827,[24fec4db493603e1d546bcb6b71a5ae3]
8,3065,37549050,[9bc47732b71a66f12248649eef291ed5]
9,3065,20719945,[8e16231d4adb8e91d4fc13d949f1e2ce]


In [7]:
# Weight each item by its maximal catability
test['score'] = test.ImageId.apply(lambda imgs: \
    -max([catability_map.get(x, {'catability' : 0.0})['catability'] for x in imgs]))
test.head(20)

Unnamed: 0,instanceId_userId,instanceId_objectId,ImageId,score
0,602,25708174,[d215ecd5da0a088db89b97db3b6e811a],-0.0
1,602,24393746,[79eacdd5e803ad80d1c62de4047d74ad],-0.0
2,803,25400331,[771591947cf0a72be0702fe53e83fe11],-0.0
3,803,36083673,[86a85b7f9a4f528e87dececd0b400fac],-0.0
4,803,33713302,[244ed11c75d01ab4c22ece5b6ab72f39],-0.0
5,3065,22946340,[269d9e263e34733e92944380a12b23ac],-0.0
6,3065,38325772,[056a349e7c71694baab197b79fe9b7d3],-0.0
7,3065,36426827,[24fec4db493603e1d546bcb6b71a5ae3],-0.0
8,3065,37549050,[9bc47732b71a66f12248649eef291ed5],-0.0
9,3065,20719945,[8e16231d4adb8e91d4fc13d949f1e2ce],-0.0


In [8]:
# Peek only needed columns and sort
result = test[["instanceId_userId", "instanceId_objectId", "score"]].sort_values(
    by=['instanceId_userId', 'score', 'instanceId_objectId'])
result.head(10)

Unnamed: 0,instanceId_userId,instanceId_objectId,score
528324,59,29096263,-0.0
528325,59,39141251,-0.0
876981,107,10501436,-0.0
876982,107,20368797,-0.0
876979,107,22664858,-0.0
876978,107,35377184,-0.0
876980,107,35614366,-0.0
876983,107,38038671,-0.0
876977,107,39090161,-0.0
1052308,158,30223730,-0.0


In [9]:
# Collect predictions for each user
submit = result.groupby("instanceId_userId")['instanceId_objectId'].apply(list)
submit.head(10)

instanceId_userId
59                                  [29096263, 39141251]
107    [10501436, 20368797, 22664858, 35377184, 35614...
158                                 [30223730, 33303760]
188                                 [30483180, 37981047]
254                                 [15200850, 24219820]
377    [16916552, 28882686, 36170070, 37077770, 38100...
527             [21421114, 38334144, 39098017, 39110394]
602                                 [24393746, 25708174]
803                       [25400331, 33713302, 36083673]
908                                 [21144274, 39092075]
Name: instanceId_objectId, dtype: object

In [10]:
# Persist the first submit
submit.to_csv(output_path + "/imagesSubmit.csv.gz", header = False, compression='gzip')