## Image Retrieval
The goal of this notebook is to retrieve the relevant images based on a set of ads provided by IST.

### Inputs
1. ads.jl   
This is a JSON lines file of ads each with an `_id` attribute.   

### Outputs
1. image_documents.jl   
These are the child documents (images) related to all of the ads from ads.jl.    
2. image_url_to_valid_sha1.csv    
This is a mapping from the URL each image is stored at to the image's SHA1. Only images
that have been deemed valid by `smqtk-check-images` are included in this file.

In [None]:
# Files consumed by this notebook.
__depends__ = ['ads.jl']

# Files produced by this notebook (order matters: it is unpacked below).
__dest__ = [
    'ad_ids.txt',
    'image_documents.jl',
    'image_url_to_valid_sha1.csv',
    'image_documents_retrieval.joblog',
    'image_urls.txt',
    'image_retrieval.joblog',
]

# Readable aliases for the entries above, bound positionally.
(ADS,) = __depends__
(AD_IDS,
 IMAGE_DOCUMENTS,
 IMAGE_URL_TO_VALID_SHA1,
 IMAGE_DOCUMENTS_JOBLOG,
 IMAGE_URLS,
 IMAGE_JOBLOG) = __dest__

In [None]:
import csv

In [None]:
!jq -r '._id' $ADS | sort --unique > $AD_IDS

In [None]:
# find documents from ES that are children of the ad ids (meaning, images)
!parallel --joblog $IMAGE_DOCUMENTS_JOBLOG \
          --retries 3 \
          --arg-file $AD_IDS \
          --max-args 50 \
          --jobs 10 \
        python ../scripts/get_es_child_documents.py > $IMAGE_DOCUMENTS 

In [None]:
!jq -r '.obj_stored_url' $IMAGE_DOCUMENTS | sort --unique > $IMAGE_URLS

num_image_urls = !wc -l $IMAGE_URLS
print num_image_urls

In [None]:
# download images 
!parallel --joblog $IMAGE_JOBLOG \
          --retries 3 \
          --arg-file $IMAGE_URLS \
          --max-args 1 \
          --jobs 20 \
        python ../scripts/image_download.py > image_url_to_sha1.csv

In [None]:
# validate images
!find images -type f > image_paths.txt
valid_image_sha1s = !smqtk-check-images --file-list image_paths.txt | cut -d, -f2

In [None]:
# Copy only the (url, sha1) rows whose SHA1 passed validation into
# IMAGE_URL_TO_VALID_SHA1.
#
# `valid_image_sha1s` comes from a shell capture and is a list, so testing
# membership against it directly is a linear scan per CSV row. Building a set
# once makes each lookup constant time without changing which rows are kept.
valid_sha1_lookup = set(valid_image_sha1s)

with open('image_url_to_sha1.csv') as infile, open(IMAGE_URL_TO_VALID_SHA1, 'w') as outfile:
    writer = csv.writer(outfile, lineterminator='\n')

    # Every input row is expected to have exactly two columns: url, sha1.
    for (image_url, sha1) in csv.reader(infile):
        if sha1 in valid_sha1_lookup:
            writer.writerow([image_url, sha1])