## Data Preparation   
The goal of this notebook is to take the inputs described below and generate a single CSV containing only the information required for image-level processing and ad/cluster-level aggregation, while performing basic sanity checks along the way.

### Inputs
1. CP1_train_ads_labelled_fall2016.jsonl   
This is a json lines file of ads that contain _id, class, cluster_id
2. es_child_documents.jl   
This is a json lines file of image objects which contain obj_stored_url and obj_parent
3. image_url_to_valid_sha1.csv    
This is a csv containing 2 columns: an image url (obj_stored_url), and a sha1 checksum of the file

### Outputs
1. CP1_data.csv    
This is a csv file containing 4 columns: cluster_id, ad_id, image_sha, class

In [None]:
# Input files this notebook reads (__depends__) and the file it writes
# (__dest__), declared up front.
# NOTE(review): presumably consumed by an external dependency/build tool that
# inspects notebooks -- confirm before changing the form of these assignments.
__depends__ = ['CP1_train_ads_labelled_fall2016.jsonl',
               'es_child_documents.jl',
               'image_url_to_valid_sha1.csv']
__dest__ = ['CP1_data.csv']

In [None]:
# Labelled ads, one JSON document per line; the fields read below are
# _id, class and cluster_id.
OFFICIAL_DATA_FILE = 'CP1_train_ads_labelled_fall2016.jsonl'

In [None]:
import csv
import json
import numpy as np

from collections import Counter, defaultdict

### Sanity checking the official data

Assumptions:   
1) The relationship between ad_id and cluster_id is many -> 1    
2) Each cluster_id belongs to exactly one class (the relationship between cluster_id and class is many -> 1)

Abbreviate the data to what we need: ad ids, cluster ids, and classes.

In [None]:
# Two lookups keyed by cluster id, filled in a single pass over the labelled
# ads: the ad ids belonging to the cluster, and the class label(s) seen for it.
cluster_id_to_ad_ids = defaultdict(set)
cluster_id_to_class = defaultdict(set)

with open(OFFICIAL_DATA_FILE) as labelled_ads:
    for raw_record in labelled_ads:
        ad = json.loads(raw_record.strip())
        owning_cluster = ad['cluster_id']

        cluster_id_to_ad_ids[owning_cluster].add(ad['_id'])
        cluster_id_to_class[owning_cluster].add(ad['class'])

In [None]:
# Sanity check that each cluster has at least one ad
for cluster_id, ad_ids in cluster_id_to_ad_ids.items():
    assert len(ad_ids) > 0, 'cluster %r has no ads' % (cluster_id,)

# Sanity check no ad falls in more than one cluster (assumption 1).
# The per-cluster collections are sets, so the sum of their sizes always equals
# the length of their concatenation; comparing those two numbers can never
# fail. The meaningful comparison is against the number of *distinct* ad ids
# across all clusters: an ad listed under two clusters makes the totals differ.
all_ad_ids = []
for ad_ids in cluster_id_to_ad_ids.values():
    all_ad_ids.extend(ad_ids)
num_unique_ad_ids = len(set(all_ad_ids))

assert len(all_ad_ids) == num_unique_ad_ids, \
    '%d ad id occurrence(s) are repeated across clusters' % (len(all_ad_ids) - num_unique_ad_ids)

# Sanity check that each cluster only belongs to one class (assumption 2)
for cluster_id, cls in cluster_id_to_class.items():
    assert len(cls) == 1, 'cluster %r has classes %r' % (cluster_id, cls)

### Official data descriptions

In [None]:
# Count clusters per class; a cluster's class is stored as a one-element set.
class_sets = list(cluster_id_to_class.values())
num_positive_clusters = sum(1 for labels in class_sets if labels == {1})
num_negative_clusters = sum(1 for labels in class_sets if labels == {0})

print('%d clusters (%d positive, %d negative)' % (len(cluster_id_to_class),
                                                  num_positive_clusters,
                                                  num_negative_clusters))

In [None]:
# Number of ads in each cluster whose class set is {1} (positive clusters).
ads_per_positive_cluster = []
for cid, ad_ids in cluster_id_to_ad_ids.items():
    if cluster_id_to_class[cid] == {1}:
        ads_per_positive_cluster.append(len(ad_ids))

# %d truncates the (float) median and average to whole numbers when printing.
positive_summary = (min(ads_per_positive_cluster),
                    np.median(ads_per_positive_cluster),
                    np.average(ads_per_positive_cluster),
                    max(ads_per_positive_cluster),
                    sum(ads_per_positive_cluster))
print('min/med/avg/max/total ads per positive cluster: %d/%d/%d/%d/%d' % positive_summary)

In [None]:
# Number of ads in each cluster whose class set is {0} (negative clusters).
ads_per_negative_cluster = []
for cid, ad_ids in cluster_id_to_ad_ids.items():
    if cluster_id_to_class[cid] == {0}:
        ads_per_negative_cluster.append(len(ad_ids))

# %d truncates the (float) median and average to whole numbers when printing.
negative_summary = (min(ads_per_negative_cluster),
                    np.median(ads_per_negative_cluster),
                    np.average(ads_per_negative_cluster),
                    max(ads_per_negative_cluster),
                    sum(ads_per_negative_cluster))
print('min/med/avg/max/total ads per negative cluster: %d/%d/%d/%d/%d' % negative_summary)

### Associating Imagery

The shas present here have already been vetted by SMQTK.

In [None]:
ad_id_to_shas = defaultdict(set)
ad_id_to_image_urls = defaultdict(set)

# Collect, per ad, the stored urls of its images. An image document's
# obj_parent is either a single ad id or a list of ad ids.
with open('es_child_documents.jl') as infile:
    for line in infile:
        document = json.loads(line.strip())

        parents = document['obj_parent']
        ad_ids = parents if isinstance(parents, list) else [parents]

        for ad_id in ad_ids:
            # Images without a stored url cannot be matched to a sha; skip them.
            if document['obj_stored_url']:
                ad_id_to_image_urls[ad_id].add(document['obj_stored_url'])

# Map every image url to the sha1 checksum of its file.
image_url_to_sha = {}
with open('image_url_to_valid_sha1.csv') as infile:
    for (image_url, sha1) in csv.reader(infile):
        image_url_to_sha[image_url] = sha1

# Translate each ad's urls into shas.
# There might not be a sha1 for an image url since some shas were invalid
# (from SMQTK). Only that url is skipped: looking the urls up one by one keeps
# the ad's remaining valid images instead of discarding all of them when a
# single url is missing.
for (ad_id, image_urls) in ad_id_to_image_urls.items():
    valid_shas = set(image_url_to_sha[url] for url in image_urls
                     if url in image_url_to_sha)
    # Ads left without any valid image get no entry at all, so every value in
    # ad_id_to_shas is a non-empty set.
    if valid_shas:
        ad_id_to_shas[ad_id] = valid_shas

In [None]:
# Sanity check that each ad has at least 1 sha
for shas in ad_id_to_shas.values():
    assert len(shas) > 0

# Sanity check that each cluster has at least 1 ad with at least 1 sha.
# Offending cluster ids are printed rather than asserted on.
for (cluster_id, ad_ids) in cluster_id_to_ad_ids.items():
    cluster_shas = set()
    for ad_id in ad_ids:
        # ad_id_to_shas is a defaultdict: indexing it with an ad that has no
        # images would insert an empty set, which skews the per-ad statistics
        # computed afterwards and makes the assertion above fail when this cell
        # is run again. .get() reads without inserting.
        cluster_shas |= ad_id_to_shas.get(ad_id, set())

    if not cluster_shas:
        print(cluster_id)

In [None]:
# Number of distinct image shas recorded for each ad.
shas_per_ad = [len(ad_shas) for ad_shas in ad_id_to_shas.values()]

# %d truncates the (float) median and average to whole numbers when printing.
image_count_summary = (min(shas_per_ad),
                       np.median(shas_per_ad),
                       np.average(shas_per_ad),
                       max(shas_per_ad),
                       sum(shas_per_ad))
print('min/med/avg/max/total images per ad: %d/%d/%d/%d/%d' % image_count_summary)

Create one CSV with all the relevant information, in the format of:    
cluster_id, ad_id, image_sha, class

In [None]:
# Write one row per (cluster, ad, image): cluster_id, ad_id, image_sha, class.
with open('CP1_data.csv', 'w') as outfile:
    writer = csv.writer(outfile, lineterminator='\n')

    for (cluster_id, ad_ids) in cluster_id_to_ad_ids.items():
        # The class is stored as a set per cluster; take its first element
        # once per cluster instead of once per written row.
        cluster_class = list(cluster_id_to_class[cluster_id])[0]

        for ad_id in ad_ids:
            # .get() rather than indexing: ad_id_to_shas is a defaultdict and
            # indexing would insert an empty set for every ad without images.
            # Such ads simply contribute no rows.
            for image_sha in ad_id_to_shas.get(ad_id, ()):
                writer.writerow([cluster_id, ad_id, image_sha, cluster_class])

In [None]:
# Find shas that are marked positive and negative, i.e. images that occur
# under more than one class in the generated CSV.
sha_to_class = defaultdict(set)

with open('CP1_data.csv') as prepared_rows:
    for (_cluster, _ad, image_sha, label) in csv.reader(prepared_rows):
        sha_to_class[image_sha].add(label)

bad_shas = set()
for image_sha, labels in sha_to_class.items():
    if len(labels) > 1:
        bad_shas.add(image_sha)

print(len(bad_shas))

In [None]:
# Group the distinct image shas by cluster, read back from the generated CSV
# (this cell only builds the mapping; nothing is printed here).
clusters = defaultdict(set)

with open('CP1_data.csv') as prepared_rows:
    for (owning_cluster, _ad, image_sha, _label) in csv.reader(prepared_rows):
        clusters[owning_cluster].add(image_sha)

In [None]:
def _num_images(cluster_item):
    """Sort key: number of distinct image shas in a (cluster_id, shas) pair."""
    return len(cluster_item[1])


# Largest clusters (by number of distinct images) first.
clusters_by_size = sorted(clusters.items(), key=_num_images, reverse=True)

# NOTE(review): cluster ids read back from the CSV are strings, while
# cluster_id_to_class is a defaultdict keyed by the ids parsed from JSON; if
# those are not strings the lookup below yields an empty set -- confirm.
for (cluster_id, shas) in clusters_by_size:
    print('%s %s %d' % (cluster_id, cluster_id_to_class[cluster_id], len(shas)))