<i> This notebook presents a greedy heuristic approach to the Google Hash Code photo slideshow optimization problem; more details can be found at https://www.kaggle.com/c/hashcode-photo-slideshow/data. In this approach, a priority queue of size (n×n/2, 1) is created (implemented below as a sorted dictionary of deques) to hold the "interest metric" value of each pair of images. The highest-value pairs are repeatedly appended to the slideshow sequence until all images have been added. After the individual horizontal and vertical images have been ordered, a meta-level optimization is performed to pair vertical images together. While this may miss the global optimum, because double-vertical slides are not considered in the initial ordering, the split substantially reduces the order of magnitude of the computation. </i>

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import gc
import psutil
import pickle
import time
from joblib import Parallel, delayed
import collections
import h5py
import gc
from sortedcontainers import SortedDict, SortedList

In [2]:
# Read the photo file as raw bytes, one entry per line, and drop the first
# line (presumably the photo-count header -- confirm against the data format).
with open('d_pet_pictures.txt', 'rb') as f:
    data = list(f)[1:]

<h2> Create Greedy Algorithm Helper Functions </h2>

In [3]:
def generate_tagdict(data):
    """Build the tag sets of the horizontal and vertical photos.

    Parameters
    ----------
    data : sequence of bytes
        One UTF-8 encoded photo description per entry.  Following the Hash
        Code input format, each entry is laid out as
        ``<orientation> <tag count> <tag> <tag> ...``.

    Returns
    -------
    (hdict, vdict) : tuple of dict
        Both map ``str(i)`` -- the position of the photo in ``data`` -- to the
        set of its tags.  Photos whose orientation token is ``"H"`` go to
        ``hdict``; every other photo goes to ``vdict``.
    """
    hdict, vdict = {}, {}
    for i, line in enumerate(data):
        tokens = [token.decode('utf-8') for token in line.split()]
        if not tokens:
            # Blank line (e.g. a stray trailing newline): nothing to record.
            continue
        # tokens[1] is the tag count, not a tag.  Keeping it in the set would
        # make photos with the same number of tags look like they share a tag.
        tags = set(tokens[2:])
        if tokens[0] == "H":
            hdict[str(i)] = tags
        else:
            vdict[str(i)] = tags
    return hdict, vdict

In [4]:
def calculate_interest(name1, name2):
    """Return the interest score of the transition between two photos.

    Relies on the module-level ``hdict`` and ``vdict`` dictionaries, which map
    a photo key to its set of tags.  Each name has the form ``"H:<key>"`` or
    ``"V:<key>"``: the part before the first colon selects the dictionary
    (``"H"`` -> ``hdict``, anything else -> ``vdict``) and the part after it
    is the key looked up in that dictionary.

    The score is the smallest of three counts: tags shared by both photos,
    tags found only in the first photo, tags found only in the second photo.
    """
    def _tags_of(name):
        # Split once and reuse the pieces for both the prefix and the key.
        parts = name.split(":")
        table = hdict if parts[0] == "H" else vdict
        return table[parts[1]]

    first_tags = _tags_of(name1)
    second_tags = _tags_of(name2)
    shared = len(first_tags & second_tags)
    only_first = len(first_tags - second_tags)
    only_second = len(second_tags - first_tags)
    return min(shared, only_first, only_second)

In [5]:
#Generate interest master list for subset of arrays
#Memory limitations prohibit a 90000x90000 master matrix
def generate_interest_dict(keys):
    """Bucket every pair of photos with a non-zero interest score by score.

    Parameters
    ----------
    keys : indexable sequence of photo names (NumPy array or list)
        Names accepted by ``calculate_interest``.

    Returns
    -------
    SortedDict
        Maps an interest score to a deque of ``(i, j)`` index pairs into
        ``keys`` (``i < j``) having that score.  Pairs with a score of 0 are
        not stored.

    Progress and the total elapsed time are printed while running.
    """
    start = time.time()
    nkeys = len(keys)
    # Report progress after 1/5, 2/5, 3/5 and 4/5 of the rows.  Integer
    # arithmetic keeps the checkpoints exact for any number of keys (a
    # float-stepped range only matches when nkeys is a multiple of 5 and
    # fails outright when nkeys is 0).
    checkpoints = {nkeys * k // 5 for k in range(1, 5)}
    interestdict = SortedDict()
    for i in range(nkeys):
        first = keys[i]
        for j in range(i + 1, nkeys):
            value = calculate_interest(first, keys[j])
            if value == 0:
                continue
            bucket = interestdict.get(value)
            if bucket is None:
                interestdict[value] = collections.deque([(i, j)])
            else:
                bucket.append((i, j))
        if (i + 1) in checkpoints:
            print("Calculation Checkpoint of "+str(i+1)+":"+str(time.time()-start))
    print(time.time()-start)
    return interestdict

In [6]:
def getmaxfn(interestdict, lookupdict, keys):
    """Pop the highest-interest pair whose photos can both still be linked.

    Parameters
    ----------
    interestdict : SortedDict
        Maps an interest score to a deque of ``(i, j)`` index pairs into
        ``keys``.  Candidates are consumed from the highest score downwards;
        every candidate that is examined is removed, whether or not it is
        accepted, and a score whose deque becomes empty is deleted.
    lookupdict : dict
        Maps a photo name to the number of accepted pairs it already appears
        in.  A photo with a count of 2 cannot take part in another pair.
        The counts of the two photos of the accepted pair are incremented.
    keys : indexable sequence
        Photo names, addressed by the indices stored in ``interestdict``.

    Returns
    -------
    tuple or None
        ``(keys[i], keys[j])`` for the accepted pair, or ``None`` when
        ``interestdict`` holds no acceptable pair.
    """
    # Re-check for exhaustion on every iteration: the dictionary can run dry
    # while unusable candidates are being discarded.
    while len(interestdict) > 0:
        curval = interestdict.keys()[-1]
        bucket = interestdict[curval]
        first, second = bucket.pop()
        if not bucket:
            del interestdict[curval]
        name1, name2 = keys[first], keys[second]
        count1, count2 = lookupdict.get(name1, 0), lookupdict.get(name2, 0)
        if count1 == 2 or count2 == 2:
            continue
        lookupdict[name1] = count1 + 1
        lookupdict[name2] = count2 + 1
        return (name1, name2)
    return None

In [38]:
#Collapse Deque
def deque_update(pair, curarrang):
    """Fold a newly selected transition into the current set of chains.

    Parameters
    ----------
    pair : tuple
        Size-2 tuple containing the two names of the newly added transition.
    curarrang : collections.deque of collections.deque
        Current valid transition sequences (chains).  Modified in place.

    Behaviour
    ---------
    * If one name of ``pair`` is an end of a chain, the other name is added
      next to it on that end.
    * If a later chain also has one of the two names as an end, the two
      chains are merged into the earlier one and the later one is removed.
    * If no chain has either name as an end, ``pair`` becomes a new chain.
    * If the two names are the two ends of one and the same chain, the pair
      is rejected and nothing is changed: attaching it would close the chain
      into a cycle and list one name twice.

    Returns
    -------
    bool
        ``True`` when the pair was incorporated, ``False`` when it was
        rejected.
    """
    name_a, name_b = pair[0], pair[1]

    #Resolve Merge of Two Elements
    def resolve_double(first, second):
        #INVARIANT: the chain at `first` was just extended by one name of the
        #pair and the chain at `second` has that same name as one of its ends
        head, tail = curarrang[first], curarrang[second]
        #Bring the shared name to the right end of the first chain
        if head[0] in [tail[0], tail[-1]]:
            head.reverse()
        #Bring the shared name to the left end of the second chain, then
        #drop that copy so the name appears once after concatenation
        if head[-1] == tail[0]:
            tail.popleft()
        elif head[-1] == tail[-1]:
            tail.reverse()
            tail.popleft()
        head += tail
        del curarrang[second]

    def attach(chain):
        #Add the missing name of the pair next to the name already at an end
        if chain[0] == name_a:
            chain.appendleft(name_b)
        elif chain[0] == name_b:
            chain.appendleft(name_a)
        elif chain[-1] == name_a:
            chain.append(name_b)
        else:
            chain.append(name_a)

    anchor = None  #index of the chain that received the pair, once found
    i = 0
    while i < len(curarrang):
        chain = curarrang[i]
        left, right = chain[0], chain[-1]
        touches = (left == name_a or left == name_b
                   or right == name_a or right == name_b)
        if not touches:
            i += 1
            continue
        if anchor is not None:
            #Second chain touching the pair: merge it into the first one.
            #The chain at index i is removed, so i already points at the
            #next chain and must not be advanced.
            resolve_double(anchor, i)
            continue
        if (left == name_a and right == name_b) or (left == name_b and right == name_a):
            #Both names are the ends of this very chain: reject the pair.
            return False
        attach(chain)
        anchor = i
        i += 1
    if anchor is None:
        curarrang.append(collections.deque(pair))
    return True

<h2> Run Single-Image Greedy Heuristic (Layer 1)</h2>

In [8]:
#Global Variables
#nsplits: number of groups the shuffled photo indices are divided into
nsplits = 4
hdict, vdict = generate_tagdict(data)
#Prefix every key with its orientation so that one name identifies both the
#dictionary ("H" -> hdict, "V" -> vdict) and the key inside it.
#Plain list comprehensions with an explicit string dtype also work when one
#of the dictionaries is empty (np.vectorize refuses size-0 input).
hkeys = np.array(["H:"+str(s) for s in hdict.keys()], dtype=str)
vkeys = np.array(["V:"+str(s) for s in vdict.keys()], dtype=str)
totalkeys = np.append(hkeys, vkeys)

In [9]:
#Shuffle the positions of totalkeys reproducibly (fixed seed), then cut the
#shuffled positions into nsplits groups
indices = np.arange(totalkeys.shape[0])
np.random.seed(1)
np.random.shuffle(indices)
#array_split gives the same groups as split when the photo count is a multiple
#of nsplits, and near-equal groups (instead of an error) when it is not
indices = np.array_split(indices, nsplits)

In [10]:
def individual_run(keys):
    """Greedily order the photos in ``keys`` into a single chain.

    The highest-interest pairs are taken one at a time from the interest
    dictionary and folded into the growing chains until one chain holds every
    key or no usable pair is left.

    Parameters
    ----------
    keys : indexable sequence of photo names
        Names accepted by ``calculate_interest``.

    Returns
    -------
    collections.deque
        A deque holding one chain (itself a deque) in which every key appears
        exactly once.
    """
    interestdict = generate_interest_dict(keys)
    lookupdict = {}
    curarrang = collections.deque()
    # far_end[x] is the opposite end of the chain that has x as an end.
    # A name that is not in any chain yet has no entry and acts as its own
    # far end.  Entries of names that became interior are never read again
    # because getmaxfn no longer offers those names.
    far_end = {}
    i=0
    while (len(curarrang)==0) or (len(curarrang[0])<len(keys)):
        maxval = getmaxfn(interestdict, lookupdict, keys)
        if maxval is None:
            # No pair with a non-zero interest is left to add.
            break
        left, right = maxval
        left_end = far_end.get(left, left)
        right_end = far_end.get(right, right)
        if left_end == right:
            # Both names are the ends of the same chain: linking them would
            # close it into a cycle.  Give back the two link counts that
            # getmaxfn reserved and move on to the next candidate.
            lookupdict[left] -= 1
            lookupdict[right] -= 1
            continue
        far_end[left_end] = right_end
        far_end[right_end] = left_end
        deque_update(maxval, curarrang)
        i+=1
        if (i%100)==0: print(i)
    if len(curarrang)!=1 or len(curarrang[0])!=len(keys):
        # The pairs ran out before a single chain covered every key.  All
        # remaining joins have zero interest, so their order does not matter:
        # concatenate the chains and append the keys that were never linked.
        merged = collections.deque()
        for chain in curarrang:
            merged.extend(chain)
        placed = set(merged)
        merged.extend(name for name in keys if name not in placed)
        curarrang.clear()
        curarrang.append(merged)
    return curarrang

<h2> Generate Helper Functions for Replacement of Double-Vertical Images </h2>

<h2> Group Double-Vertical Images to Enhance Interest (Layer 2) </h2>

<h2> Finalize and Write Submission File </h2>