<i> This notebook presents a greedy heuristic approach to the Google Hash Code photo-slideshow optimization problem; more details can be found at https://www.kaggle.com/c/hashcode-photo-slideshow/data. In this approach, a heap priority queue holding roughly n×n/2 entries (one per pair of images) is created to rank the "interest metric" value of each pair. The highest-value pairs are repeatedly appended to the slideshow chain until all images have been added. After the individual horizontal and vertical images have been placed, a meta-level optimization is performed to pair up vertical images. While this may miss the global optimum, because double-vertical slides are not considered in the initial ordering, the split reduces the computational cost by an order of magnitude. </i>

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import gc
import psutil
import pickle
import time
from joblib import Parallel, delayed
import collections
import h5py
from sortedcontainers import SortedDict, SortedList

In [2]:
# Load the raw photo descriptions as bytes, one entry per line, dropping the
# first line of the file (presumably the photo-count header -- verify).
with open('d_pet_pictures.txt', 'rb') as source_file:
    data = list(source_file)[1:]

<h2> Generate Upper-Triangular "Interest Value" Matrix </h2>

In [3]:
def generate_tagdict(data):
    """Split raw photo description lines into horizontal and vertical tag sets.

    Each line is expected to follow the photo-slideshow input format
    ``<H|V> <tag count> <tag> <tag> ...``. The tag-count token is NOT a tag
    and is therefore excluded from the resulting sets (including it would make
    any two photos with the same number of tags appear to share a tag).

    Parameters
    ----------
    data : sequence of bytes or str
        One entry per photo. The position of the entry in the sequence is
        used as the photo id.

    Returns
    -------
    tuple(dict, dict)
        ``(hdict, vdict)``; each maps ``str(index)`` to the set of tag strings
        of that photo. Lines whose first token is ``"H"`` go to ``hdict``,
        every other non-blank line goes to ``vdict``.
    """
    hdict, vdict = {}, {}
    for i, line in enumerate(data):
        # Split first, decode after, so whitespace handling matches bytes.split().
        tokens = [tok.decode('utf-8') if isinstance(tok, bytes) else tok
                  for tok in line.split()]
        if not tokens:
            # Blank line (e.g. trailing newline): nothing to record, but the
            # index still advances so later photo ids stay aligned with lines.
            continue
        tags = set(tokens[2:])  # tokens[0] = orientation, tokens[1] = tag count
        if tokens[0] == "H":
            hdict[str(i)] = tags
        else:
            vdict[str(i)] = tags
    return hdict, vdict
# Parse the loaded lines into two tag dictionaries: hdict for lines whose first
# token is "H", vdict for every other line; keys are the stringified line index.
hdict, vdict = generate_tagdict(data)

In [4]:
def calculate_interest(name1, name2):
    """Return the interest score between two photos.

    Both names must have the form ``"<H|V>:<key>"``: a prefix of ``"H"``
    selects the global ``hdict``, any other prefix selects the global
    ``vdict``, and ``<key>`` indexes the selected dictionary. The looked-up
    values are tag sets.

    The score is the minimum of: the number of shared tags, the number of tags
    only in the first photo, and the number of tags only in the second photo.
    """
    def lookup_tags(name):
        # Resolve "<prefix>:<key>" to the tag set stored under <key>.
        parts = name.split(":")
        source = hdict if parts[0] == "H" else vdict
        return source[parts[1]]

    tags_a = lookup_tags(name1)
    tags_b = lookup_tags(name2)
    shared = len(tags_a & tags_b)
    only_a = len(tags_a - tags_b)
    only_b = len(tags_b - tags_a)
    return min(shared, only_a, only_b)

In [5]:
# Prefix every photo key with its orientation ("H:" / "V:") so a single name
# identifies both the dictionary and the key, then concatenate horizontals
# followed by verticals. Plain comprehensions are used instead of np.vectorize
# because np.vectorize raises ValueError on size-0 input, which would break on
# datasets that contain only horizontal or only vertical photos.
hkeys = np.array(["H:" + str(key) for key in hdict], dtype=str)
vkeys = np.array(["V:" + str(key) for key in vdict], dtype=str)
totalkeys = np.append(hkeys, vkeys)

In [6]:
#Generate interest master list
# Every unordered pair of positions (i < j) in totalkeys is bucketed under its
# interest value: interestdict maps value -> deque of (i, j) index pairs.
start = time.time()
interestdict = SortedDict()
n_images = totalkeys.shape[0]
for i, name_i in enumerate(totalkeys):
    for j in range(i + 1, n_images):
        value = calculate_interest(name_i, totalkeys[j])
        if value not in interestdict:
            interestdict[value] = collections.deque()
        interestdict[value].append((i, j))
    # Progress report every 10000 completed outer iterations.
    if (i + 1) % 10000 == 0:
        print("Calculation Checkpoint: "+str(i+1))
print(time.time()-start)

54.48229670524597


<h2> Create Greedy Algorithm Helper Functions </h2>

In [13]:
#Global Variables
# Module-level state shared by the greedy helper cells.
curarrang = collections.deque()  # chains built so far; deque_update stores one inner deque per chain
lookupdict = dict()              # per-image counter; getmaxfn skips entries whose count equals 2
imagecounter = 0                 # progress counter compared against the image total by the layer-1 loop

In [None]:
def getmaxfn():
    """Pop entries off the global ``interestvals`` until a usable one is found.

    An entry is usable when neither of its first two fields has a count of 2
    in the global ``lookupdict`` (missing keys count as 0). The third field of
    the first usable entry is returned; unusable entries are discarded.

    NOTE(review): ``interestvals`` is not defined in any visible cell of this
    notebook; it must exist as a global whose ``pop()`` returns an indexable
    with at least three fields -- confirm where it is built.

    Raises whatever ``interestvals.pop()`` raises once the container is
    exhausted without finding a usable entry.
    """
    # Iterative rather than recursive: a long run of stale entries would
    # otherwise exceed Python's recursion limit (RecursionError).
    while True:
        curmax = interestvals.pop()
        if (lookupdict.get(curmax[0], 0) != 2) and (lookupdict.get(curmax[1], 0) != 2):
            return curmax[2]

In [15]:
#Collapse Deque
def deque_update(pair):
    """Insert an adjacent ``pair`` of image ids into the global ``curarrang``.

    ``curarrang`` is a deque of chains (each chain is itself a deque of ids).
    For every chain, its two end elements are compared with the two members of
    ``pair``:

    * first chain with a matching end: the other member of the pair is
      attached at that end;
    * a second chain with a matching end: it is merged into the first one
      (dropping the duplicated id at the junction) and removed from
      ``curarrang``;
    * no chain matches: ``pair`` is appended as a new two-element chain.

    Parameters
    ----------
    pair : sequence of two ids
        The two images to be made neighbours.

    Returns
    -------
    None. ``curarrang`` is modified in place.
    """
    #Resolve Merge of Two Elements
    def resolve_double(first, second):
        #INVARIANT: The first and second will have an overlapping value as the first was updated but not second
        head_chain, tail_chain = curarrang[first], curarrang[second]
        # Orient the first chain so that the shared id sits at its right end.
        if head_chain[0] in [tail_chain[0], tail_chain[-1]]:
            head_chain.reverse()
        # Orient the second chain so the shared id is at its left end, then
        # drop that duplicate before concatenating.
        if head_chain[-1] == tail_chain[0]:
            tail_chain.popleft()
        elif head_chain[-1] == tail_chain[-1]:
            tail_chain.reverse()
            tail_chain.popleft()
        head_chain += tail_chain
        del curarrang[second]

    first_match = None
    for i in range(len(curarrang)):
        chain = curarrang[i]
        # Precedence of the checks: head vs pair[0], head vs pair[1],
        # tail vs pair[0], tail vs pair[1].
        if chain[0] == pair[0]:
            attach, other = chain.appendleft, pair[1]
        elif chain[0] == pair[1]:
            attach, other = chain.appendleft, pair[0]
        elif chain[-1] == pair[0]:
            attach, other = chain.append, pair[1]
        elif chain[-1] == pair[1]:
            attach, other = chain.append, pair[0]
        else:
            continue
        if first_match is None:
            attach(other)
            first_match = i
        else:
            resolve_double(first_match, i)
            # The merge deleted a chain, so the indices of this loop are stale;
            # continuing would run past the end of curarrang (IndexError).
            return
    if first_match is None:
        curarrang.append(collections.deque(pair))

<h2> Run Single-Image Greedy Heuristic (Layer 1)</h2>

In [None]:
# FIXME: the body of the layer-1 greedy loop has not been written yet. The
# previous placeholder body (`pass`) never advanced imagecounter, so running
# this cell spun forever and blocked "Restart & Run All". Fail fast instead
# until the loop is implemented.
# NOTE(review): 90000 is presumably the number of images in the dataset --
# confirm and derive it from the data rather than hard-coding it.
while imagecounter<90000:
    raise NotImplementedError(
        "Layer-1 greedy loop is not implemented: imagecounter is never advanced"
    )

<h2> Generate Replacement Values for Double-Vertical Images </h2>

<h2> Group Double-Vertical Images to Enhance Interest (Layer 2) </h2>