<i> This notebook presents a greedy heuristic algorithm approach to the Google HashCode Optimization Problem, more details for which can be found at https://www.kaggle.com/c/hashcode-photo-slideshow/data. Within this approach, a (nxn/2,1) heap priority queue is created to calculate "interest metric" values between each pair of images. The highest-value pairs are continuously appended to the string until all elements have been added. After individual horizontal and vertical images are added, a meta-level optimization is performed to club vertical images together. While this may possibly miss global minima as double-vertical slides are not considered in the initial ordering, the split reduces the computational order of magnitude. </i>

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import gc
import psutil
import pickle
import time
from joblib import Parallel, delayed
import collections
import h5py
import gc
from sortedcontainers import SortedDict, SortedList

In [2]:
with open('d_pet_pictures.txt', 'rb') as f:
    data = f.readlines()[1:]

<h2> Create Greedy Algorithm Helper Functions </h2>

In [3]:
def generate_tagdict(data):
    hdict, vdict = {}, {}
    for i in range(len(data)):
        row = np.vectorize(lambda s: s.decode('utf-8'))(data[i].split())
        if row[0]=="H": hdict[str(i)] = set(row[1:])
        else: vdict[str(i)] = set(row[1:])
    return hdict, vdict

In [4]:
#GLOBAL VARIABLES INVARIANT
#Needs hdict and vdict, dictionaries of image tag sets
def calculate_interest(name1, name2):
    #Invariant: names are of the form (H/V):K where H/V indicates the dictionary in question and K is the key
    #The hdict and vdict dictionaries contain sets
    namedisamb = (lambda s: hdict[s.split(":")[1]] if s.split(":")[0]=="H" else 
                  vdict[s.split(":")[1]])
    tag1, tag2 = namedisamb(name1), namedisamb(name2)
    return min(len(tag1&tag2), len(tag1-tag2), len(tag2-tag1))

In [5]:
#Generate interest master list for subset of arrays
#Memory limitations prohibit a 90000x90000 master matrix
def generate_interest_dict(keys):
    start = time.time()
    checkpoints = np.arange(0, keys.shape[0], keys.shape[0]/5)[1:]
    interestdict = SortedDict()
    for i in range(keys.shape[0]):
        for j in range(i+1, keys.shape[0]):
            value = calculate_interest(keys[i], keys[j])
            if value==0: continue
            if interestdict.get(value, False)==False:
                interestdict[value] = collections.deque([(i,j)])
            else:
                interestdict[value].append((i,j))
        if (i+1) in checkpoints: 
            print("Calculation Checkpoint of "+str(i+1)+":"+str(time.time()-start))
    print(time.time()-start)
    return interestdict

In [6]:
#PARAMETERS:
#interestdict: a sorted dictionaries of deques containing interest values
#lookupdict: a dictionary of occurences
#keys: list of names of keys
def getmaxfn(interestdict, lookupdict, keys):
    curcount1, curcount2 = 2, 2
    while (curcount1==2) or (curcount2==2):
        if len(interestdict)==0: return None, False
        curval = interestdict.keys()[-1]
        curmax = interestdict[curval].pop()
        curcount1, curcount2 = lookupdict.get(keys[curmax[0]],0), lookupdict.get(keys[curmax[1]],0)
        if not interestdict[curval]:
            del interestdict[curval]
    lookupdict[keys[curmax[0]]] = curcount1+1
    lookupdict[keys[curmax[1]]] = curcount2+1
    return (keys[curmax[0]], keys[curmax[1]]), True

In [7]:
#Collapse Deque
#PARAMETERS:
#pair: size-2 tuple containing the new added transition
#curarrang: a deque of deques containing current valid transition sequences
#lookupdict: a dictionary recording occurences
def deque_update(pair, curarrang, lookupdict):
    #Resolve Merge of Two Elements
    def resolve_double(first, second):
        #INVARIANT: The first and second will have an overlapping value as the first was updated but not second
        if curarrang[first][0] in [curarrang[second][0], curarrang[second][-1]]:
            curarrang[first].reverse()
        if curarrang[first][-1]==curarrang[second][0]:
            curarrang[second].popleft()
        elif curarrang[first][-1]==curarrang[second][-1]:
            curarrang[second].reverse()
            curarrang[second].popleft()
        curarrang[first]+=curarrang[second]
        del curarrang[second]
    match, lastm = False, 0
    i=0
    while i<len(curarrang):
        if curarrang[i][0]==pair[0]:
            #Prevent circular deques
            if curarrang[i][-1]==pair[1]: 
                lookupdict[pair[0]]-=1
                lookupdict[pair[1]]-=1
                return
            if not match:
                curarrang[i].appendleft(pair[1])
                lastm =  i
                match = True
                i+=1
            else:
                resolve_double(lastm, i)
        elif curarrang[i][0]==pair[1]:
            if curarrang[i][-1]==pair[0]: 
                lookupdict[pair[0]]-=1
                lookupdict[pair[1]]-=1
                return
            if not match:
                curarrang[i].appendleft(pair[0])
                lastm =  i
                match = True
                i+=1
            else:
                resolve_double(lastm, i)
        elif curarrang[i][-1]==pair[0]:
            if curarrang[i][0]==pair[1]: 
                lookupdict[pair[0]]-=1
                lookupdict[pair[1]]-=1
                return
            if not match:
                curarrang[i].append(pair[1])
                lastm =  i
                match = True
                i+=1
            else:
                resolve_double(lastm, i)
        elif curarrang[i][-1]==pair[1]:
            if curarrang[i][-1]==pair[0]:
                lookupdict[pair[0]]-=1
                lookupdict[pair[1]]-=1
                return
            if not match:
                curarrang[i].append(pair[0])
                lastm =  i
                match = True
                i+=1
            else:
                resolve_double(lastm, i)
        else:
            i+=1
    if match: return
    curarrang.append(collections.deque(pair))

<h2> Run Single-Image Greedy Heuristic (Layer 1)</h2>

In [8]:
#Global Variables
nsplits = 4
hdict, vdict = generate_tagdict(data)
hkeys = np.vectorize(lambda s: "H:"+str(s))(np.array(list(hdict.keys())))
vkeys = np.vectorize(lambda s: "V:"+str(s))(np.array(list(vdict.keys())))
totalkeys = np.append(hkeys, vkeys) 

In [9]:
indices = np.arange(totalkeys.shape[0])
np.random.seed(1)
np.random.shuffle(indices)
indices = np.split(indices, nsplits)

In [10]:
def individual_run(keys):
    interestdict = generate_interest_dict(keys)
    lookupdict = {}
    curarrang = collections.deque()
    while (len(curarrang)==0) or (len(curarrang[0])<len(keys)):
        maxval, valid = getmaxfn(interestdict, lookupdict, keys)
        #In case all positive interest metrics have been exhausted
        if not valid: break
        deque_update(maxval, curarrang, lookupdict)
    finalarrang = collections.deque()
    for arrang in curarrang:
        finalarrang+=arrang
    del curarrang
    remimg = set(keys)-set(finalarrang)
    for img in remimg:
        finalarrang.append(img)
    return finalarrang

In [11]:
finalresults = collections.deque()
for mask in indices:
    start = time.time()
    arrang = individual_run(totalkeys[mask])
    gc.collect()
    finalresults.append(arrang)
    print("Mask Optimized in "+str(time.time()-start)+" seconds")
pickle.dump(finalresults, open("./layer1arrang.pkl", "wb"))

Calculation Checkpoint of 4500:424.9527807235718
Calculation Checkpoint of 9000:822.3509747982025
Calculation Checkpoint of 13500:1063.8601069450378
Calculation Checkpoint of 18000:1207.6297268867493
1255.2410807609558
Mask Optimized in 2244.443123102188 seconds
Calculation Checkpoint of 4500:444.54129004478455
Calculation Checkpoint of 9000:794.7886159420013
Calculation Checkpoint of 13500:1043.4002268314362
Calculation Checkpoint of 18000:1190.953207731247
1237.5998117923737
Mask Optimized in 2285.985775947571 seconds
Calculation Checkpoint of 4500:464.16116309165955
Calculation Checkpoint of 9000:815.0599539279938
Calculation Checkpoint of 13500:1061.4540979862213
Calculation Checkpoint of 18000:1215.1491250991821
1261.6071569919586
Mask Optimized in 2252.183938741684 seconds
Calculation Checkpoint of 4500:421.3487648963928
Calculation Checkpoint of 9000:752.308002948761
Calculation Checkpoint of 13500:993.1060469150543
Calculation Checkpoint of 18000:1132.5027568340302
1177.8050417

<h2> Generate Helper Functions for Replacement of Double-Vertical Images

In [11]:
#INVARIANT: Every element of arrang is a 1-element or 2-element tuple
def interest_change(index1, index2, arrang):
    def important_index(index, arrang):
        if (index==0): vals = [arrang[1]]
        elif (index==(len(arrang)-1)): vals = [arrang[index-1]]
        else: vals = [arrang[index-1],arrang[index+1]]
        return vals
    def total_interest(images, tup):
        nameret = (lambda s: hdict[s.split(":")[1]] if s.split(":")[0]=="H" else 
                  vdict[s.split(":")[1]])
        interest = 0
        main = nameret(tup[0]) if len(tup)==1 else nameret(tup[0]).union(nameret(tup[1]))
        for pair in images:
            curset = nameret(pair[0]) if len(pair)==1 else nameret(pair[0]).union(nameret(pair[1]))
            interest+=min(len(curset&main), len(curset-main), len(main-curset))
        return interest
    desvals, sourcevals = important_index(index1, arrang), important_index(index2, arrang)
    if index1==index2-1: sourcevals = sourcevals[1:]
    elif index1==index2+1: sourcevals = sourcevals[:-1]
    preinterest=total_interest(desvals, arrang[index1]) + total_interest(sourcevals, arrang[index2])
    if index1==index2-1:
        desvals=desvals[:-1]+sourcevals
        sourcevals = []
    elif index1==index2+1:
        desvals=desvals[1:]+sourcevals
        sourcevals = []
    postinterest=total_interest(desvals, (arrang[index1][0], arrang[index2][0]))
    if len(sourcevals)>1:
        postinterest+=total_interest(sourcevals[1:], sourcevals[0])
    return postinterest-preinterest

In [12]:
def best_swap(vertindices, mainindex, arrang):
    bestindex, bestinterest, firstdes = -1, None, True
    for index in vertindices:
        if index==mainindex: continue
        value1 = interest_change(mainindex, index, arrang)
        value2 = interest_change(index, mainindex, arrang)
        if (not bestinterest) or (value1>bestinterest):
            bestindex = index
            bestinterest = value1
            firstdes = True
        if (not bestinterest) or (value2>bestinterest):
            bestindex = index
            bestinterest = value2
            firstdes = False
    return bestindex, bestinterest, firstdes

<h2> Group Double-Vertical Images to Enhance Interest (Layer 2) </h2>

In [13]:
finalresults = pickle.load(open("./layer1arrang.pkl", "rb"))
vertindices = {i:np.where(np.vectorize(lambda s: 'V' in s)(np.array(finalresults[i])))[0]
               for i in range(len(finalresults))}
#Uniform data structure (to tuple) for efficient functions
for arr in finalresults:
    for i in range(len(arr)):
        arr[i] = (arr[i],)

In [14]:
def grouping_run(arrang, indexset):
    i=0
    while i<5:
        if len(indexset)<2: return
        if (len(arrang[i])==1) and ("V" in arrang[i][0]):
            bestindex, bestinterest, firstdes = best_swap(indexset, i, arrang)
            indexset = indexset[(indexset!=i)&(indexset!=bestindex)]
            if firstdes:
                arrang[i] = (arrang[i][0], arrang[bestindex][0])
                del arrang[bestindex]
                if bestindex>i: i+=1
                indexset[indexset>bestindex]-=1
            else:
                arrang[bestindex] = (arrang[i][0], arrang[bestindex][0])
                del arrang[i]
                indexset[indexset>i]-=1
        else:
            i+=1
        if (i%100==0): print(i)

In [15]:
%time grouping_run(finalresults[0], vertindices[0])

0
0
0
0
0
0
0
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1200
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
2900
2900
2900
2900
2900
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4200
4300
4400
4500
4600
4700
4800
4900
5000
5000
5100
5200
5200
5300
5400
5500
5600
5600
5600
5700
5800
5900
6000
6100
6200
6200
6300
6400
6400
6500
6500
6600
6700
6800
6800
6900
7000
7100
7100
7200
7300
7300
7400
7500
7500
7500
7500
7600
7700
7700
7700
7800
7900
8000
8100
8200
8300
8400
8400
8400
8400
8500
8600
8700
8700
8700
8700
8700
8800
8800
8900
8900
8900
9000
9100
9200
9200
9200
9200
9300
9400
9500
9500
9600
9700
9700
9700
9700
9700
9800
CPU times: user 1h 3min 47s, sys: 9.51 s, total: 1h 3min 57s
Wall time: 1h 4min 9s


In [16]:
def interest(tuple1, tuple2):
    set1 = set()
    nameret = (lambda s: hdict[s.split(":")[1]] if s.split(":")[0]=="H" else 
                  vdict[s.split(":")[1]])
    for elem in tuple1:
        set1 = set1.union(nameret(elem))
    set2 = set()
    for elem in tuple2:
        set2 = set2.union(nameret(elem))
    return min(len(set1&set2), len(set1-set2), len(set2-set1))

In [17]:
val = 0
for i in range(len(finalresults[0])-1):
    val+=interest(finalresults[0][i], finalresults[0][i+1])

In [16]:
finalresults[0]

deque([('H:67493',),
       ('V:35218', 'V:74121'),
       ('V:89668', 'H:32588'),
       ('V:491', 'V:39989'),
       ('V:39791', 'H:27551'),
       ('H:58230',),
       ('V:16591', 'H:41507'),
       ('V:1079', 'H:5464'),
       ('H:64761',),
       ('V:64041', 'V:61619'),
       ('H:84260',),
       ('H:18943',),
       ('V:41277', 'V:20218'),
       ('H:39427',),
       ('H:73396',),
       ('H:68206',),
       ('H:35982',),
       ('H:89586',),
       ('V:30134', 'H:73634'),
       ('V:47757', 'V:31152'),
       ('H:25227',),
       ('V:19312', 'V:46831'),
       ('V:2178', 'V:47888'),
       ('V:81651', 'V:77304'),
       ('H:32359',),
       ('H:50422',),
       ('H:87227',),
       ('H:89582',),
       ('H:37653',),
       ('V:50233', 'V:27977'),
       ('H:68814',),
       ('H:9806',),
       ('H:76870',),
       ('V:1883', 'H:89557'),
       ('V:33796', 'V:27691'),
       ('H:46835',),
       ('V:4294', 'H:57022'),
       ('V:38144', 'V:81977'),
       ('V:34148', 'V:67972'),

In [20]:
sum([(len(r)==1) and (r[0][0]=="V") for r in finalresults[0]])

8998

<h2> Finalize and Write Submission File </h2>

In [16]:
a = np.array([1,2,3])

In [21]:
sum([len(r)==2 for r in finalresults[0]])

100