# Binary (Min) Heap

#### https://runestone.academy/runestone/books/published/pythonds/Trees/PriorityQueueswithBinaryHeaps.html


#### Basic operations of binary heap:
1. BinaryHeap(): create a new, empty heap.
2. insert(k): adds a new item to heap.
3. find_min(): return the minimum key element, leaving item in the heap.
4. del_min(): return the item with minimum key value, removing the item from the heap.
5. is_empty(): return true if the heap is empty, false otherwise.
6. size(): return the number of items in the heap.
7. bulid_heap(list): builds a new heap from a list of keys.


In [32]:
# define class BinHeap()


# ==========================data structure==========================
class Node():
    def __init__(self,count=0):
        self.count=count
    def add_count(self,count=1):
        self.count+=count
    def __str__(self):
        return 'count: {}'.format(self.count)
    def __repr__(self):
        return ''

class Head(Node):
    def __init__(self,count=1):
        super().__init__(count)
        self.distinct = hyperloglog.HyperLogLog(0.01)
        self.maxID=''
    def __str__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)
    def __repr__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)

class Tail(Node):
    def __init__(self,ID,count):
        self.ID = ID
        super().__init__(count)
    def __str__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)
    def __repr__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)

# ==========================Heap==========================

class BinHeap():
    def __init__(self):
        self.heap_list=[Tail('',sys.maxsize)]
        self.current_size=0
    def perc_up(self,current_size):
        i=current_size
        while i //2 >0:
            if self.heap_list[i].count> self.heap_list[i//2].count:
                self.heap_list[i].count, self.heap_list[i//2].count=self.heap_list[i//2].count,self.heap_list[i].count             
            i=i//2
    def insert(self,k):
        self.heap_list.append(k)
        self.current_size+=1
        self.perc_up(self.current_size)
    
    def __str__(self):
        return str(self.heap_list)
       
    def max_child(self,i):
        # index從1起算
        if i*2+1>self.current_size:
            return i*2
            # 若只剩1個child就只能return 它
        else:
            if self.heap_list[i*2]> self.heap_list[i*2+1]:
                # 左大於右
                return i*2
            else:
                return i*2+1
    def perc_down(self,i):
        # delete 才需要從root往下調整
        while (i*2)<=self.current_size:
            maxchild=self.max_child(i)
            if self.heap_list[i].count<self.heap_list[maxchild].count:
                self.heap_list[i].count, self.heap_list[maxchild].count=self.heap_list[maxchild].count,self.heap_list[i].count
            i=maxchild
    def del_max(self):
        return_value=self.heap_list[1]
            # return root
        self.heap_list[1]=self.heap_list[self.current_size]
            # swap(root, last)
        self.current_size-=1
        self.heap_list.pop()
        #print(self.heap_list)
            # remove list[last element]
        self.perc_down(1)
            # 從root往下調整
        return return_value

# ==========================UpdateSk==========================
def UpdateSk(element,Sk_head,Sk):
    e_max=get_emax()
    width,depth=get_width_depth()
    col,row=position(element)
        # col / row index of element 
    avg=0
    # print("{} send to Sk[{}][{}]".format(element,row,col))
    # ==========================update sketch==========================
    Sk_head[row].count+=element.count
    Sk_head[row].distinct.add(element.ID)
    Sk[row][col]+=1
    Update_local_max(Sk_head[row],Sk[row],element,col)
    Update_emax(Sk_head,Sk)

    '''
    print("e_max:{}".format(e_max))
    for i in range(len(Sk)):
        print("Sk[{}]:{},{}".format(i,Sk_head[i],Sk[i]))
    print('')
    '''    

# ==========================update local max==========================       
def Update_local_max(head_item,element_list,element,column):
    # local max need only 1 row
    #print("In Update_local_max:")
    width,depth=get_width_depth()
    if head_item.maxID=='':
        head_item.maxID=element.ID
    else:
        # local_max_col=(mmh3.hash(head_item.maxID,signed=False))% ((width*numerator)//denominator)
        local_max_col=(mmh3.hash(head_item.maxID,signed=False))% width
        if element_list[local_max_col]<element_list[column]:
            head_item.maxID=element.ID

# ==========================update e_max==========================
def Update_emax(head,sketch):
    # pass whole array
    #print("In Update_emax:")
    e_max=get_emax()
    width,depth=get_width_depth()
    for i in range(len(head)):
        if head[i].maxID=='':
            continue
        else:
            local_max_col,local_max_row=position(Tail(head[i].maxID,0))
            if sketch[local_max_row][local_max_col]>e_max.count:
                e_max.ID=head[i].maxID
                e_max.count=sketch[local_max_row][local_max_col]

# ========================== BringBack=========================
def BringBack(e_min,head,sketch):
    # bring e_max back to Top
    # e_min=e_max, e_max=Null, delete e_max.count in Sketch, send e_min into Sketch
    e_max=get_emax()
    temp=Tail(e_min.ID,e_min.count)
    e_min.ID=e_max.ID
    e_min.count=e_max.count
    DeleteSk(e_max,head,sketch)
    UpdateSk(temp,head,sketch)

# ==========================DeleteSk=========================
def DeleteSk(element,head,sketch):
    # e_max in sketch: sketch[r][c]=0, total count-=sketch[row][col]
    width,depth=get_width_depth()
    col,row=position(element)
    head[row].count-=e_max.count
        # total_count-=element.count
    sketch[row][col]=0
    head[row].maxID=''
    element.ID=""
    element.count=0
# ==========================Tools=========================    
def get_emax():
    return e_max
def get_width_depth():
    return width,depth

def find(e,element_list):
    # return index of e in element_list
    try:
        index=[ele.ID for ele in element_list].index(e.ID)+1
    except:
        index=-99
    return index

def position(element):
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    hash1=spookyhash.hash32(bytes(str(element.ID),encoding='utf-8'))
        # input: byte
        # output:unsigned- 32 bit int
    hash2=mmh3.hash(element.ID, signed=False)
        # input: str
        # output: unsigned- 32 bit int
    col=hash2 % ((width*numerator)//denominator)
    row=hash1 % depth
    return col,row
def get_fraction():
    return numerator,denominator    
# ==========================main==========================    
import numpy as np
import spookyhash
import mmh3
import os
import pandas as pd
import time
import operator
import hyperloglog
import sys
import random

filename='caida_0.dat'
filepath="dataset\\"
src_data=os.path.join(filepath,filename)

depth=4
width=128
size=512
Sk_head=[Head(0) for j in range(depth)]
Sketch=np.zeros((depth,width),dtype='int32')
e_max=Tail('',0)
numerator=1
denominator=1
Top=BinHeap()

start=time.time()

item_count=30
income=0
with open(src_data,'rb') as file:
     #以binary讀取，資料型態也為byte
    while True:
        e=str(file.read(13))
        if len(e)<13:
            print('EOF')
            break
        else:
            # item_count-=1
            # income+=1
            # print("read {}-th element:{}".format(income,e))
            item=Tail(e,1)
            index=find(item,Top.heap_list[1:])     
            if index<0:
                if Top.current_size<size:
                    Top.insert(item)
                else:
                    UpdateSk(item,Sk_head,Sketch)
            else:
                Top.heap_list[index].count+=1
                Top.perc_up(Top.current_size)
        if e_max.count>Top.heap_list[Top.current_size].count:
            BringBack(Top.heap_list[Top.current_size],Sk_head,Sketch)
            Top.perc_up(Top.current_size)
            # print("Top after BringBack:\n{}".format(Top))

end=time.time()
print("Execution time:{} seconds.".format(str(end-start)))
print("Total memory {} bytes".format(sys.getsizeof(Top)+Sketch.nbytes+sys.getsizeof(Sk_head[0])*depth))
print("Top:{} bytes, Sketch:{} bytes, Sketch_head:{} bytes.".format(sys.getsizeof(Top),Sketch.nbytes,sys.getsizeof(Sk_head[0])*depth))
for i in range(len(Sketch)):
    print("Sk[{}]:{},{}".format(i,Sk_head[i],Sketch[i]))
print('')



#====================result compare=============================
templi=[[i.ID,i.count] for i in Top.heap_list]
df=pd.DataFrame(templi,columns=['ID', 'Count'])

path="Caida0\\"
name="My_caida0"+'_'+str(size)+'_'+str(depth)+'_'+str(width)
df.to_csv(path+name+".csv",index=False)

groundtruth='caida_0_ground_truth.csv'
final=name+".csv"

# ====================precision, ARE, AAE====================
grtruth=pd.read_csv(os.path.join(path,groundtruth))
    # compare with Top-k and groundtruth[k]
My_result=pd.read_csv(os.path.join(path,final))

# precision
gt_set=set(grtruth['Element'][:size])
my_set=set(My_result['ID'])
precision=len(gt_set & my_set)/len(my_set)
    # &: set 交集運算
print("Precision: {}".format(precision))
        
# ====================ARE/AAE in Top====================
gt_dict=dict(grtruth.values.tolist())
my_dict=dict(My_result.values.tolist())
distinct=len(gt_dict)
are_error=0
aae_error=0
tp=0
fp=0
row_distinct=[len(i.distinct) for i in Sk_head]
    # distinct eleemnts in each row of Sketch

for item in my_dict:
    # ARE/AAE in my result
    if item in gt_dict:
        # count in Top
        are_error+=abs(my_dict[item]-gt_dict[item])/my_dict[item]
            # true count- estimate count /estimate count
        aae_error+=abs(my_dict[item]-gt_dict[item])
            # true count- estimate count /estimate count
        if my_dict[item] == gt_dict[item]:
            tp+=1
        else:
            fp+=1
    else:
        # estimate count in Sketch
        item_col,item_row=position(Tail(item,1))
        count=Sketch[item_row][item_col]//(row_distinct[item_row]/width)
        if count<1:
            count=1
        are_error+=abs(count-gt_dict[item])/count
        aae_error+=abs(count-gt_dict[item])
        
ARE=are_error/distinct
AAE=aae_error/distinct
print("Find:{}, TP:{}, FP:{}".format(len(gt_set & my_set),tp,fp))
print("ARE: {}".format(ARE))
print("AAE: {}".format(AAE))

# ====================ARE/AAE of all====================
ARE_all=0
AAE_all=0
for item in gt_dict:
    if item in my_dict:
        are_error+=abs(my_dict[item]-gt_dict[item])/my_dict[item]
            # true count- estimate count /estimate count
        aae_error+=abs(my_dict[item]-gt_dict[item])
            # true count- estimate count /estimate count
    else:
        # estimate count in Sketch
        item_col,item_row=position(Tail(item,1))
        count=Sketch[item_row][item_col]*width//row_distinct[item_row]
            # avg count= (estimated_count/distinct)/width
        if count<1:
            count=1
        are_error+=abs(count-gt_dict[item])/count
        aae_error+=abs(count-gt_dict[item])
        
ARE_all=are_error/distinct
AAE_all=aae_error/distinct
print("ARE_all: {}".format(ARE_all))
print("AAE_all: {}".format(AAE_all))

EOF
Execution time:406.26619267463684 seconds.
Total memory 2328 bytes
Top:56 bytes, Sketch:2048 bytes, Sketch_head:224 bytes.
Sk[0]:[total count: 573827, distinct: 44783, max: b'9\x7f\xa8\xe5\xb3~\x01`\x91\x08\x01\xbb\x06'],[1458 1760  627  850  237  380 1686 1128 2307  622  626 1047 2255 1697
  731   88  901 2342  968  650 1648   71 2141 2245 2077  314  268 1788
 1655 1491  871  640 1795   62 1345 1367  520 2314 1360 1674 2207 2228
  936 1214 1111 1076 1283   56  642  995  967 1941 1604 1866 2265 2308
  813 1192  413 2043 1378  509 1776  587  745 1684 2283  574 1760 1837
 1273  905 1130 1129 1716 1234 1342  176 2113  860 2340 1972 1866 1396
 1995 1082 2128  315 1833 1902  480  876 1811 2026  292 1594 2169 1716
  362 1848  179 1901   62 1552  735 2009  725  255 1912 1759  647  503
 2052 2165 2327 1311 1103 1253  256 2183 2203  476  261 1916 1485 2067
 1167 1531]
Sk[1]:[total count: 562406, distinct: 44750, max: b'\xd2\xdc\x94\xc1\xb5\x86\x01`\xc1\xaa\x01\xbb\x06'],[ 658 1680  494 1584

TypeError: a bytes-like object is required, not 'float'