In [2]:

import numpy as np
import spookyhash
import mmh3
import os
import pandas as pd
import time
import operator
import hyperloglog
import sys

# ==========================data structure==========================
class Node():
    def __init__(self,count=0):
        self.count=count
    def add_count(self,count=1):
        self.count+=count
    def __str__(self):
        return 'count: {}'.format(self.count)
    def __repr__(self):
        return ''

class Head(Node):
    def __init__(self,count=1):
        super().__init__(count)
        self.distinct = hyperloglog.HyperLogLog(0.01)
        self.maxID=''
    def __str__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)
    def __repr__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)

class Tail(Node):
    def __init__(self,ID,count):
        self.ID = ID
        super().__init__(count)
    def __str__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)
    def __repr__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)

# ==========================UpdateSk==========================
def UpdateSk(element,Sk_head,Sk):
    e_max=get_emax()
    width,depth=get_width_depth()
    col,row=position(element)
        # col / row index of element 
    avg=0
    #print("{} send to Sk[{}][{}]".format(element,row,col))
    # ==========================update sketch==========================
    Sk_head[row].count+=element.count
    #Sk_head[row].distinct.add(element.ID)
    Sk[row][col]+=1
    Update_local_max(Sk_head[row],Sk[row],element,col)
    Update_emax(Sk_head,Sk)

'''
    print("e_max:{}".format(e_max))
    for i in range(len(Sk)):
        print("Sk[{}]:{},{}".format(i,Sk_head[i],Sk[i]))
    print('')
'''


# ==========================update local max==========================       
def Update_local_max(head_item,element_list,element,column):
    # local max need only 1 row
    #print("In Update_local_max:")
    width,depth=get_width_depth()
    if head_item.maxID=='':
        head_item.maxID=element.ID
    else:
        # local_max_col=(mmh3.hash(head_item.maxID,signed=False))% ((width*numerator)//denominator)
        local_max_col=(mmh3.hash(head_item.maxID,signed=False))% width
        if element_list[local_max_col]<element_list[column]:
            head_item.maxID=element.ID


# ==========================update e_max==========================
def Update_emax(head,sketch):
    # pass whole array
    #print("In Update_emax:")
    e_max=get_emax()
    width,depth=get_width_depth()
    for i in range(len(head)):
        if head[i].maxID=='':
            continue
        else:
            local_max_col,local_max_row=position(Tail(head[i].maxID,0))
            if sketch[local_max_row][local_max_col]>e_max.count:
                e_max.ID=head[i].maxID
                e_max.count=sketch[local_max_row][local_max_col]

# ========================== BringBack=========================
def BringBack(e_min,head,sketch):
    # bring e_max back to Top
    # e_min=e_max, e_max=Null, delete e_max.count in Sketch, send e_min into Sketch
    e_max=get_emax()
    temp=Tail(e_min.ID,e_min.count)
    e_min.ID=e_max.ID
    e_min.count=e_max.count
    DeleteSk(e_max,head,sketch)
    UpdateSk(temp,head,sketch)

# ==========================DeleteSk=========================
def DeleteSk(element,head,sketch):
    # e_max in sketch: sketch[r][c]=0, total count-=sketch[row][col]
    width,depth=get_width_depth()
    col,row=position(element)
    head[row].count-=e_max.count
        # total_count-=element.count
    sketch[row][col]=0
    head[row].maxID=''
    element.ID=""
    element.count=0
# ==========================Tools=========================    
def get_emax():
    return e_max
def get_width_depth():
    return width,depth

def find(e,element_list):
    # return index of e in element_list
    try:
        index=[ele.ID for ele in element_list].index(e.ID)
    except:
        index=-99
    return index

def position(element):
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    hash1=spookyhash.hash32(bytes(str(element.ID),encoding='utf-8'))
        # input: byte
        # output:unsigned- 32 bit int
    hash2=mmh3.hash(element.ID, signed=False)
        # input: str
        # output: unsigned- 32 bit int
    col=hash2 % ((width*numerator)//denominator)
    row=hash1 % depth
    return col,row
def get_fraction():
    return numerator,denominator    
    
# ==========================main=========================    

filename='caida_0.dat'
filepath="..\\dataset\\"
src_data=os.path.join(filepath,filename)
depth=4
width=128
size=512
numerator=1
denominator=1

start=time.time()

Sk_head=[Head(0) for j in range(depth)]
Sketch=np.zeros((depth,width),dtype='int32')
e_max=Tail('',0)
Top=[]

item_count=1000
income=0
with open(src_data,'rb') as file:
     #以binary讀取，資料型態也為byte
    while True:
        e=str(file.read(13))
        if len(e)<13:
            print('EOF')
            break
        else:
            #item_count-=1
            #income+=1
            #print("read {}-th element:{}".format(income,e))
            item=Tail(e,1)            
            index=find(item,Top)
            if index<0:
                if len(Top)<size:
                    Top.append(item)
                else:
                    UpdateSk(item,Sk_head,Sketch)
            else:
                Top[index].count+=1
        Top.sort(key=operator.attrgetter('count'),reverse=True)
        if e_max.count>Top[-1].count:
            BringBack(Top[-1],Sk_head,Sketch)
            #print('Top after BringBack: \n\t{}'.format(Top)) 

end=time.time()
print("Execution time:{} seconds.".format(str(end-start)))
print("Total memory {} bytes".format(sys.getsizeof(Top)+Sketch.nbytes+sys.getsizeof(Sk_head[0])*depth))
print("Top:{} bytes, Sketch:{} bytes, Sketch_head:{} bytes.".format(sys.getsizeof(Top),Sketch.nbytes,sys.getsizeof(Sk_head[0])*depth))
for i in range(len(Sketch)):
    print("Sk[{}]:{},{}".format(i,Sk_head[i],Sketch[i]))
print('')

#====================result compare=============================

templi=[[i.ID,i.count] for i in Top]
df=pd.DataFrame(templi,columns=['ID', 'Count'])
name="My_caida0"+'_'+str(size)+'_'+str(depth)+'_'+str(width)
df.to_csv(path+name+".csv",index=False)

path="..\\Caida0\\"
groundtruth='caida_0_ground_truth.csv'
final=name+".csv"

# ====================precision, ARE, AAE====================
grtruth=pd.read_csv(os.path.join(path,groundtruth))
    # compare with Top-k and groundtruth[k]
My_result=pd.read_csv(os.path.join(path,final))

# precision
gt_set=set(grtruth['Element'][:size])
my_set=set(My_result['ID'])
precision=len(gt_set & my_set)/len(my_set)
    # &: set 交集運算
print("Precision: {}".format(precision))
        
# ARE in Top
gt_dict=dict(grtruth.values.tolist())
my_dict=dict(My_result.values.tolist())
distinct=len(gt_dict)
are_error=0
aae_error=0
tp=0
fp=0


for item in my_dict:
    if item in gt_dict:
        are_error+=abs(my_dict[item]-gt_dict[item])/my_dict[item]
        aae_error+=abs(my_dict[item]-gt_dict[item])
        if my_dict[item] == gt_dict[item]:
            tp+=1
        else:
            fp+=1
    else:
        item_col,item_row=position(Tail(item,1))
        are_error+=abs(Sketch[item_row][item_col]-gt_dict[item])/Sketch[item_row][item_col]
        aae_error+=abs(Sketch[item_row][item_col]-gt_dict[item])
        
ARE=are_error/distinct
AAE=aae_error/distinct
print("Find:{}, TP:{}, FP:{}".format(len(gt_set & my_set),tp,fp))
print("ARE: {}".format(ARE))
print("AAE: {}".format(AAE))

EOF
Execution time:374.683128118515 seconds.
Total memory 6544 bytes
Top:4272 bytes, Sketch:2048 bytes, Sketch_head:224 bytes.
Sk[0]:[total count: 432364, distinct: 0, max: b'l\xd7\xbd~\x8b\xad+\xfc\xe2\x04\x00P\x06'],[ 78 288 393 103 174 244 196 512 284 547 372 328 150 262   8  24 311 419
 306 541 169 231 211  53 108 239 151 561 497 281 559 390 281 237 501 437
 104  16 167 358 240 304 210  15 200 352 105 239 405 264  58 344  61 360
 400 309 211 247 355 498 222 310 368 413  80  16 408 294 205 301 344 371
 192  29  53 530 218  51 163 402 331 469 134 301 338  45 329 561 259  22
 519 560 368  73 119 188  68 465 196 396 398 119 467 260 317 419 121 467
 342   4  80  16 356  24 216 142 427 224 356 268 228 239 263 191 212 543
 423 413]
Sk[1]:[total count: 434829, distinct: 0, max: b'\xca\xbb\x1d\x1f\x83\x8b\x01`\xf54\x01\xbb\x06'],[546 244 440 197  62  15 163  66 307 563  36 257 436  78 273 562 258 196
  76 564 102  34 255 479  12 336 455 304 159 298 540 372  89  72 501 408
 364 252 451 230 4

In [5]:
# hyperloglog

import numpy as np
import spookyhash
import mmh3
import os
import pandas as pd
import time
import operator
import hyperloglog
import sys

# ==========================data structure==========================
class Node():
    def __init__(self,count=0):
        self.count=count
    def add_count(self,count=1):
        self.count+=count
    def __str__(self):
        return 'count: {}'.format(self.count)
    def __repr__(self):
        return ''

class Head(Node):
    def __init__(self,count=1):
        super().__init__(count)
        self.distinct = hyperloglog.HyperLogLog(0.01)
        self.maxID=''
    def __str__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)
    def __repr__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)

class Tail(Node):
    def __init__(self,ID,count):
        self.ID = ID
        super().__init__(count)
    def __str__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)
    def __repr__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)

# ==========================UpdateSk==========================
def UpdateSk(element,Sk_head,Sk):
    e_max=get_emax()
    width,depth=get_width_depth()
    col,row=position(element)
        # col / row index of element 
    avg=0
    #print("{} send to Sk[{}][{}]".format(element,row,col))
    # ==========================update sketch==========================
    Sk_head[row].count+=element.count
    Sk_head[row].distinct.add(element.ID)
    Sk[row][col]+=1
    Update_local_max(Sk_head[row],Sk[row],element,col)
    Update_emax(Sk_head,Sk)

'''
    print("e_max:{}".format(e_max))
    for i in range(len(Sk)):
        print("Sk[{}]:{},{}".format(i,Sk_head[i],Sk[i]))
    print('')
'''


# ==========================update local max==========================       
def Update_local_max(head_item,element_list,element,column):
    # local max need only 1 row
    #print("In Update_local_max:")
    width,depth=get_width_depth()
    if head_item.maxID=='':
        head_item.maxID=element.ID
    else:
        # local_max_col=(mmh3.hash(head_item.maxID,signed=False))% ((width*numerator)//denominator)
        local_max_col=(mmh3.hash(head_item.maxID,signed=False))% width
        if element_list[local_max_col]<element_list[column]:
            head_item.maxID=element.ID


# ==========================update e_max==========================
def Update_emax(head,sketch):
    # pass whole array
    #print("In Update_emax:")
    e_max=get_emax()
    width,depth=get_width_depth()
    for i in range(len(head)):
        if head[i].maxID=='':
            continue
        else:
            local_max_col,local_max_row=position(Tail(head[i].maxID,0))
            if sketch[local_max_row][local_max_col]>e_max.count:
                e_max.ID=head[i].maxID
                e_max.count=sketch[local_max_row][local_max_col]    


# ========================== BringBack=========================
def BringBack(e_min,head,sketch):
    # bring e_max back to Top
    # e_min=e_max, e_max=Null, delete e_max.count in Sketch, send e_min into Sketch
    e_max=get_emax()
    temp=Tail(e_min.ID,e_min.count)
    e_min.ID=e_max.ID
    e_min.count=e_max.count
    DeleteSk(e_max,head,sketch)
    UpdateSk(temp,head,sketch)

# ==========================DeleteSk=========================
def DeleteSk(element,head,sketch):
    # e_max in sketch: sketch[r][c]=0, total count-=sketch[row][col]
    width,depth=get_width_depth()
    col,row=position(element)
    head[row].count-=e_max.count
        # total_count-=element.count
    sketch[row][col]=0
    head[row].maxID=''
    element.ID=""
    element.count=0
# ==========================Tools=========================    
def get_emax():
    return e_max
def get_width_depth():
    return width,depth

def find(e,element_list):
    # return index of e in element_list
    try:
        index=[ele.ID for ele in element_list].index(e.ID)
    except:
        index=-99
    return index

def position(element):
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    hash1=spookyhash.hash32(bytes(str(element.ID),encoding='utf-8'))
        # input: byte
        # output:unsigned- 32 bit int
    hash2=mmh3.hash(element.ID, signed=False)
        # input: str
        # output: unsigned- 32 bit int
    col=hash2 % ((width*numerator)//denominator)
    row=hash1 % depth
    return col,row
def get_fraction():
    return numerator,denominator    
    
# ==========================main=========================    

filename='caida_0.dat'
filepath="..\\dataset\\"
src_data=os.path.join(filepath,filename)
depth=4
width=256
size=256
numerator=1
denominator=1

start=time.time()

Sk_head=[Head(0) for j in range(depth)]
Sketch=np.zeros((depth,width),dtype='int32')
e_max=Tail('',0)
Top=[]

item_count=1000
income=0
with open(src_data,'rb') as file:
     #以binary讀取，資料型態也為byte
    while True:
        e=str(file.read(13))
        if len(e)<13:
            print('EOF')
            break
        else:
            #item_count-=1
            #income+=1
            #print("read {}-th element:{}".format(income,e))
            item=Tail(e,1)            
            index=find(item,Top)
            if index<0:
                if len(Top)<size:
                    Top.append(item)
                else:
                    UpdateSk(item,Sk_head,Sketch)
            else:
                Top[index].count+=1
        Top.sort(key=operator.attrgetter('count'),reverse=True)
        if e_max.count>Top[-1].count:
            BringBack(Top[-1],Sk_head,Sketch)
            #print('Top after BringBack: \n\t{}'.format(Top)) 

end=time.time()
print("Execution time:{} seconds.".format(str(end-start)))
print("Total memory {} bytes".format(sys.getsizeof(Top)+Sketch.nbytes+sys.getsizeof(Sk_head[0])*depth))
print("Top:{} bytes, Sketch:{} bytes, Sketch_head:{} bytes.".format(sys.getsizeof(Top),Sketch.nbytes,sys.getsizeof(Sk_head[0])*depth))
for i in range(len(Sketch)):
    print("Sk[{}]:{},{}".format(i,Sk_head[i],Sketch[i]))
print('')

#====================result compare=============================

templi=[[i.ID,i.count] for i in Top]
df=pd.DataFrame(templi,columns=['ID', 'Count'])

path="..\\Caida0\\"
name="My_caida0"+'_'+str(size)+'_'+str(depth)+'_'+str(width)
df.to_csv(path+name+".csv",index=False)

groundtruth='caida_0_ground_truth.csv'
final=name+".csv"

# ====================precision, ARE, AAE====================
grtruth=pd.read_csv(os.path.join(path,groundtruth))
    # compare with Top-k and groundtruth[k]
My_result=pd.read_csv(os.path.join(path,final))

# precision
gt_set=set(grtruth['Element'][:size])
my_set=set(My_result['ID'])
precision=len(gt_set & my_set)/len(my_set)
    # &: set 交集運算
print("Precision: {}".format(precision))
        
# ====================ARE/AAE in Top====================
gt_dict=dict(grtruth.values.tolist())
my_dict=dict(My_result.values.tolist())
distinct=len(gt_dict)
are_error=0
aae_error=0
tp=0
fp=0
row_distinct=[len(i.distinct) for i in Sk_head]
    # distinct eleemnts in each row of Sketch

for item in my_dict:
    # ARE/AAE in my result
    if item in gt_dict:
        # count in Top
        are_error+=abs(my_dict[item]-gt_dict[item])/my_dict[item]
            # true count- estimate count /estimate count
        aae_error+=abs(my_dict[item]-gt_dict[item])
            # true count- estimate count /estimate count
        if my_dict[item] == gt_dict[item]:
            tp+=1
        else:
            fp+=1
    else:
        # estimate count in Sketch
        item_col,item_row=position(Tail(item,1))
        count=Sketch[item_row][item_col]//(row_distinct[item_row]/width)
        if count<1:
            count=1
        are_error+=abs(count-gt_dict[item])/count
        aae_error+=abs(count-gt_dict[item])
        
ARE=are_error/distinct
AAE=aae_error/distinct
print("Find:{}, TP:{}, FP:{}".format(len(gt_set & my_set),tp,fp))
print("ARE: {}".format(ARE))
print("AAE: {}".format(AAE))

# ====================ARE/AAE of all====================
ARE_all=0
AAE_all=0
for item in gt_dict:
    if item in my_dict:
        are_error+=abs(my_dict[item]-gt_dict[item])/my_dict[item]
            # true count- estimate count /estimate count
        aae_error+=abs(my_dict[item]-gt_dict[item])
            # true count- estimate count /estimate count
    else:
        # estimate count in Sketch
        item_col,item_row=position(Tail(item,1))
        count=Sketch[item_row][item_col]*width//row_distinct[item_row]
            # avg count= (estimated_count/distinct)/width
        if count<1:
            count=1
        are_error+=abs(count-gt_dict[item])/count
        aae_error+=abs(count-gt_dict[item])
        
ARE_all=are_error/distinct
AAE_all=aae_error/distinct
print("ARE_all: {}".format(ARE_all))
print("AAE_all: {}".format(AAE_all))

EOF
Execution time:228.19424557685852 seconds.
Total memory 6536 bytes
Top:2216 bytes, Sketch:4096 bytes, Sketch_head:224 bytes.
Sk[0]:[total count: 494181, distinct: 44905, max: b'c\x01\xfb\xde\x04_o,\xe2h\xf9K\x11'],[314  74 863 372 222 990 315 136 489 266 294 280 956 396 193 917 168 929
 966 994 852 476 693 881 970 599 399 508   2 775 517 778 734 622 400 984
 478 116 522 966 775 963 341 553 474 510 703 922  44 943 474 464 449 787
 765 986 937 331 868 445  35 973 204 182 324 365 419 365 940 642 218 472
 752 912 747 620 647 471 122 119 843 273 922 626 539 927 283 391 568 749
 699 256 860 618 632 951 275 945 962  71 101  99 749 299 707 190 934 800
 469 429 844 274 396 973 173 897 862 578 599 818 674   9 602 414  72 493
 508 236 818 869  27 737 715 425 131 978 386 365 513 976 245 692 341 934
 922 486 267  34 233 748 788 825 744 156 881 871 714 531 779 829 164 348
 747 599 298 779 318 747 589 434  32 528 939 885 411 974 122 828 900 272
  97 432 952 452 523 337 386 570 745 706 439 269 480