In [51]:

import numpy as np
import spookyhash
import mmh3
import os
import pandas as pd
import time
import operator
import hyperloglog
import sys

# ==========================data structure==========================
class Node():
    def __init__(self,count=0):
        self.count=count
    def add_count(self,count=1):
        self.count+=count
    def __str__(self):
        return 'count: {}'.format(self.count)
    def __repr__(self):
        return ''

class Head(Node):
    def __init__(self,count=1):
        super().__init__(count)
        self.distinct = hyperloglog.HyperLogLog(0.01)
        self.maxID=''
    def __str__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)
    def __repr__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)

class Tail(Node):
    def __init__(self,ID,count):
        self.ID = ID
        super().__init__(count)
    def __str__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)
    def __repr__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)

# ==========================UpdateSk==========================
def UpdateSk(element,Sk_head,Sk):
    e_max=get_emax()
    width,depth=get_width_depth()
    col,row=position(element)
        # col / row index of element 
    avg=0
    #print("{} send to Sk[{}][{}]".format(element,row,col))
    # ==========================update sketch==========================
    Sk_head[row].count+=element.count
    #Sk_head[row].distinct.add(element.ID)
    Sk[row][col]+=1
    Update_local_max(Sk_head[row],Sk[row],element,col)
    Update_emax(Sk_head,Sk)

'''
    print("e_max:{}".format(e_max))
    for i in range(len(Sk)):
        print("Sk[{}]:{},{}".format(i,Sk_head[i],Sk[i]))
    print('')
'''


# ==========================update local max==========================       
def Update_local_max(head_item,element_list,element,column):
    # local max need only 1 row
    #print("In Update_local_max:")
    width,depth=get_width_depth()
    if head_item.maxID=='':
        head_item.maxID=element.ID
    else:
        # local_max_col=(mmh3.hash(head_item.maxID,signed=False))% ((width*numerator)//denominator)
        local_max_col=(mmh3.hash(head_item.maxID,signed=False))% width
        if element_list[local_max_col]<element_list[column]:
            head_item.maxID=element.ID


# ==========================update e_max==========================
def Update_emax(head,sketch):
    # pass whole array
    #print("In Update_emax:")
    e_max=get_emax()
    width,depth=get_width_depth()
    for i in range(len(head)):
        if head[i].maxID=='':
            continue
        else:
            local_max_col,local_max_row=position(Tail(head[i].maxID,0))
            if sketch[local_max_row][local_max_col]>e_max.count:
                e_max.ID=head[i].maxID
                e_max.count=sketch[local_max_row][local_max_col]

# ========================== BringBack=========================
def BringBack(e_min,head,sketch):
    # bring e_max back to Top
    # e_min=e_max, e_max=Null, delete e_max.count in Sketch, send e_min into Sketch
    e_max=get_emax()
    temp=Tail(e_min.ID,e_min.count)
    e_min.ID=e_max.ID
    e_min.count=e_max.count
    DeleteSk(e_max,head,sketch)
    UpdateSk(temp,head,sketch)

# ==========================DeleteSk=========================
def DeleteSk(element,head,sketch):
    # e_max in sketch: sketch[r][c]=0, total count-=sketch[row][col]
    width,depth=get_width_depth()
    col,row=position(element)
    head[row].count-=e_max.count
        # total_count-=element.count
    sketch[row][col]=0
    head[row].maxID=''
    element.ID=""
    element.count=0
# ==========================Tools=========================    
def get_emax():
    return e_max
def get_width_depth():
    return width,depth

def find(e,element_list):
    # return index of e in element_list
    try:
        index=[ele.ID for ele in element_list].index(e.ID)
    except:
        index=-99
    return index

def position(element):
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    hash1=spookyhash.hash32(bytes(str(element.ID),encoding='utf-8'))
        # input: byte
        # output:unsigned- 32 bit int
    hash2=mmh3.hash(element.ID, signed=False)
        # input: str
        # output: unsigned- 32 bit int
    col=hash2 % ((width*numerator)//denominator)
    row=hash1 % depth
    return col,row
def get_fraction():
    return numerator,denominator    
    
# ==========================main=========================    

filename='caida_0.dat'
filepath="..\\dataset\\"
src_data=os.path.join(filepath,filename)
depth=8
width=256
size=1024
numerator=1
denominator=1

start=time.time()

Sk_head=[Head(0) for j in range(depth)]
Sketch=np.zeros((depth,width),dtype='int32')
e_max=Tail('',0)
Top=[]

item_count=1000
income=0
with open(src_data,'rb') as file:
     #以binary讀取，資料型態也為byte
    while True:
        e=str(file.read(13))
        if len(e)<13:
            print('EOF')
            break
        else:
            #item_count-=1
            #income+=1
            #print("read {}-th element:{}".format(income,e))
            item=Tail(e,1)            
            index=find(item,Top)
            if index<0:
                if len(Top)<size:
                    Top.append(item)
                else:
                    UpdateSk(item,Sk_head,Sketch)
            else:
                Top[index].count+=1
        Top.sort(key=operator.attrgetter('count'),reverse=True)
        if e_max.count>Top[-1].count:
            BringBack(Top[-1],Sk_head,Sketch)
            #print('Top after BringBack: \n\t{}'.format(Top)) 

end=time.time()
print("Execution time:{} seconds.".format(str(end-start)))
print("Total memory {} bytes".format(sys.getsizeof(Top)+Sketch.nbytes+sys.getsizeof(Sk_head[0])*depth))
print("Top:{} bytes, Sketch:{} bytes, Sketch_head:{} bytes.".format(sys.getsizeof(Top),Sketch.nbytes,sys.getsizeof(Sk_head[0])*depth))
for i in range(len(Sketch)):
    print("Sk[{}]:{},{}".format(i,Sk_head[i],Sketch[i]))
print('')

#====================result compare=============================

templi=[[i.ID,i.count] for i in Top]
df=pd.DataFrame(templi,columns=['ID', 'Count'])

path="..\\Caida0\\"
name="My_caida0"+'_'+str(size)+'_'+str(depth)+'_'+str(width)
df.to_csv(path+name+".csv",index=False)

groundtruth='caida_0_ground_truth.csv'
final=name+".csv"

# ====================precision, ARE, AAE====================
grtruth=pd.read_csv(os.path.join(path,groundtruth))
    # compare with Top-k and groundtruth[k]
My_result=pd.read_csv(os.path.join(path,final))

# precision
gt_set=set(grtruth['Element'][:size])
my_set=set(My_result['ID'])
precision=len(gt_set & my_set)/len(my_set)
    # &: set 交集運算
print("Precision: {}".format(precision))
        
# ARE in Top
gt_dict=dict(grtruth.values.tolist())
my_dict=dict(My_result.values.tolist())
distinct=len(gt_dict)
are_error=0
aae_error=0
tp=0
fp=0


for item in my_dict:
    if item in gt_dict:
        are_error+=abs(my_dict[item]-gt_dict[item])/my_dict[item]
        aae_error+=abs(my_dict[item]-gt_dict[item])
        if my_dict[item] == gt_dict[item]:
            tp+=1
        else:
            fp+=1
    else:
        item_col,item_row=position(Tail(item,1))
        are_error+=abs(Sketch[item_row][item_col]-gt_dict[item])/Sketch[item_row][item_col]
        aae_error+=abs(Sketch[item_row][item_col]-gt_dict[item])
        
ARE=are_error/distinct
AAE=aae_error/distinct
print("Find:{}, TP:{}, FP:{}".format(len(gt_set & my_set),tp,fp))
print("ARE: {}".format(ARE))
print("AAE: {}".format(AAE))

EOF
Execution time:777.9099864959717 seconds.
Total memory 17664 bytes
Top:9024 bytes, Sketch:8192 bytes, Sketch_head:448 bytes.
Sk[0]:[total count: 191069, distinct: 0, max: b'$o]\xd7\xd6!+\xfc\xe0Y\x01\xbb\x06'],[ 54 127 197 100  80  83 205  77 183  65 256 249 281 196 154 293 116 263
  32 218 195  98 176 215 210  14  43 235 152 183 238  44 206 301 277 236
  23 130 245 113 163  27 191 118 169 273 295 134 232  23 196  97  82 166
 257 132  89 237 283  45  55 128 276  84 118 253 121 234 181 111  69 249
 100 241 291 243 132 134 195 102  64  21 303 291 211 239  36 304 155 280
 296 221  88 238  60  82 182 197 273 181  39 203  98 101 130 112  58  24
  76 109 122  10 157 247 123 147 110 232 213 196 282 287 182 191  30  29
 295 127 185  61  71 112 292 303 158 224 308 231 103  74  39 260 294 288
  59 231  62 199 299 147 212 243  70 300  48 294 304 293 215 204 249  88
  21  75 189 103 176 160 220 159 232 298 207 251  72  89 249 235 248 173
 232 121 231 255 285 257 262 286  43 227 152 192  85 134

Find:864, TP:146, FP:878
ARE: 0.000672704339863737
AAE: 0.28916457552090674


In [71]:
# hyperloglog

import numpy as np
import spookyhash
import mmh3
import os
import pandas as pd
import time
import operator
import hyperloglog
import sys

# ==========================data structure==========================
class Node():
    def __init__(self,count=0):
        self.count=count
    def add_count(self,count=1):
        self.count+=count
    def __str__(self):
        return 'count: {}'.format(self.count)
    def __repr__(self):
        return ''

class Head(Node):
    def __init__(self,count=1):
        super().__init__(count)
        self.distinct = hyperloglog.HyperLogLog(0.01)
        self.maxID=''
    def __str__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)
    def __repr__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)

class Tail(Node):
    def __init__(self,ID,count):
        self.ID = ID
        super().__init__(count)
    def __str__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)
    def __repr__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)

# ==========================UpdateSk==========================
def UpdateSk(element,Sk_head,Sk):
    e_max=get_emax()
    width,depth=get_width_depth()
    col,row=position(element)
        # col / row index of element 
    avg=0
    #print("{} send to Sk[{}][{}]".format(element,row,col))
    # ==========================update sketch==========================
    Sk_head[row].count+=element.count
    Sk_head[row].distinct.add(element.ID)
    Sk[row][col]+=1
    Update_local_max(Sk_head[row],Sk[row],element,col)
    Update_emax(Sk_head,Sk)

'''
    print("e_max:{}".format(e_max))
    for i in range(len(Sk)):
        print("Sk[{}]:{},{}".format(i,Sk_head[i],Sk[i]))
    print('')
'''


# ==========================update local max==========================       
def Update_local_max(head_item,element_list,element,column):
    # local max need only 1 row
    #print("In Update_local_max:")
    width,depth=get_width_depth()
    if head_item.maxID=='':
        head_item.maxID=element.ID
    else:
        # local_max_col=(mmh3.hash(head_item.maxID,signed=False))% ((width*numerator)//denominator)
        local_max_col=(mmh3.hash(head_item.maxID,signed=False))% width
        if element_list[local_max_col]<element_list[column]:
            head_item.maxID=element.ID


# ==========================update e_max==========================
def Update_emax(head,sketch):
    # pass whole array
    #print("In Update_emax:")
    e_max=get_emax()
    width,depth=get_width_depth()
    for i in range(len(head)):
        if head[i].maxID=='':
            continue
        else:
            local_max_col,local_max_row=position(Tail(head[i].maxID,0))
            if sketch[local_max_row][local_max_col]>e_max.count:
                e_max.ID=head[i].maxID
                e_max.count=sketch[local_max_row][local_max_col]

# ========================== BringBack=========================
def BringBack(e_min,head,sketch):
    # bring e_max back to Top
    # e_min=e_max, e_max=Null, delete e_max.count in Sketch, send e_min into Sketch
    e_max=get_emax()
    temp=Tail(e_min.ID,e_min.count)
    e_min.ID=e_max.ID
    e_min.count=e_max.count
    DeleteSk(e_max,head,sketch)
    UpdateSk(temp,head,sketch)

# ==========================DeleteSk=========================
def DeleteSk(element,head,sketch):
    # e_max in sketch: sketch[r][c]=0, total count-=sketch[row][col]
    width,depth=get_width_depth()
    col,row=position(element)
    head[row].count-=e_max.count
        # total_count-=element.count
    sketch[row][col]=0
    head[row].maxID=''
    element.ID=""
    element.count=0
# ==========================Tools=========================    
def get_emax():
    return e_max
def get_width_depth():
    return width,depth

def find(e,element_list):
    # return index of e in element_list
    try:
        index=[ele.ID for ele in element_list].index(e.ID)
    except:
        index=-99
    return index

def position(element):
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    hash1=spookyhash.hash32(bytes(str(element.ID),encoding='utf-8'))
        # input: byte
        # output:unsigned- 32 bit int
    hash2=mmh3.hash(element.ID, signed=False)
        # input: str
        # output: unsigned- 32 bit int
    col=hash2 % ((width*numerator)//denominator)
    row=hash1 % depth
    return col,row
def get_fraction():
    return numerator,denominator    
    
# ==========================main=========================    

filename='caida_0.dat'
filepath="..\\dataset\\"
src_data=os.path.join(filepath,filename)
depth=8
width=128
size=512
numerator=1
denominator=1

start=time.time()

Sk_head=[Head(0) for j in range(depth)]
Sketch=np.zeros((depth,width),dtype='int32')
e_max=Tail('',0)
Top=[]

item_count=1000
income=0
with open(src_data,'rb') as file:
     #以binary讀取，資料型態也為byte
    while True:
        e=str(file.read(13))
        if len(e)<13:
            print('EOF')
            break
        else:
            #item_count-=1
            #income+=1
            #print("read {}-th element:{}".format(income,e))
            item=Tail(e,1)            
            index=find(item,Top)
            if index<0:
                if len(Top)<size:
                    Top.append(item)
                else:
                    UpdateSk(item,Sk_head,Sketch)
            else:
                Top[index].count+=1
        Top.sort(key=operator.attrgetter('count'),reverse=True)
        if e_max.count>Top[-1].count:
            BringBack(Top[-1],Sk_head,Sketch)
            #print('Top after BringBack: \n\t{}'.format(Top)) 

end=time.time()
print("Execution time:{} seconds.".format(str(end-start)))
print("Total memory {} bytes".format(sys.getsizeof(Top)+Sketch.nbytes+sys.getsizeof(Sk_head[0])*depth))
print("Top:{} bytes, Sketch:{} bytes, Sketch_head:{} bytes.".format(sys.getsizeof(Top),Sketch.nbytes,sys.getsizeof(Sk_head[0])*depth))
for i in range(len(Sketch)):
    print("Sk[{}]:{},{}".format(i,Sk_head[i],Sketch[i]))
print('')

#====================result compare=============================

templi=[[i.ID,i.count] for i in Top]
df=pd.DataFrame(templi,columns=['ID', 'Count'])

path="..\\Caida0\\"
name="My_caida0"+'_'+str(size)+'_'+str(depth)+'_'+str(width)
df.to_csv(path+name+".csv",index=False)

groundtruth='caida_0_ground_truth.csv'
final=name+".csv"

# ====================precision, ARE, AAE====================
grtruth=pd.read_csv(os.path.join(path,groundtruth))
    # compare with Top-k and groundtruth[k]
My_result=pd.read_csv(os.path.join(path,final))

# precision
gt_set=set(grtruth['Element'][:size])
my_set=set(My_result['ID'])
precision=len(gt_set & my_set)/len(my_set)
    # &: set 交集運算
print("Precision: {}".format(precision))
        
# ====================ARE/AAE in Top====================
gt_dict=dict(grtruth.values.tolist())
my_dict=dict(My_result.values.tolist())
distinct=len(gt_dict)
are_error=0
aae_error=0
tp=0
fp=0
row_distinct=[len(i.distinct) for i in Sk_head]
    # distinct eleemnts in each row of Sketch

for item in my_dict:
    # ARE/AAE in my result
    if item in gt_dict:
        # count in Top
        are_error+=abs(my_dict[item]-gt_dict[item])/my_dict[item]
            # true count- estimate count /estimate count
        aae_error+=abs(my_dict[item]-gt_dict[item])
            # true count- estimate count /estimate count
        if my_dict[item] == gt_dict[item]:
            tp+=1
        else:
            fp+=1
    else:
        # estimate count in Sketch
        item_col,item_row=position(Tail(item,1))
        count=Sketch[item_row][item_col]//(row_distinct[item_row]/width)
        if count<1:
            count=1
        are_error+=abs(count-gt_dict[item])/count
        aae_error+=abs(count-gt_dict[item])
        
ARE=are_error/distinct
AAE=aae_error/distinct
print("Find:{}, TP:{}, FP:{}".format(len(gt_set & my_set),tp,fp))
print("ARE: {}".format(ARE))
print("AAE: {}".format(AAE))

# ====================ARE/AAE of all====================
ARE_all=0
AAE_all=0
for item in gt_dict:
    if item in my_dict:
        are_error+=abs(my_dict[item]-gt_dict[item])/my_dict[item]
            # true count- estimate count /estimate count
        aae_error+=abs(my_dict[item]-gt_dict[item])
            # true count- estimate count /estimate count
    else:
        # estimate count in Sketch
        item_col,item_row=position(Tail(item,1))
        count=Sketch[item_row][item_col]*width//row_distinct[item_row]
            # avg count= (estimated_count/distinct)/width
        if count<1:
            count=1
        are_error+=abs(count-gt_dict[item])/count
        aae_error+=abs(count-gt_dict[item])
        
ARE_all=are_error/distinct
AAE_all=aae_error/distinct
print("ARE_all: {}".format(ARE_all))
print("AAE_all: {}".format(AAE_all))

EOF
Execution time:599.4665660858154 seconds.
Total memory 8816 bytes
Top:4272 bytes, Sketch:4096 bytes, Sketch_head:448 bytes.
Sk[0]:[total count: 222813, distinct: 22279, max: b'\x14\x9e\xcd\x87Aa+\xfc\xe0\xba\x01\xbb\x06'],[495 532 333  37 481 155 199 372 428 426  68 577 117 580 568 459 355 327
 443 549 372 420 361 563 299 401 523 268 142  98 196 443 569   1 135 546
 373 561 373 368 526 268 347 579  57 424 108 404  91 528 376  51  51 552
  22 585 567 524 231 391 344  38  50 152 245 457 339 420 285 512 328 296
 447 292 493  49 541 518 505 269  75 337 488 552 559 268 568 469 170 308
 534 458  91 202 565 314 115 327 185 379 171 118 509 415 320 553 382  20
  72 451 214 312 194 563 366 462 386 312 259 344 360 186 408 174  86  97
 145 181]
Sk[1]:[total count: 213197, distinct: 22258, max: b'\x06\xed9\x95\xcf3\x01`\x90\x05\x01\xbb\x06'],[225 175 145 361 376 408   6 451 219 173 354 553  99 188 332  54 162  30
 311 408 522 369 415 238 244  13 550 407 541 250 337  31 326 396 301 159
   9  29 