In [12]:

import numpy as np
import spookyhash
import mmh3
import os
import pandas as pd
import time
import operator
import hyperloglog
import sys

# ==========================data structure==========================
class Node():
    def __init__(self,count=0):
        self.count=count
    def add_count(self,count=1):
        self.count+=count
    def __str__(self):
        return 'count: {}'.format(self.count)
    def __repr__(self):
        return ''

class Head(Node):
    def __init__(self,count=1):
        super().__init__(count)
        self.distinct = hyperloglog.HyperLogLog(0.01)
        self.maxID=''
    def __str__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)
    def __repr__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)

class Tail(Node):
    def __init__(self,ID,count):
        self.ID = ID
        super().__init__(count)
    def __str__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)
    def __repr__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)

# ==========================UpdateSk==========================
def UpdateSk(element,Sk_head,Sk):
    e_max=get_emax()
    width,depth=get_width_depth()
    col,row=position(element)
        # col / row index of element 
    avg=0
    #print("{} send to Sk[{}][{}]".format(element,row,col))
    # ==========================update sketch==========================
    Sk_head[row].count+=element.count
    #Sk_head[row].distinct.add(element.ID)
    Sk[row][col]+=1
    Update_local_max(Sk_head[row],Sk[row],element,col)
    Update_emax(Sk_head,Sk)

'''
    print("e_max:{}".format(e_max))
    for i in range(len(Sk)):
        print("Sk[{}]:{},{}".format(i,Sk_head[i],Sk[i]))
    print('')
'''


# ==========================update local max==========================       
def Update_local_max(head_item,element_list,element,column):
    # local max need only 1 row
    #print("In Update_local_max:")
    width,depth=get_width_depth()
    if head_item.maxID=='':
        head_item.maxID=element.ID
    else:
        # local_max_col=(mmh3.hash(head_item.maxID,signed=False))% ((width*numerator)//denominator)
        local_max_col=(mmh3.hash(head_item.maxID,signed=False))% width
        if element_list[local_max_col]<element_list[column]:
            head_item.maxID=element.ID


# ==========================update e_max==========================
def Update_emax(head,sketch):
    # pass whole array
    #print("In Update_emax:")
    e_max=get_emax()
    width,depth=get_width_depth()
    for i in range(len(head)):
        if head[i].maxID=='':
            continue
        else:
            local_max_col,local_max_row=position(Tail(head[i].maxID,0))
            if sketch[local_max_row][local_max_col]>e_max.count:
                e_max.ID=head[i].maxID
                e_max.count=sketch[local_max_row][local_max_col]

# ========================== BringBack=========================
def BringBack(e_min,head,sketch):
    # bring e_max back to Top
    # e_min=e_max, e_max=Null, delete e_max.count in Sketch, send e_min into Sketch
    e_max=get_emax()
    temp=Tail(e_min.ID,e_min.count)
    e_min.ID=e_max.ID
    e_min.count=e_max.count
    DeleteSk(e_max,head,sketch)
    UpdateSk(temp,head,sketch)

# ==========================DeleteSk=========================
def DeleteSk(element,head,sketch):
    # e_max in sketch: sketch[r][c]=0, total count-=sketch[row][col]
    width,depth=get_width_depth()
    col,row=position(element)
    head[row].count-=e_max.count
        # total_count-=element.count
    sketch[row][col]=0
    head[row].maxID=''
    element.ID=""
    element.count=0
# ==========================Tools=========================    
def get_emax():
    return e_max
def get_width_depth():
    return width,depth

def find(e,element_list):
    # return index of e in element_list
    try:
        index=[ele.ID for ele in element_list].index(e.ID)
    except:
        index=-99
    return index

def position(element):
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    hash1=spookyhash.hash32(bytes(str(element.ID),encoding='utf-8'))
        # input: byte
        # output:unsigned- 32 bit int
    hash2=mmh3.hash(element.ID, signed=False)
        # input: str
        # output: unsigned- 32 bit int
    col=hash2 % ((width*numerator)//denominator)
    row=hash1 % depth
    return col,row
def get_fraction():
    return numerator,denominator    
    
# ==========================main=========================    

filename='caida_0.dat'
filepath="..\\dataset\\"
src_data=os.path.join(filepath,filename)
depth=8
width=256
size=1024
numerator=1
denominator=1

start=time.time()

Sk_head=[Head(0) for j in range(depth)]
Sketch=np.zeros((depth,width),dtype='int32')
e_max=Tail('',0)
Top=[]

item_count=1000
income=0
with open(src_data,'rb') as file:
     #以binary讀取，資料型態也為byte
    while True:
        e=str(file.read(13))
        if len(e)<13:
            print('EOF')
            break
        else:
            #item_count-=1
            #income+=1
            #print("read {}-th element:{}".format(income,e))
            item=Tail(e,1)            
            index=find(item,Top)
            if index<0:
                if len(Top)<size:
                    Top.append(item)
                else:
                    UpdateSk(item,Sk_head,Sketch)
            else:
                Top[index].count+=1
        Top.sort(key=operator.attrgetter('count'),reverse=True)
        if e_max.count>Top[-1].count:
            BringBack(Top[-1],Sk_head,Sketch)
            #print('Top after BringBack: \n\t{}'.format(Top)) 

end=time.time()
print("Execution time:{} seconds.".format(str(end-start)))
print("Total memory {} bytes".format(sys.getsizeof(Top)+Sketch.nbytes+sys.getsizeof(Sk_head[0])*depth))
print("Top:{} bytes, Sketch:{} bytes, Sketch_head:{} bytes.".format(sys.getsizeof(Top),Sketch.nbytes,sys.getsizeof(Sk_head[0])*depth))
print("e_max:{}".format(e_max))
for i in range(len(Sketch)):
    print("Sk[{}]:{},{}".format(i,Sk_head[i],Sketch[i]))
print('')


templi=[]
for i in Top:
    templi.append([i.ID,i.count])

df=pd.DataFrame(templi,columns=['ID', 'Count'])
name="My_caida0"+'_'+str(size)+'_'+str(depth)+'_'+str(width)
df.to_csv("..\\Caida0\\"+name+".csv",index=False)
df.head(20)


#====================result compare=============================
path="..\\Caida0\\"
groundtruth='caida_0_ground_truth.csv'
final=name+".csv"

grtruth=pd.read_csv(os.path.join(path,groundtruth),nrows=size)
# compare with Top-k and groundtruth[k]
My_result1=pd.read_csv(os.path.join(path,final))

gli = grtruth.values.tolist()
truthdict=dict(gli)
li1= My_result1.values.tolist()

tp=0
fp=0
find=0
compare=[]
error=0

for item in li1:
    if item[0] in truthdict:
        find+=1
        compare.append([item[0],item[1],abs(truthdict[item[0]]-item[1])])
        # ID, estimated, abs error
        if item[1]==truthdict[item[0]]:
            tp+=1
        else:
            fp+=1

print("Find:{},TP:{},FP:{}".format(find,tp,fp))
print("Precision: {}".format(find/size))
xdf=pd.DataFrame(compare,columns=['ID', 'Count','Error'])
xdf

EOF
Execution time:949.2914493083954 seconds.
Total memory 17664 bytes
Top:9024 bytes, Sketch:8192 bytes, Sketch_head:448 bytes.
e_max:[ID: b'lw\x05t2\xab\x01`\x94\xfb\x01\xbb\x06', count: 310]
Sk[0]:[total count: 191069, distinct: 0, max: b'$o]\xd7\xd6!+\xfc\xe0Y\x01\xbb\x06'],[ 54 127 197 100  80  83 205  77 183  65 256 249 281 196 154 293 116 263
  32 218 195  98 176 215 210  14  43 235 152 183 238  44 206 301 277 236
  23 130 245 113 163  27 191 118 169 273 295 134 232  23 196  97  82 166
 257 132  89 237 283  45  55 128 276  84 118 253 121 234 181 111  69 249
 100 241 291 243 132 134 195 102  64  21 303 291 211 239  36 304 155 280
 296 221  88 238  60  82 182 197 273 181  39 203  98 101 130 112  58  24
  76 109 122  10 157 247 123 147 110 232 213 196 282 287 182 191  30  29
 295 127 185  61  71 112 292 303 158 224 308 231 103  74  39 260 294 288
  59 231  62 199 299 147 212 243  70 300  48 294 304 293 215 204 249  88
  21  75 189 103 176 160 220 159 232 298 207 251  72  89 249 235

Unnamed: 0,ID,Count,Error
0,b'\xd2\x1d9\xfa\x00P\x0f\x13I\x9a\xf89\x06',16889,0
1,b'E.\x1c)\xa4\xb2\x89\xb6\x06\xba\xd2\xad\x06',11647,0
2,b'V+x\xe8\x00Po\xcd\xe4\xde\xccw\x06',10113,0
3,b'\xb7\xf6h\xd0\xc1\x8b+\xfc\xe1\x08\x00P\x06',9000,7
4,b'\x01l|\xff\x8f\r\x01l\xc5\xc8\x01\xbb\x06',8419,1
5,b'\x99\xc1\xe9Z\x00P+\xfc\xe0=\xe7%\x06',6730,136
6,b'c\x0f\xb2\xd4\x80\xd2\x01`\x8d\x12\x01\xbb\x06',6256,12
7,b'\x01l|\xec\xcc\x1b\x01l\xc5\xc8\x01\xbb\x06',5420,0
8,b'E\x0c\xd5\xf7\xd7(\xdd.\xdc\xec\x00P\x06',5194,0
9,b'E\x0c\xd5\xf5\xa4\x14\xdd.\xdc\xec\x00P\x06',5045,1


In [11]:
# ARE

def position(element):
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    hash1=spookyhash.hash32(bytes(str(element.ID),encoding='utf-8'))
        # input: byte
        # output:unsigned- 32 bit int
    hash2=mmh3.hash(element.ID, signed=False)
        # input: str
        # output: unsigned- 32 bit int
    col=hash2 % ((width*numerator)//denominator)
    row=hash1 % depth
    return col,row


distinct=177335
path="..\\Caida0\\"
groundtruth='caida_0_ground_truth.csv'
name="My_caida0"+'_'+str(size)+'_'+str(depth)+'_'+str(width)
final=name+".csv"
grtruth=pd.read_csv(os.path.join(path,groundtruth))
My_result1=pd.read_csv(os.path.join(path,final))

gli = grtruth.values.tolist()
itemdict=dict(My_result1.values.tolist())

ARE=0
error_sum=0
count=0
for item in gli[:100]:
    count+=1
    if item[0] in itemdict:
        item_col,item_row= position(Tail(item[0],1))
        print("ID:{},True Count:{}, Estimated: {}".format(item[0],item[1],itemdict[item[0]]))
        error_sum+=abs(item[1]-itemdict[item[0]])/item[1]
    else:
        item_col,item_row= position(Tail(item[0],1))
        print("ID:{},True Count:{}, Estimated: {}".format(item[0],item[1],Sketch[item_row][item_col]))
        error_sum+=abs(Sketch[item_row][item_col]-item[1])/item[1]
ARE=error_sum/distinct
print(ARE,count)



239.57323157082087 177335
