## My algo
- hyperloglog for estimate distinct elements in Other
    - Top 512 + Sketch 4*256, hash indexing
        - Execution time:631.2319478988647 seconds.
        - Total memory 8576 bytes
        - Top:4272 bytes, Sketch:4208 bytes, Sketch_head:96 bytes.
        - Find:358,TP:55,FP:303
    - Top 512 + Sketch 4*128, hash indexing
        - Execution time:587.4053885936737 seconds.
        - Total memory 6528 bytes
        - Top:4272 bytes, Sketch:2160 bytes, Sketch_head:96 bytes.
        - Find:356,TP:44,FP:312

In [None]:
import numpy as np
import spookyhash
import mmh3
import os
import pandas as pd
import time
import operator
import hyperloglog
import sys

# ==========================data structure==========================
class Node():
    def __init__(self,count=0):
        self.count=count
    def add_count(self,count=1):
        self.count+=count
    def __str__(self):
        return 'count: {}'.format(self.count)
    def __repr__(self):
        return ''

class Head(Node):
    def __init__(self,count=1):
        super().__init__(count)
        self.distinct = hyperloglog.HyperLogLog(0.01)
        self.maxID=''
    def __str__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)
    def __repr__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)

class Tail(Node):
    def __init__(self,ID,count):
        self.ID = ID
        super().__init__(count)
    def __str__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)
    def __repr__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)

# ==========================UpdateSk==========================
def UpdateSk(element,Sk_head,Sk):
    e_max=get_emax()
    width,depth=get_width_depth()
    col,row=position(element)
        # col / row index of element 
    avg=0
    #print("{} send to Sk[{}][{}]".format(element,row,col))
    # ==========================update sketch==========================
    Sk_head[row].count+=element.count
    if col<width:
        # e in Sketch
        Sk[row][col]+=1
    else:
        # e in Other
        Sk_head[row].distinct.add(element.ID)
    Update_local_max(Sk_head[row],Sk[row],element,col)
    Update_emax(Sk_head,Sk)

'''
    print("e_max:{}".format(e_max))
    for i in range(len(Sk)):
        print("Sk[{}]:{},{}".format(i,Sk_head[i],Sk[i]))
    print('')
'''


# ==========================update local max==========================       
def Update_local_max(head_item,element_list,element,column):
    # local max need only 1 row
    # print("In Update_local_max:")
    # numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    if head_item.maxID=='':
        head_item.maxID=element.ID
    else:
        local_max_col=(mmh3.hash(head_item.maxID,signed=False))% width
        #local_max_col=(mmh3.hash(head_item.maxID,signed=False))% ((width*numerator)//denominator)
        if local_max_col<width:
            # local max in Sketch
            if column<width:
                # e in Sketch
                if element_list[column]>element_list[local_max_col]:
                       head_item.maxID=element.ID
            else:
                # e in Other
                count_sum=sum(i for i in element_list)
                avg=(head_item.count-count_sum)//len(head_item.distinct)
                if avg>element_list[local_max_col]:
                     head_item.maxID=element.ID
        else:
            # local max in Other
            count_sum=sum(i for i in element_list)
            avg=(head_item.count-count_sum)//len(head_item.distinct)
            if column<width:
                # e in Sketch
                if column<width:
                    if element_list[column]>avg:
                           head_item.maxID=element.ID

# ==========================update e_max==========================
def Update_emax(head,sketch):
    # pass whole array
    #print("In Update_emax:")
    e_max=get_emax()
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    for i in range(len(head)):
        if head[i].maxID=='':
            continue
        else:
            local_max_col,local_max_row=position(Tail(head[i].maxID,1))
            if local_max_col<width:
                # local max in Sketch
                if sketch[local_max_row][local_max_col]>e_max.count:
                    e_max.ID=head[i].maxID
                    e_max.count=sketch[local_max_row][local_max_col]
            else:
                pass
                '''
                # local max in Other
                count_sum=sum(j for j in sketch[i])
                avg=int((head[i].count-count_sum)//(width*((numerator/denominator)-1)))
                if avg>e_max.count:
                    e_max.ID=head[i].maxID
                    e_max.count=avg
                '''
# ========================== BringBack=========================
def BringBack(e_min,head,sketch):
    # bring e_max back to Top
    # e_min=e_max, e_max=Null, delete e_max.count in Sketch, send e_min into Sketch
    e_max=get_emax()
    temp=Tail(e_min.ID,e_min.count)
    e_min.ID=e_max.ID
    e_min.count=e_max.count
    DeleteSk(e_max,head,sketch)
    UpdateSk(temp,head,sketch)

# ==========================DeleteSk=========================
def DeleteSk(element,head,sketch):
    # e_max in sketch: sketch[r][c]=0, total count-=sketch[row][col]
    # e_max in Other: total count-=e_max.count
    width,depth=get_width_depth()
    col,row=position(element)
    head[row].count-=e_max.count
        # total_count-=element.count

    if col<width:
        # e_max in sketch, need to config sk[r][c]=0
        sketch[row][col]=0
        head[row].maxID=''
    element.ID=""
    element.count=0
# ==========================Tools=========================    
def get_emax():
    return e_max
def get_width_depth():
    return width,depth

def find(e,element_list):
    # return index of e in element_list
    try:
        index=[ele.ID for ele in element_list].index(e.ID)
    except:
        index=-99
    return index

def position(element):
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    hash1=spookyhash.hash32(bytes(str(element.ID),encoding='utf-8'))
        # input: byte
        # output:unsigned- 32 bit int
    hash2=mmh3.hash(element.ID, signed=False)
        # input: str
        # output: unsigned- 32 bit int
    col=hash2 % ((width*numerator)//denominator)
    row=hash1 % depth
    return col,row
def get_fraction():
    return numerator,denominator    
    
# ==========================main=========================    

filename='kosarak.dat'
filepath="..\\dataset\\"
src_data=os.path.join(filepath,filename)
depth=4
width=128
size=1024
numerator=10
denominator=10

start=time.time()

Sk_head=[Head(0) for j in range(depth)]
Sketch=np.zeros((depth,width),dtype='int32')
e_max=Tail('',0)
Top=[]

item_count=10000
income=0
with open(src_data,'r') as file:
    while True:
        e=file.readline().strip('\n')
        if not e:
            break
        else:
            #item_count-=1
            #income+=1
            #print("read {}-th element:{}".format(income,e))
            item=Tail(e,1)            
            index=find(item,Top)
            if index<0:
                if len(Top)<size:
                    Top.append(item)
                else:
                    UpdateSk(item,Sk_head,Sketch)
            else:
                Top[index].count+=1
        Top.sort(key=operator.attrgetter('count'),reverse=True)
        if e_max.count>Top[-1].count:
            BringBack(Top[-1],Sk_head,Sketch)
            #print('Top after BringBack: \n\t{}'.format(Top)) 

end=time.time()
print("Execution time:{} seconds.".format(str(end-start)))
print("Total memory {} bytes".format(sys.getsizeof(Top)+sys.getsizeof(Sketch)+sys.getsizeof(Sk_head)*depth))
print("Top:{} bytes, Sketch:{} bytes, Sketch_head:{} bytes.".format(sys.getsizeof(Top),sys.getsizeof(Sketch),sys.getsizeof(Sk_head)*depth))
print("TOP[20]:\n{}".format(Top[:20]))
print("e_max:{}".format(e_max))
for i in range(len(Sketch)):
    print("Sk[{}]:{},{}".format(i,Sk_head[i],Sketch[i]))
print('')


templi=[]
for i in Top:
    templi.append([i.ID,i.count])

df=pd.DataFrame(templi,columns=['ID', 'Count'])
df.to_csv("..\\result\\kosarak\\My_kosarak_hyperloglog"+str(size)+".csv",index=False)
df.head(20)

#====================result compare=============================
path='..\\result\\kosarak'
groundtruth='kosarak_ground_truth.csv'
final="My_kosarak_hyperloglog"+str(size)+".csv"

grtruth=pd.read_csv(os.path.join(path,groundtruth))
My_result1=pd.read_csv(os.path.join(path,final))

gli = grtruth.values.tolist()
li1= My_result1.values.tolist()

tp=0
fp=0
find=0
err=[]
error=0
for item in li1:
    for element in gli[:size]:
        if item[0]==element[0]:
            # print("{},{} vs. {},{}".format(item[0],item[1],element[0],element[1]))
            find+=1
            if item[1]==element[1]:
                tp+=1
            else:
                fp+=1
                error+=abs(item[1]-element[1])/item[1]
print("Find:{},TP:{},FP:{}".format(find,tp,fp))
print("ARE:{}".format(error/606770))


## My algo
- sum in Other/width*ratio
    - Top 512 + Sketch 4*256, hash indexing
        - Execution time:223.44352793693542 seconds.
        - Total memory 8576 bytes
        - Top:4272 bytes, Sketch:4208 bytes, Sketch_head:96 bytes.
        - Find:171,TP:47,FP:124
    - Top 512 + Sketch 4*128, hash indexing
        - Execution time:184.30011439323425 seconds.
        - Total memory 6528 bytes
        - Top:4272 bytes, Sketch:2160 bytes, Sketch_head:96 bytes.
        - Find:154,TP:37,FP:117
    - Top 512 + Sketch 128*4, hash indexing
        - Execution time:552.0288758277893 seconds.
        - Total memory 7680 bytes
        - Top:4272 bytes, Sketch:2160 bytes, Sketch_head:1248 bytes.
        - Find:367,TP:37,FP:330
    - Top 1024 + Sketch 128*4, hash indexing 

In [None]:

import numpy as np
import spookyhash
import mmh3
import os
import pandas as pd
import time
import operator
import hyperloglog
import sys

# ==========================data structure==========================
class Node():
    def __init__(self,count=0):
        self.count=count
    def add_count(self,count=1):
        self.count+=count
    def __str__(self):
        return 'count: {}'.format(self.count)
    def __repr__(self):
        return ''

class Head(Node):
    def __init__(self,count=1):
        super().__init__(count)
        self.distinct = hyperloglog.HyperLogLog(0.01)
        self.maxID=''
    def __str__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)
    def __repr__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)

class Tail(Node):
    def __init__(self,ID,count):
        self.ID = ID
        super().__init__(count)
    def __str__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)
    def __repr__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)

# ==========================UpdateSk==========================
def UpdateSk(element,Sk_head,Sk):
    e_max=get_emax()
    width,depth=get_width_depth()
    col,row=position(element)
        # col / row index of element 
    avg=0
    #print("{} send to Sk[{}][{}]".format(element,row,col))
    # ==========================update sketch==========================
    Sk_head[row].count+=element.count
    if col<width:
        # e in Sketch
        Sk[row][col]+=1
    else:
        # e in Other
        pass
    Update_local_max(Sk_head[row],Sk[row],element,col)
    Update_emax(Sk_head,Sk)

'''
    print("e_max:{}".format(e_max))
    for i in range(len(Sk)):
        print("Sk[{}]:{},{}".format(i,Sk_head[i],Sk[i]))
    print('')
'''


# ==========================update local max==========================       
def Update_local_max(head_item,element_list,element,column):
    # local max need only 1 row
    #print("In Update_local_max:")
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    if head_item.maxID=='':
        head_item.maxID=element.ID
    else:
        # local_max_col=(mmh3.hash(head_item.maxID,signed=False))% ((width*numerator)//denominator)
        local_max_col=(mmh3.hash(head_item.maxID,signed=False))% width
        if local_max_col<width:
            # local max in Sketch
            if column<width:
                # e in Sketch
                if element_list[column]>element_list[local_max_col]:
                       head_item.maxID=element.ID
            else:
                # e in Other
                count_sum=sum(i for i in element_list)
                # avg=(head_item.count-count_sum)//(width*((numerator/denominator)-1))
                avg=(head_item.count-count_sum)//width
                if avg>element_list[local_max_col]:
                     head_item.maxID=element.ID
        else:
            # local max in Other
            count_sum=sum(i for i in element_list)
            # avg=int((head_item.count-count_sum)//(width*((numerator/denominator)-1)))
            avg=(head_item.count-count_sum)//width
            if column<width:
                # e in Sketch
                if column<width:
                    if element_list[column]>avg:
                           head_item.maxID=element.ID
                else:
                    pass

# ==========================update e_max==========================
def Update_emax(head,sketch):
    # pass whole array
    #print("In Update_emax:")
    e_max=get_emax()
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    for i in range(len(head)):
        if head[i].maxID=='':
            continue
        else:
            local_max_col,local_max_row=position(Tail(head[i].maxID,0))
            if local_max_col<width:
                # local max in Sketch
                if sketch[local_max_row][local_max_col]>e_max.count:
                    e_max.ID=head[i].maxID
                    e_max.count=sketch[local_max_row][local_max_col]
            else:
                pass
                '''
                # local max in Other
                count_sum=sum(j for j in sketch[i])
                avg=int((head[i].count-count_sum)//(width*((numerator/denominator)-1)))
                if avg>e_max.count:
                    e_max.ID=head[i].maxID
                    e_max.count=avg
                '''
# ========================== BringBack=========================
def BringBack(e_min,head,sketch):
    # bring e_max back to Top
    # e_min=e_max, e_max=Null, delete e_max.count in Sketch, send e_min into Sketch
    e_max=get_emax()
    temp=Tail(e_min.ID,e_min.count)
    e_min.ID=e_max.ID
    e_min.count=e_max.count
    DeleteSk(e_max,head,sketch)
    UpdateSk(temp,head,sketch)

# ==========================DeleteSk=========================
def DeleteSk(element,head,sketch):
    # e_max in sketch: sketch[r][c]=0, total count-=sketch[row][col]
    # e_max in Other: total count-=e_max.count
    width,depth=get_width_depth()
    col,row=position(element)
    head[row].count-=e_max.count
        # total_count-=element.count

    if col<width:
        # e_max in sketch, need to config sk[r][c]=0
        sketch[row][col]=0
        head[row].maxID=''
    element.ID=""
    element.count=0
# ==========================Tools=========================    
def get_emax():
    return e_max
def get_width_depth():
    return width,depth

def find(e,element_list):
    # return index of e in element_list
    try:
        index=[ele.ID for ele in element_list].index(e.ID)
    except:
        index=-99
    return index

def position(element):
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    hash1=spookyhash.hash32(bytes(str(element.ID),encoding='utf-8'))
        # input: byte
        # output:unsigned- 32 bit int
    hash2=mmh3.hash(element.ID, signed=False)
        # input: str
        # output: unsigned- 32 bit int
    col=hash2 % ((width*numerator)//denominator)
    row=hash1 % depth
    return col,row
def get_fraction():
    return numerator,denominator    
    
# ==========================main=========================    

filename='kosarak.dat'
filepath="..\\dataset\\"
src_data=os.path.join(filepath,filename)
depth=4
width=128
size=512
numerator=1
denominator=1

start=time.time()

Sk_head=[Head(0) for j in range(depth)]
Sketch=np.zeros((depth,width),dtype='int32')
e_max=Tail('',0)
Top=[]

item_count=10000
income=0
with open(src_data,'r') as file:
    while True:
        e=file.readline().strip('\n')
        if not e:
            break
        else:
            #item_count-=1
            #income+=1
            #print("read {}-th element:{}".format(income,e))
            item=Tail(e,1)            
            index=find(item,Top)
            if index<0:
                if len(Top)<size:
                    Top.append(item)
                else:
                    UpdateSk(item,Sk_head,Sketch)
            else:
                Top[index].count+=1
        Top.sort(key=operator.attrgetter('count'),reverse=True)
        if e_max.count>Top[-1].count:
            BringBack(Top[-1],Sk_head,Sketch)
            #print('Top after BringBack: \n\t{}'.format(Top)) 

end=time.time()
print("Execution time:{} seconds.".format(str(end-start)))
print("Total memory {} bytes".format(sys.getsizeof(Top)+sys.getsizeof(Sketch)+sys.getsizeof(Sk_head)))
print("Top:{} bytes, Sketch:{} bytes, Sketch_head:{} bytes.".format(sys.getsizeof(Top),sys.getsizeof(Sketch),sys.getsizeof(Sk_head)*depth))
print("TOP[20]:\n{}".format(Top[:20]))
print("e_max:{}".format(e_max))
for i in range(len(Sketch)):
    print("Sk[{}]:{},{}".format(i,Sk_head[i],Sketch[i]))
print('')


templi=[]
for i in Top:
    templi.append([i.ID,i.count])

df=pd.DataFrame(templi,columns=['ID', 'Count'])
df.to_csv("..\\result\\kosarak\\My_kosarak_hash"+str(size)+".csv",index=False)
df.head(20)


#====================result compare=============================
path='..\\result\\kosarak'
groundtruth='kosarak_ground_truth.csv'
final="My_kosarak_hash"+str(size)+".csv"

grtruth=pd.read_csv(os.path.join(path,groundtruth))
My_result1=pd.read_csv(os.path.join(path,final))

gli = grtruth.values.tolist()
li1= My_result1.values.tolist()

tp=0
fp=0
find=0
err=[]
error=0
for item in li1:
    for element in gli[:size]:
        if item[0]==element[0]:
            # print("{},{} vs. {},{}".format(item[0],item[1],element[0],element[1]))
            find+=1
            if item[1]==element[1]:
                tp+=1
            else:
                fp+=1
                error+=abs(item[1]-element[1])/item[1]
print("Find:{},TP:{},FP:{}".format(find,tp,fp))
print("ARE:{}".format(error/606770))

## no Other part

In [30]:

import numpy as np
import spookyhash
import mmh3
import os
import pandas as pd
import time
import operator
import hyperloglog
import sys

# ==========================data structure==========================
class Node():
    def __init__(self,count=0):
        self.count=count
    def add_count(self,count=1):
        self.count+=count
    def __str__(self):
        return 'count: {}'.format(self.count)
    def __repr__(self):
        return ''

class Head(Node):
    def __init__(self,count=1):
        super().__init__(count)
        self.distinct = hyperloglog.HyperLogLog(0.01)
        self.maxID=''
    def __str__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)
    def __repr__(self):
        return '[total count: {}, distinct: {}, max: {}]'.format(self.count,len(self.distinct),self.maxID)

class Tail(Node):
    def __init__(self,ID,count):
        self.ID = ID
        super().__init__(count)
    def __str__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)
    def __repr__(self):
        return '[ID: {}, count: {}]'.format(self.ID,self.count)

# ==========================UpdateSk==========================
def UpdateSk(element,Sk_head,Sk):
    e_max=get_emax()
    width,depth=get_width_depth()
    col,row=position(element)
        # col / row index of element 
    avg=0
    #print("{} send to Sk[{}][{}]".format(element,row,col))
    # ==========================update sketch==========================
    Sk_head[row].count+=element.count
    #Sk_head[row].distinct.add(element.ID)
    Sk[row][col]+=1

    Update_local_max(Sk_head[row],Sk[row],element,col)
    Update_emax(Sk_head,Sk)

'''
    print("e_max:{}".format(e_max))
    for i in range(len(Sk)):
        print("Sk[{}]:{},{}".format(i,Sk_head[i],Sk[i]))
    print('')
'''


# ==========================update local max==========================       
def Update_local_max(head_item,element_list,element,column):
    # local max need only 1 row
    #print("In Update_local_max:")
    width,depth=get_width_depth()
    if head_item.maxID=='':
        head_item.maxID=element.ID
    else:
        # local_max_col=(mmh3.hash(head_item.maxID,signed=False))% ((width*numerator)//denominator)
        local_max_col=(mmh3.hash(head_item.maxID,signed=False))% width
        if element_list[local_max_col]<element_list[column]:
            head_item.maxID=element.ID


# ==========================update e_max==========================
def Update_emax(head,sketch):
    # pass whole array
    #print("In Update_emax:")
    e_max=get_emax()
    width,depth=get_width_depth()
    for i in range(len(head)):
        if head[i].maxID=='':
            continue
        else:
            local_max_col,local_max_row=position(Tail(head[i].maxID,0))
            if sketch[local_max_row][local_max_col]>e_max.count:
                e_max.ID=head[i].maxID
                e_max.count=sketch[local_max_row][local_max_col]

# ========================== BringBack=========================
def BringBack(e_min,head,sketch):
    # bring e_max back to Top
    # e_min=e_max, e_max=Null, delete e_max.count in Sketch, send e_min into Sketch
    e_max=get_emax()
    temp=Tail(e_min.ID,e_min.count)
    e_min.ID=e_max.ID
    e_min.count=e_max.count
    DeleteSk(e_max,head,sketch)
    UpdateSk(temp,head,sketch)

# ==========================DeleteSk=========================
def DeleteSk(element,head,sketch):
    # e_max in sketch: sketch[r][c]=0, total count-=sketch[row][col]
    width,depth=get_width_depth()
    col,row=position(element)
    head[row].count-=e_max.count
        # total_count-=element.count
    sketch[row][col]=0
    head[row].maxID=''
    element.ID=""
    element.count=0
# ==========================Tools=========================    
def get_emax():
    return e_max
def get_width_depth():
    return width,depth

def find(e,element_list):
    # return index of e in element_list
    try:
        index=[ele.ID for ele in element_list].index(e.ID)
    except:
        index=-99
    return index

def position(element):
    numerator,denominator=get_fraction()
    width,depth=get_width_depth()
    hash1=spookyhash.hash32(bytes(str(element.ID),encoding='utf-8'))
        # input: byte
        # output:unsigned- 32 bit int
    hash2=mmh3.hash(element.ID, signed=False)
        # input: str
        # output: unsigned- 32 bit int
    col=hash2 % ((width*numerator)//denominator)
    row=hash1 % depth
    return col,row
def get_fraction():
    return numerator,denominator    
    
# ==========================main=========================    

filename='kosarak.dat'
filepath="..\\dataset\\"
src_data=os.path.join(filepath,filename)
depth=8
width=128
size=512
numerator=1
denominator=1

start=time.time()

Sk_head=[Head(0) for j in range(depth)]
Sketch=np.zeros((depth,width),dtype='int32')
e_max=Tail('',0)
Top=[]

item_count=1000
income=0
with open(src_data,'r') as file:
    while True:
        e=file.readline().strip('\n')
        if not e:
            break
        else:
            #item_count-=1
            #income+=1
            #print("read {}-th element:{}".format(income,e))
            item=Tail(e,1)            
            index=find(item,Top)
            if index<0:
                if len(Top)<size:
                    Top.append(item)
                else:
                    UpdateSk(item,Sk_head,Sketch)
            else:
                Top[index].count+=1
        Top.sort(key=operator.attrgetter('count'),reverse=True)
        if e_max.count>Top[-1].count:
            BringBack(Top[-1],Sk_head,Sketch)
            #print('Top after BringBack: \n\t{}'.format(Top)) 

end=time.time()
print("Execution time:{} seconds.".format(str(end-start)))
print("Total memory {} bytes".format(sys.getsizeof(Top)+Sketch.nbytes+sys.getsizeof(Sk_head[0])*depth))
print("Top:{} bytes, Sketch:{} bytes, Sketch_head:{} bytes.".format(sys.getsizeof(Top),Sketch.nbytes,sys.getsizeof(Sk_head[0])*depth))
print("TOP[20]:\n{}".format(Top[:20]))
print("e_max:{}".format(e_max))
for i in range(len(Sketch)):
    print("Sk[{}]:{},{}".format(i,Sk_head[i],Sketch[i]))
print('')


templi=[]
for i in Top:
    templi.append([i.ID,i.count])

df=pd.DataFrame(templi,columns=['ID', 'Count'])
df.to_csv("..\\result\\kosarak\\My_kosarak_hash"+str(size)+".csv",index=False)
df.head(20)


#====================result compare=============================
path='..\\result\\kosarak'
groundtruth='kosarak_ground_truth.csv'
final="My_kosarak_hash"+str(size)+".csv"

grtruth=pd.read_csv(os.path.join(path,groundtruth),nrows=size)
My_result1=pd.read_csv(os.path.join(path,final))

gli = grtruth.values.tolist()
li1= My_result1.values.tolist()

tp=0
fp=0
find=0
compare=[]
error=0
for item in li1:
    for element in gli:
        if item[0]==element[0]:
            # print("{},{} vs. {},{}".format(item[0],item[1],element[0],element[1]))
            find+=1
            if item[1]==element[1]:
                compare.append([element[0],item[1],abs(element[1]-item[1])])
                tp+=1
            else:
                fp+=1
print("Find:{},TP:{},FP:{}".format(find,tp,fp))
print("Precision: {}".format(find/size))
xdf=pd.DataFrame(compare,columns=['ID', 'Count','Error'])
# print("ARE:{}".format(error/606770))
xdf

Execution time:145.3813350200653 seconds.
Total memory 8816 bytes
Top:4272 bytes, Sketch:4096 bytes, Sketch_head:448 bytes.
TOP[20]:
[[ID: 3, count: 36133], [ID: 6, count: 30225], [ID: 6 3, count: 21139], [ID: 11 6, count: 12769], [ID: 1, count: 9766], [ID: 11 6 3, count: 9083], [ID: 1 3, count: 6002], [ID: 11, count: 5374], [ID: 1 6, count: 5046], [ID: 4, count: 4290], [ID: 11 3, count: 3756], [ID: 55, count: 3561], [ID: 1 6 3, count: 3510], [ID: 4 3, count: 3000], [ID: 55 3, count: 2377], [ID: 11 1 6, count: 2375], [ID: 4 6, count: 2340], [ID: 4 6 3, count: 1759], [ID: 11 1 6 3, count: 1753], [ID: 6 55, count: 1748]]
e_max:[ID: 1 6 91 3, count: 83]
Sk[0]:[total count: 90326, distinct: 0, max: 1490 1],[11 46 29 76 60 80 69 43 64 23 12 18  6 73 22 63  1 27 55 38 77 17  5  9
 33  8 10 48 56 47 47 46 43 76 83 23 43 61 74 31 39 40 58 74 63 56 10 70
 10 54 25 77 52 28 21 83 59 69 44 28 14 49 36 10 34 46 43 52 16  6 27 27
  5 19 71  0 28 13 58 23 69 15 63  9 35 49  0 39 54 15 58 38 77 21  7