In [1]:
import re
import random
import collections
import matplotlib.pyplot as plt
from collections import defaultdict
from heapq import nlargest
import time

# Generate stream with variable frequency

def generate_stream(num_words=1000000):
    words = ["apple", "banana", "cherry", "date", "elderberry", "fig", "grape", "honeydew"]
    stream = []
    for _ in range(num_words):
        word = random.choices(words, weights=[50, 30, 10, 5, 3, 1, 1, 1])[0]
        stream.append(word)
    return stream

stream = generate_stream()

# Method 1: Using built-in Python hash table (dict)
start_time = time.time()
word_count_builtin = collections.Counter(stream)
top_50_builtin = nlargest(50, word_count_builtin.items(), key=lambda x: x[1])
end_time = time.time()
print("Top 50 words using built-in hash table:")
for word, count in top_50_builtin:
    print(f"{word}: {count}")
print(f"Time taken (built-in hash table): {end_time - start_time:.2f} seconds\n")

# Method 2: Custom hash function and hash table
class CustomHashTable:
    def __init__(self, size=100003):  # A prime number for hash table size
        self.size = size
        self.table = [[] for _ in range(size)]

    def _hash(self, key):
        return sum(ord(c) for c in key) % self.size

    def insert(self, key):
        index = self._hash(key)
        for i, (k, v) in enumerate(self.table[index]):
            if k == key:
                self.table[index][i] = (k, v + 1)
                return
        self.table[index].append((key, 1))

    def get(self, key):
        index = self._hash(key)
        for k, v in self.table[index]:
            if k == key:
                return v
        return 0

    def items(self):
        for bucket in self.table:
            for k, v in bucket:
                yield (k, v)

start_time = time.time()
custom_hash_table = CustomHashTable()
for word in stream:
    custom_hash_table.insert(word)

top_50_custom = nlargest(50, custom_hash_table.items(), key=lambda x: x[1])
end_time = time.time()
print("Top 50 words using custom hash table:")
for word, count in top_50_custom:
    print(f"{word}: {count}")
print(f"Time taken (custom hash table): {end_time - start_time:.2f} seconds\n")

# Method 3: Log-based counter with single byte (approximate counting)
class LogCounter:
    def __init__(self):
        self.counters = defaultdict(int)
        self.base = 1.1

    def increment(self, key):
        value = self.counters[key]
        threshold = self.base ** value
        if random.random() <= 1 / threshold:
            self.counters[key] += 1

    def items(self):
        return self.counters.items()

start_time = time.time()
log_counter = LogCounter()
for word in stream:
    log_counter.increment(word)

approx_top_50 = nlargest(50, log_counter.items(), key=lambda x: x[1])
end_time = time.time()
print("Top 50 words using log-based counter (approximate counting):")
for word, count in approx_top_50:
    print(f"{word}: {count}")
print(f"Time taken (log-based counter): {end_time - start_time:.2f} seconds\n")

# Comparison of results
def compare_results(built_in, custom, log):
    built_in_words = set(word for word, _ in built_in)
    custom_words = set(word for word, _ in custom)
    log_words = set(word for word, _ in log)

    print("\nComparison of top 50 words:")
    print(f"Common words between built-in and custom hash table: {len(built_in_words & custom_words)} out of 50")
    print(f"Common words between built-in and log-based counter: {len(built_in_words & log_words)} out of 50")

compare_results(top_50_builtin, top_50_custom, approx_top_50)


Top 50 words using built-in hash table:
apple: 494836
banana: 297255
cherry: 99098
date: 49685
elderberry: 29435
grape: 10078
fig: 9930
honeydew: 9683
Time taken (built-in hash table): 0.02 seconds

Top 50 words using custom hash table:
apple: 494836
banana: 297255
cherry: 99098
date: 49685
elderberry: 29435
grape: 10078
fig: 9930
honeydew: 9683
Time taken (custom hash table): 0.61 seconds

Top 50 words using log-based counter (approximate counting):
apple: 111
banana: 110
cherry: 96
date: 86
elderberry: 82
grape: 73
fig: 72
honeydew: 71
Time taken (log-based counter): 0.15 seconds


Comparison of top 50 words:
Common words between built-in and custom hash table: 8 out of 50
Common words between built-in and log-based counter: 8 out of 50
