Original BigQuery query used to export the sampled data set loaded below

```
-- Export a random ~1% row sample of dim_audience_insights for the two most recent
-- complete days, then sum pageviews per month / publisher / geo / device type /
-- device_id / parent category / category / tier.
WITH sampled_data AS (
  SELECT 
    -- first day of the month the row's date falls in
    date_trunc(date(date_ts), month) as date_month,
    publisher_formatted as publisher,
    geo,
    device_type_grouped as device_type,
    device_id,
    -- NOTE(review): tier_1, category and tier presumably come from the unnested
    -- content_categories elements; confirm against the table schema
    tier_1 as parent_category,
    category,
    -- prefixes the tier value with 'tier' (the export shows values such as 'tier3')
    concat('tier', tier) as tier,
    pageviews
  FROM 
    ozone-analytics-dev.ozone.dim_audience_insights,
    -- comma join with UNNEST: one output row per element of content_categories
    UNNEST(content_categories)
  WHERE 
    -- from two days ago to yesterday, both bounds inclusive
    DATE(date_ts) BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY)
    -- keeps roughly 1% of the unnested rows; the sample is per row, not per device,
    -- and is not seeded, so every run returns a different sample
    AND RAND() <= 0.01
)
SELECT 
  date_month,
  publisher,
  geo,
  device_type,
  device_id,
  parent_category,
  category,
  tier,
  sum(pageviews) as pageviews
FROM sampled_data
GROUP BY 
  -- ordinals 1-8 are every selected column except the summed pageviews
  1,2,3,4,5,6,7,8;

```

In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Import the data set (the CSV exported from BigQuery).
# The location is built from the current user's home directory rather than a
# hard-coded '/Users/<name>/...' path, so the cell also runs on other machines
# that have the export saved in ~/Downloads.
csv_path = Path.home() / 'Downloads' / 'bq-results-20240822-102454-1724322315540.csv'

df = pd.read_csv(csv_path)

# Peek at the first two rows to check the columns loaded as expected
df.head(2)

Unnamed: 0,date_month,publisher,geo,device_type,device_id,parent_category,category,tier,pageviews
0,2024-08-01,Reach,UK and Ireland,Phone,24YWWG4icTXKhX1zanu7Q9oeV77,business and finance,retail industry,tier3,1
1,2024-08-01,Reach,UK and Ireland,Desktop,2ih2nHmT0x0jNWCvADOP2vjNoC2,news and politics,local news,tier2,1


In [6]:
# understand the size of the data
# NOTE(review): the memory figure is printed with a trailing '+' because object
# (string) columns are not measured deeply by default; df.info(memory_usage='deep')
# reports the real footprint at the cost of a slower call.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8649806 entries, 0 to 8649805
Data columns (total 9 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   date_month       object
 1   publisher        object
 2   geo              object
 3   device_type      object
 4   device_id        object
 5   parent_category  object
 6   category         object
 7   tier             object
 8   pageviews        int64 
dtypes: int64(1), object(8)
memory usage: 593.9+ MB


In [10]:
# (rows, columns) of the frame as loaded from the CSV
df.shape

(8649806, 9)

In [11]:
# Keep only the rows that have a device_id.
# The result is a new DataFrame; `df` itself is left unchanged.
has_device_id = df['device_id'].notna()
df_filtered = df[has_device_id]

In [12]:
# Shape of the filtered frame: comparing it with df.shape above shows how many
# rows were removed for having a NaN device_id (df itself is not modified by dropna).
df_filtered.shape

(8649806, 9)

There are 8.6M rows of data because of the high cardinality of `device_id`.

# Test HLL to get an approximate count of device_id

In [13]:
# Convert device IDs to strings while keeping missing values missing.
# A bare .astype(str) turns NaN into the literal string 'nan'; every later
# .dropna() on this column would then be a no-op and 'nan' would be counted
# as if it were a real device.
missing_device_id = df['device_id'].isna()

# mask() puts NaN back wherever the original value was missing
df['device_id'] = df['device_id'].astype(str).mask(missing_device_id)


In [20]:
import time
import hyperloglog

# Approximate the number of distinct device IDs with HyperLogLog.

# Define the error percentage (e.g., 1% error)
error_rate = 0.01

# Initialize the HyperLogLog object with the specified error rate
hll = hyperloglog.HyperLogLog(error_rate)

# Prepare the input outside the timed section: drop missing IDs and make sure
# every entry is a string, the same way the per-category cells do.
hll_input_ids = df['device_id'].dropna().astype(str)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time_hll = time.perf_counter()

# Update HLL with each device_id
for device_id in hll_input_ids:
    hll.add(device_id)

# Get the approximate count
hll_count = len(hll)

# Stop timer for HLL method
end_time_hll = time.perf_counter()

# Calculate elapsed time for HLL method
hll_time = end_time_hll - start_time_hll

print(f"HLL Approximate Unique Device IDs: {hll_count}")
print(f"Time taken by HLL method: {hll_time:.4f} seconds")


HLL Approximate Unique Device IDs: 6295292
Time taken by HLL method: 8.1806 seconds


# Test a native python method to get the unique count of device IDs

In [9]:
# Exact number of distinct device IDs using a plain Python set.

# Drop missing IDs before timing so that a NaN is never counted as a device
# and only the set construction itself is measured.
native_input_ids = df['device_id'].dropna()

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time_native = time.perf_counter()

# Use a set to find unique device_ids
unique_device_ids = set(native_input_ids)

# Get the exact count
native_count = len(unique_device_ids)

# Stop timer for native method
end_time_native = time.perf_counter()

# Calculate elapsed time for native method
native_time = end_time_native - start_time_native

print(f"Native Python Unique Device IDs: {native_count}")
print(f"Time taken by native method: {native_time:.4f} seconds")


Native Python Unique Device IDs: 6282121
Time taken by native method: 2.0057 seconds


In [31]:
# Compare the HLL estimate with the exact count, and the two run times.
# A positive percentage means HLL over-counted relative to the exact result.

count_ratio = hll_count / native_count
pct_diff = (count_ratio - 1) * 100
time_gap = hll_time - native_time

print(f"Difference in counts (HLL - Native): {pct_diff:.2f}%")
print(f"Time difference (HLL - Native): {time_gap:.4f} seconds")


Difference in counts (HLL - Native): 0.21%
Time difference (HLL - Native): 6.1749 seconds


# Test calculating unique users (UU, counted as distinct device_id) by parent_category

In [32]:
import hyperloglog

# Error target shared by every per-category sketch (e.g., 1% error)
error_rate = 0.01

# Time the whole per-category HLL pass, including the groupby
start_time_hll = time.time()

# Build one independent HyperLogLog sketch per parent_category and record its estimate
hll_counts = {}
for category, device_ids in df.groupby('parent_category')['device_id']:
    hll = hyperloglog.HyperLogLog(error_rate)
    for device_id in device_ids.dropna():
        hll.add(str(device_id))
    hll_counts[category] = len(hll)

end_time_hll = time.time()

# Elapsed wall-clock time for the HLL method
hll_time = end_time_hll - start_time_hll

print("HLL Approximate Unique Device IDs by Parent Category:")
print(hll_counts)
print(f"Time taken by HLL method: {hll_time:.4f} seconds")


HLL Approximate Unique Device IDs by Parent Category:
{'automotive': 105269, 'books and literature': 27996, 'business and finance': 533724, 'careers': 67057, 'education': 37193, 'events and attractions': 268258, 'family and relationships': 571680, 'fine art': 36387, 'food and drink': 176372, 'healthy living': 45784, 'hobbies and interests': 238559, 'home and garden': 111664, 'intentions': 57223, 'medical health': 689619, 'movies': 92403, 'music and audio': 90658, 'news and politics': 2116239, 'personal finance': 184519, 'pets': 25146, 'pop culture': 366086, 'real estate': 42760, 'religion and spirituality': 30825, 'science': 230059, 'shopping': 68448, 'sports': 472170, 'style and fashion': 327807, 'technology and computing': 312397, 'television': 184575, 'travel': 421098, 'uncategorized': 1, 'video gaming': 11259}
Time taken by HLL method: 9.9652 seconds


In [33]:
import time

# Time the whole per-category exact pass, including the groupby
start_time_native = time.time()

# Exact distinct device_id count per parent_category: non-missing IDs are cast to
# str and de-duplicated with a set
native_counts = {
    category: len(set(device_ids.dropna().astype(str)))
    for category, device_ids in df.groupby('parent_category')['device_id']
}

end_time_native = time.time()

# Elapsed wall-clock time for the native method
native_time = end_time_native - start_time_native

print("Native Python Unique Device IDs by Parent Category:")
print(native_counts)
print(f"Time taken by native method: {native_time:.4f} seconds")


Native Python Unique Device IDs by Parent Category:
{'automotive': 104140, 'books and literature': 28176, 'business and finance': 534150, 'careers': 67928, 'education': 37554, 'events and attractions': 268300, 'family and relationships': 564820, 'fine art': 36346, 'food and drink': 176237, 'healthy living': 45903, 'hobbies and interests': 239715, 'home and garden': 111560, 'intentions': 57240, 'medical health': 690526, 'movies': 92765, 'music and audio': 89872, 'news and politics': 2115217, 'personal finance': 182949, 'pets': 25225, 'pop culture': 366845, 'real estate': 42266, 'religion and spirituality': 30868, 'science': 231903, 'shopping': 67892, 'sports': 467261, 'style and fashion': 331047, 'technology and computing': 313440, 'television': 185305, 'travel': 419338, 'uncategorized': 1, 'video gaming': 11414}
Time taken by native method: 3.5888 seconds


Create a DataFrame comparing the native and HLL counts

In [35]:
import pandas as pd

# One row per parent category, with the exact and the HLL count side by side.
# A category missing from hll_counts is reported with an HLL count of 0.
categories = list(native_counts)
df_counts = pd.DataFrame({
    'parent_category': categories,
    'native_count': [native_counts[cat] for cat in categories],
    'hll_count': [hll_counts.get(cat, 0) for cat in categories],
})

# Signed error of the HLL estimate as a percentage of the exact count, rounded to
# 1 decimal place; positive means HLL came in below the exact count
exact = df_counts['native_count']
approx = df_counts['hll_count']
df_counts['percent_difference'] = (100 * (exact - approx) / exact).round(1)

# Show the first five categories
df_counts.head(5)


Unnamed: 0,parent_category,native_count,hll_count,percent_difference
0,automotive,104140,105269,-1.1
1,books and literature,28176,27996,0.6
2,business and finance,534150,533724,0.1
3,careers,67928,67057,1.3
4,education,37554,37193,1.0


# Test different combinations

In [37]:
# Parent categories in the order they appear in df_counts
parent_categories = df_counts['parent_category'].tolist()

# Pair each category with the one that follows it: (c0, c1), (c1, c2), ...
category_pairs = list(zip(parent_categories, parent_categories[1:]))


In [38]:
import hyperloglog

# Error target for every pair sketch (e.g., 1% error)
error_rate = 0.01

# HLL estimate per category pair, keyed by the (first, second) tuple
hll_combination_counts = {}

# The timed section covers both the row filtering and the sketch updates
start_time_hll_comb = time.time()

for first, second in category_pairs:
    # Rows belonging to either category of the pair
    combined_df = df[df['parent_category'].isin([first, second])]

    # Fresh sketch per pair, fed with every non-missing device_id
    hll = hyperloglog.HyperLogLog(error_rate)
    for device_id in combined_df['device_id'].dropna():
        hll.add(str(device_id))
    hll_combination_counts[(first, second)] = len(hll)

end_time_hll_comb = time.time()

# Elapsed wall-clock time for the HLL method
hll_comb_time = end_time_hll_comb - start_time_hll_comb

print("HLL Approximate Unique Device IDs for Category Pairs:")
print(hll_combination_counts)
print(f"Time taken by HLL method for combinations: {hll_comb_time:.4f} seconds")


HLL Approximate Unique Device IDs for Category Pairs:
{('automotive', 'books and literature'): 132451, ('books and literature', 'business and finance'): 559905, ('business and finance', 'careers'): 593826, ('careers', 'education'): 103207, ('education', 'events and attractions'): 304615, ('events and attractions', 'family and relationships'): 821851, ('family and relationships', 'fine art'): 607277, ('fine art', 'food and drink'): 212570, ('food and drink', 'healthy living'): 222849, ('healthy living', 'hobbies and interests'): 283117, ('hobbies and interests', 'home and garden'): 350259, ('home and garden', 'intentions'): 169178, ('intentions', 'medical health'): 736831, ('medical health', 'movies'): 780046, ('movies', 'music and audio'): 181138, ('music and audio', 'news and politics'): 2198818, ('news and politics', 'personal finance'): 2279522, ('personal finance', 'pets'): 208610, ('pets', 'pop culture'): 392359, ('pop culture', 'real estate'): 404617, ('real estate', 'religion an

In [39]:
import time

# Exact count per category pair, keyed by the (first, second) tuple
native_combination_counts = {}

# The timed section covers both the row filtering and the set construction
start_time_native_comb = time.time()

for first, second in category_pairs:
    # Rows belonging to either category of the pair
    combined_df = df[df['parent_category'].isin([first, second])]

    # De-duplicate the non-missing device IDs with a set
    unique_device_ids = set(combined_df['device_id'].dropna().astype(str))
    native_combination_counts[(first, second)] = len(unique_device_ids)

end_time_native_comb = time.time()

# Elapsed wall-clock time for the native method
native_comb_time = end_time_native_comb - start_time_native_comb

print("Native Python Unique Device IDs for Category Pairs:")
print(native_combination_counts)
print(f"Time taken by native method for combinations: {native_comb_time:.4f} seconds")


Native Python Unique Device IDs for Category Pairs:
{('automotive', 'books and literature'): 132070, ('books and literature', 'business and finance'): 560618, ('business and finance', 'careers'): 596507, ('careers', 'education'): 105125, ('education', 'events and attractions'): 304861, ('events and attractions', 'family and relationships'): 816469, ('family and relationships', 'fine art'): 599378, ('fine art', 'food and drink'): 212013, ('food and drink', 'healthy living'): 221215, ('healthy living', 'hobbies and interests'): 284456, ('hobbies and interests', 'home and garden'): 349363, ('home and garden', 'intentions'): 167871, ('intentions', 'medical health'): 743868, ('medical health', 'movies'): 776578, ('movies', 'music and audio'): 181096, ('music and audio', 'news and politics'): 2190861, ('news and politics', 'personal finance'): 2272527, ('personal finance', 'pets'): 207738, ('pets', 'pop culture'): 391039, ('pop culture', 'real estate'): 407775, ('real estate', 'religion and 

In [40]:
# One row per category pair, labelled "first + second", with both counts
pair_labels = [f"{first} + {second}" for first, second in category_pairs]
comparison_df = pd.DataFrame({
    'category_pair': pair_labels,
    'native_count': [native_combination_counts[pair] for pair in category_pairs],
    'hll_count': [hll_combination_counts[pair] for pair in category_pairs],
})

# Signed error of the HLL estimate as a percentage of the exact count (1 d.p.);
# positive means HLL came in below the exact count
exact = comparison_df['native_count']
approx = comparison_df['hll_count']
comparison_df['percent_difference'] = (100 * (exact - approx) / exact).round(1)

# Display the comparison DataFrame
comparison_df


Unnamed: 0,category_pair,native_count,hll_count,percent_difference
0,automotive + books and literature,132070,132451,-0.3
1,books and literature + business and finance,560618,559905,0.1
2,business and finance + careers,596507,593826,0.4
3,careers + education,105125,103207,1.8
4,education + events and attractions,304861,304615,0.1
5,events and attractions + family and relationships,816469,821851,-0.7
6,family and relationships + fine art,599378,607277,-1.3
7,fine art + food and drink,212013,212570,-0.3
8,food and drink + healthy living,221215,222849,-0.7
9,healthy living + hobbies and interests,284456,283117,0.5


# test pre-hashing device ID 

In [41]:
import hashlib

# Define a function to hash the device_id
def hash_device_id(device_id):
    """Return the raw MD5 digest (bytes) of ``str(device_id)`` encoded as UTF-8."""
    encoded = str(device_id).encode('utf8')
    digest = hashlib.md5(encoded)
    return digest.digest()

# Hash every device_id and keep the digests in a new hashed_device_id column
df['hashed_device_id'] = df['device_id'].map(hash_device_id)

df.head(2)

Unnamed: 0,date_month,publisher,geo,device_type,device_id,parent_category,category,tier,pageviews,hashed_device_id
0,2024-08-01,Reach,UK and Ireland,Phone,24YWWG4icTXKhX1zanu7Q9oeV77,business and finance,retail industry,tier3,1,b'\xbd\xe2x\xcdd\x9a\xa5\xb7\xe0\xee\xbd\x1dj\...
1,2024-08-01,Reach,UK and Ireland,Desktop,2ih2nHmT0x0jNWCvADOP2vjNoC2,news and politics,local news,tier2,1,b'{J{H\xd8\x8c\xec\x01\xe8=\x85\xe3\xe2\xf9\xb...


# Try re-running the counting with hashed device IDs

In [47]:
import hyperloglog

# Error target for every pair sketch (e.g., 1% error)
error_rate = 0.01

# HLL estimate per category pair, this time fed with the pre-hashed IDs
hll_combination_counts = {}

# The timed section covers both the row filtering and the sketch updates
start_time_hll_comb = time.time()

for first, second in category_pairs:
    # Rows belonging to either category of the pair
    combined_df = df[df['parent_category'].isin([first, second])]

    # The digests are passed to the sketch as-is (no str() conversion)
    hll = hyperloglog.HyperLogLog(error_rate)
    for hashed_device_id in combined_df['hashed_device_id'].dropna():
        hll.add(hashed_device_id)
    hll_combination_counts[(first, second)] = len(hll)

end_time_hll_comb = time.time()

# Elapsed wall-clock time for the HLL method
hll_comb_time = end_time_hll_comb - start_time_hll_comb

print("HLL Approximate Unique Device IDs for Category Pairs:")
print(hll_combination_counts)
print(f"Time taken by HLL method for combinations: {hll_comb_time:.4f} seconds")


HLL Approximate Unique Device IDs for Category Pairs:
{('automotive', 'books and literature'): 133529, ('books and literature', 'business and finance'): 559078, ('business and finance', 'careers'): 596492, ('careers', 'education'): 104562, ('education', 'events and attractions'): 300930, ('events and attractions', 'family and relationships'): 813249, ('family and relationships', 'fine art'): 599092, ('fine art', 'food and drink'): 211464, ('food and drink', 'healthy living'): 222251, ('healthy living', 'hobbies and interests'): 284262, ('hobbies and interests', 'home and garden'): 348644, ('home and garden', 'intentions'): 167032, ('intentions', 'medical health'): 747847, ('medical health', 'movies'): 783667, ('movies', 'music and audio'): 179184, ('music and audio', 'news and politics'): 2184243, ('news and politics', 'personal finance'): 2265281, ('personal finance', 'pets'): 208893, ('pets', 'pop culture'): 390040, ('pop culture', 'real estate'): 403982, ('real estate', 'religion an

Try to optimise HLL further

In [48]:
import time
import hyperloglog

# HLL count for one category pair, using the raw (un-hashed) device IDs
def calculate_hll_for_pair(pair):
    """Estimate the distinct device IDs seen in either category of ``pair``.

    Reads the notebook-level ``df``. ``pair`` is indexed at positions 0 and 1 for
    the two parent_category values. Returns the estimate of a HyperLogLog sketch
    created with a 0.01 error rate.
    """
    in_pair = df['parent_category'].isin([pair[0], pair[1]])
    combined_df = df[in_pair]

    hll = hyperloglog.HyperLogLog(0.01)  # Adjust error rate as needed
    for device_id in combined_df['device_id'].dropna():
        hll.add(str(device_id))
    return len(hll)

# Time the HLL helper over every category pair
start_time_hll = time.time()

hll_combination_counts = {
    pair: calculate_hll_for_pair(pair) for pair in category_pairs
}

end_time_hll = time.time()

hll_time = end_time_hll - start_time_hll
print(f"HLL Time: {hll_time:.4f} seconds")

# Same pairs with exact set-based counting, for comparison
start_time_native = time.time()

native_combination_counts = {}
for first, second in category_pairs:
    combined_df = df[df['parent_category'].isin([first, second])]
    exact_ids = set(combined_df['device_id'].dropna().astype(str))
    native_combination_counts[(first, second)] = len(exact_ids)

end_time_native = time.time()

native_time = end_time_native - start_time_native
print(f"Native Python Time: {native_time:.4f} seconds")


HLL Time: 36.1546 seconds
Native Python Time: 23.4705 seconds


try tracking memory usage

In [50]:
from memory_profiler import memory_usage
import hyperloglog
import time

# HLL count for one category pair
def calculate_hll_for_pair(pair):
    """Estimate the distinct device IDs seen in either category of ``pair``.

    Reads the notebook-level ``df``. ``pair`` is indexed at positions 0 and 1 for
    the two parent_category values. Returns the estimate of a HyperLogLog sketch
    created with a 0.01 error rate.
    """
    in_pair = df['parent_category'].isin([pair[0], pair[1]])
    combined_df = df[in_pair]

    hll = hyperloglog.HyperLogLog(0.01)  # Adjust error rate as needed
    for device_id in combined_df['device_id'].dropna():
        hll.add(str(device_id))
    return len(hll)

# Workload profiled for the HLL method
def hll_memory_usage():
    """Run calculate_hll_for_pair over every entry of category_pairs.

    Returns a dict mapping each pair to its HLL estimate.
    """
    return {pair: calculate_hll_for_pair(pair) for pair in category_pairs}

# Workload profiled for the native Python method
def native_memory_usage():
    """Count distinct device IDs exactly for every entry of category_pairs.

    Reads the notebook-level ``df`` and returns a dict mapping each pair to the
    size of the set of its non-missing device IDs (cast to str).
    """
    exact_counts = {}
    for pair in category_pairs:
        in_pair = df['parent_category'].isin([pair[0], pair[1]])
        combined_df = df[in_pair]
        pair_ids = combined_df['device_id'].dropna().astype(str)
        exact_counts[pair] = len(set(pair_ids))
    return exact_counts

# Sample process memory while each workload runs and report the spread between the
# highest and lowest sample.
# NOTE(review): both workloads build a filtered copy of df for every pair, so the
# spread includes those temporary frames and not only the sketch or the set.

# HLL
hll_mem_usage = memory_usage(hll_memory_usage)
hll_mem_spread = max(hll_mem_usage) - min(hll_mem_usage)
print(f"HLL Memory Usage: {hll_mem_spread:.2f} MiB")

# Native Python
native_mem_usage = memory_usage(native_memory_usage)
native_mem_spread = max(native_mem_usage) - min(native_mem_usage)
print(f"Native Python Memory Usage: {native_mem_spread:.2f} MiB")


HLL Memory Usage: 2096.66 MiB
Native Python Memory Usage: 405.00 MiB


# Test Theta Sketch

In [52]:
import datasketches
import time

# Theta Sketch estimate per category pair, keyed by the (first, second) tuple
theta_combination_counts = {}

# The timed section covers both the row filtering and the sketch updates
start_time_theta_comb = time.time()

for first, second in category_pairs:
    # Rows belonging to either category of the pair
    combined_df = df[df['parent_category'].isin([first, second])]

    # Fresh update sketch per pair (library defaults), fed with every non-missing ID
    theta_sketch = datasketches.update_theta_sketch()
    for device_id in combined_df['device_id'].dropna():
        theta_sketch.update(str(device_id))

    # get_estimate() returns a float
    theta_combination_counts[(first, second)] = theta_sketch.get_estimate()

end_time_theta_comb = time.time()

# Elapsed wall-clock time for the Theta Sketch method
theta_comb_time = end_time_theta_comb - start_time_theta_comb

print("Theta Sketch Approximate Unique Device IDs for Category Pairs:")
print(theta_combination_counts)
print(f"Time taken by Theta Sketch method for combinations: {theta_comb_time:.4f} seconds")


Theta Sketch Approximate Unique Device IDs for Category Pairs:
{('automotive', 'books and literature'): 132630.80532510087, ('books and literature', 'business and finance'): 556815.3749809879, ('business and finance', 'careers'): 593161.9438767745, ('careers', 'education'): 104940.51420455234, ('education', 'events and attractions'): 303021.2991794706, ('events and attractions', 'family and relationships'): 819203.3543792368, ('family and relationships', 'fine art'): 604252.0386898831, ('fine art', 'food and drink'): 211114.9581911079, ('food and drink', 'healthy living'): 219270.88492477225, ('healthy living', 'hobbies and interests'): 285345.4432498256, ('hobbies and interests', 'home and garden'): 346921.3659568392, ('home and garden', 'intentions'): 167065.82856517527, ('intentions', 'medical health'): 756405.906162473, ('medical health', 'movies'): 795623.7703619212, ('movies', 'music and audio'): 181486.14151134694, ('music and audio', 'news and politics'): 2180818.7808413217, ('

In [53]:
# Exact set-based counts for the same pairs, as the baseline for the Theta results
start_time_native = time.time()

native_combination_counts = {}
for first, second in category_pairs:
    combined_df = df[df['parent_category'].isin([first, second])]
    exact_ids = set(combined_df['device_id'].dropna().astype(str))
    native_combination_counts[(first, second)] = len(exact_ids)

end_time_native = time.time()

native_time = end_time_native - start_time_native
print(f"Native Python Time: {native_time:.4f} seconds")

# Show the exact counts
print("Native Python Unique Device IDs for Category Pairs:")
print(native_combination_counts)


Native Python Time: 24.6906 seconds
Native Python Unique Device IDs for Category Pairs:
{('automotive', 'books and literature'): 132070, ('books and literature', 'business and finance'): 560618, ('business and finance', 'careers'): 596507, ('careers', 'education'): 105125, ('education', 'events and attractions'): 304861, ('events and attractions', 'family and relationships'): 816469, ('family and relationships', 'fine art'): 599378, ('fine art', 'food and drink'): 212013, ('food and drink', 'healthy living'): 221215, ('healthy living', 'hobbies and interests'): 284456, ('hobbies and interests', 'home and garden'): 349363, ('home and garden', 'intentions'): 167871, ('intentions', 'medical health'): 743868, ('medical health', 'movies'): 776578, ('movies', 'music and audio'): 181096, ('music and audio', 'news and politics'): 2190861, ('news and politics', 'personal finance'): 2272527, ('personal finance', 'pets'): 207738, ('pets', 'pop culture'): 391039, ('pop culture', 'real estate'): 40

In [56]:
# One row per category pair, labelled "first + second", with the exact count and
# the Theta Sketch estimate
pair_labels = [f"{first} + {second}" for first, second in category_pairs]
comparison_df = pd.DataFrame({
    'category_pair': pair_labels,
    'native_count': [native_combination_counts[pair] for pair in category_pairs],
    'theta_count': [theta_combination_counts[pair] for pair in category_pairs],
})

# Signed error of the Theta estimate as a percentage of the exact count (1 d.p.),
# computed from the float estimates before they are truncated below
exact = comparison_df['native_count']
approx = comparison_df['theta_count']
comparison_df['percent_difference'] = (100 * (exact - approx) / exact).round(1)

# Show the estimates as whole numbers (float64 -> int64)
comparison_df['theta_count'] = comparison_df['theta_count'].astype('int64')

# Display the comparison DataFrame
comparison_df

Unnamed: 0,category_pair,native_count,theta_count,percent_difference
0,automotive + books and literature,132070,132630,-0.4
1,books and literature + business and finance,560618,556815,0.7
2,business and finance + careers,596507,593161,0.6
3,careers + education,105125,104940,0.2
4,education + events and attractions,304861,303021,0.6
5,events and attractions + family and relationships,816469,819203,-0.3
6,family and relationships + fine art,599378,604252,-0.8
7,fine art + food and drink,212013,211114,0.4
8,food and drink + healthy living,221215,219270,0.9
9,healthy living + hobbies and interests,284456,285345,-0.3


Try to find overlapping UU

In [61]:
import datasketches
import time
import pandas as pd

# Estimated number of device IDs common to both categories of each pair
# NOTE(review): for small overlaps these estimates differ noticeably from the exact
# counts in the comparison table further down; a larger sketch size may help.
# Confirm the available options against the datasketches documentation.
theta_overlap_counts = {}

# The timed section covers the row filtering, sketch updates and intersection
start_time_theta_overlap = time.time()

for pair in category_pairs:
    # One update sketch per category of the pair, in pair order
    pair_sketches = []
    for category in pair:
        category_rows = df[df['parent_category'] == category]
        sketch = datasketches.update_theta_sketch()
        for device_id in category_rows['device_id'].dropna():
            sketch.update(str(device_id))
        pair_sketches.append(sketch)

    # Intersect the compact forms of the two sketches
    intersection_sketch = datasketches.theta_intersection()
    for sketch in pair_sketches:
        intersection_sketch.update(sketch.compact())

    # The estimate of the intersection result is a float
    result_sketch = intersection_sketch.get_result()
    theta_overlap_counts[pair] = result_sketch.get_estimate()

end_time_theta_overlap = time.time()

# Elapsed wall-clock time for the Theta Sketch method
theta_overlap_time = end_time_theta_overlap - start_time_theta_overlap

print("Theta Sketch Overlap (Common Device IDs) for Category Pairs:")
print(theta_overlap_counts)
print(f"Time taken by Theta Sketch method for overlap calculations: {theta_overlap_time:.4f} seconds")


Theta Sketch Overlap (Common Device IDs) for Category Pairs:
{('automotive', 'books and literature'): 298.4858043237358, ('books and literature', 'business and finance'): 2091.06879233469, ('business and finance', 'careers'): 4986.394812490415, ('careers', 'education'): 299.95959693077515, ('education', 'events and attractions'): 1112.60804044455, ('events and attractions', 'family and relationships'): 16971.424418690836, ('family and relationships', 'fine art'): 2310.663418507203, ('fine art', 'food and drink'): 575.1288714984762, ('food and drink', 'healthy living'): 575.1288714984762, ('healthy living', 'hobbies and interests'): 1031.3789010002092, ('hobbies and interests', 'home and garden'): 2664.3954942505407, ('home and garden', 'intentions'): 951.0308091029083, ('intentions', 'medical health'): 4911.4680562790045, ('medical health', 'movies'): 6753.268577383631, ('movies', 'music and audio'): 1316.8245163570343, ('music and audio', 'news and politics'): 13798.066136243382, ('ne

In [62]:
# Exact number of device IDs common to both categories of each pair
native_overlap_counts = {}

# The timed section covers the row filtering, set construction and intersection
start_time_native_overlap = time.time()

for first, second in category_pairs:
    first_rows = df[df['parent_category'] == first]
    second_rows = df[df['parent_category'] == second]

    # Non-missing device IDs of each category, de-duplicated
    first_ids = set(first_rows['device_id'].dropna().astype(str))
    second_ids = set(second_rows['device_id'].dropna().astype(str))

    # Size of the intersection = devices seen in both categories
    native_overlap_counts[(first, second)] = len(first_ids & second_ids)

end_time_native_overlap = time.time()

# Elapsed wall-clock time for the native Python method
native_overlap_time = end_time_native_overlap - start_time_native_overlap

print("Native Python Overlap (Common Device IDs) for Category Pairs:")
print(native_overlap_counts)
print(f"Time taken by Native Python method for overlap calculations: {native_overlap_time:.4f} seconds")


Native Python Overlap (Common Device IDs) for Category Pairs:
{('automotive', 'books and literature'): 246, ('books and literature', 'business and finance'): 1708, ('business and finance', 'careers'): 5571, ('careers', 'education'): 357, ('education', 'events and attractions'): 993, ('events and attractions', 'family and relationships'): 16651, ('family and relationships', 'fine art'): 1788, ('fine art', 'food and drink'): 570, ('food and drink', 'healthy living'): 925, ('healthy living', 'hobbies and interests'): 1162, ('hobbies and interests', 'home and garden'): 1912, ('home and garden', 'intentions'): 929, ('intentions', 'medical health'): 3898, ('medical health', 'movies'): 6713, ('movies', 'music and audio'): 1541, ('music and audio', 'news and politics'): 14228, ('news and politics', 'personal finance'): 25639, ('personal finance', 'pets'): 436, ('pets', 'pop culture'): 1031, ('pop culture', 'real estate'): 1336, ('real estate', 'religion and spirituality'): 111, ('religion and 

In [64]:
# One row per category pair, labelled "first + second", with the exact overlap and
# the Theta Sketch overlap estimate
pair_labels = [f"{first} + {second}" for first, second in category_pairs]
comparison_df = pd.DataFrame({
    'category_pair': pair_labels,
    'native_overlap_count': [native_overlap_counts[pair] for pair in category_pairs],
    'theta_overlap_count': [theta_overlap_counts[pair] for pair in category_pairs],
})

# Signed error of the Theta estimate as a percentage of the exact overlap (1 d.p.),
# computed from the float estimates before they are truncated below
exact = comparison_df['native_overlap_count']
approx = comparison_df['theta_overlap_count']
comparison_df['percent_difference'] = (100 * (exact - approx) / exact).round(1)

# Show the estimates as whole numbers (float64 -> int64)
comparison_df['theta_overlap_count'] = comparison_df['theta_overlap_count'].astype('int64')

# Display the comparison DataFrame
comparison_df


Unnamed: 0,category_pair,native_overlap_count,theta_overlap_count,percent_difference
0,automotive + books and literature,246,298,-21.3
1,books and literature + business and finance,1708,2091,-22.4
2,business and finance + careers,5571,4986,10.5
3,careers + education,357,299,16.0
4,education + events and attractions,993,1112,-12.0
5,events and attractions + family and relationships,16651,16971,-1.9
6,family and relationships + fine art,1788,2310,-29.2
7,fine art + food and drink,570,575,-0.9
8,food and drink + healthy living,925,575,37.8
9,healthy living + hobbies and interests,1162,1031,11.2
