In [28]:
import json
import os
import time

import pandas as pd
import psycopg2
import requests

Ingest

In [2]:
# Location of the input workbook and the cap on how many rows are read from it.
DATA_DIR = "../../../data"
NUMBER_OF_ROWS = 50000

# Read at most NUMBER_OF_ROWS rows, then run ISSUE_DATE through pd.to_datetime
# as part of the same expression.
daily_revenue_df = pd.read_excel(
    f"{DATA_DIR}/Doanh _thu_ngay.xlsx",
    nrows=NUMBER_OF_ROWS,
    parse_dates=True,
).assign(ISSUE_DATE=lambda frame: pd.to_datetime(frame["ISSUE_DATE"]))

In [None]:

# Ingest the loaded revenue rows into the hyperbloom service, one POST per row:
# ISSUE_DATE is sent as the key and SUB_ID as the value, both converted to strings.
INGEST_URL = "http://127.0.0.1:5000/hyperbloom/hash"
INGEST_TIMEOUT_S = 10   # fail instead of blocking forever if the service stalls
PROGRESS_EVERY = 50     # print a progress line every N index labels
PAUSE_EVERY = 5000      # sleep one second every N index labels

# Half-open window [start, stop) of index labels to ingest; narrow it to resume
# a partial run. It follows NUMBER_OF_ROWS (the cap used when loading) rather
# than repeating the literal 50000.
ingest_start, ingest_stop = 0, NUMBER_OF_ROWS

# A single Session reuses the underlying connection across requests instead of
# opening a new one for every row.
with requests.Session() as session:
    # Walk only the two columns that are sent; iterrows() would build a full
    # Series object for every row.
    for idx, sub_id, issue_date in zip(
        daily_revenue_df.index,
        daily_revenue_df["SUB_ID"],
        daily_revenue_df["ISSUE_DATE"],
    ):
        if not (ingest_start <= idx < ingest_stop):
            continue

        body = {
            "key": str(issue_date),
            "value": str(sub_id),
        }

        # json= serializes the dict and sets the Content-Type header.
        res = session.post(INGEST_URL, json=body, timeout=INGEST_TIMEOUT_S)

        # Stop at the first rejected record and show what the service said.
        if res.status_code >= 400:
            print("Got the error:", res.text)
            break

        if idx % PROGRESS_EVERY == 0:
            print("Record", idx, issue_date, sub_id)

        if idx % PAUSE_EVERY == 0:
            time.sleep(1)
    

Check existence

In [17]:
# Query the /hyperbloom/exists endpoint for one (key, value) pair and show the raw reply.
body = {"key": "2023-06-13 00:00:00", "value": "30012648"}
res = requests.post(
    "http://127.0.0.1:5000/hyperbloom/exists",
    json=body,    # serializes the dict and sets Content-Type: application/json
    timeout=10,   # seconds; do not block forever if the service stalls
)
print(res.text)

(30012648) ⪽ (2023-06-13 00:00:00) = true



Check cardinality

In [24]:
# Query the /hyperbloom/card endpoint for one key and show the raw reply.
body = {"key": "2023-01-06 00:00:00"}
res = requests.get(
    "http://127.0.0.1:5000/hyperbloom/card",
    params=body,  # let requests URL-encode the key (it contains spaces and colons)
    timeout=10,   # seconds; do not block forever if the service stalls
)
print(res.text)

Cardinality (bloom, hyperloglog) = (1524, 1526)


Check similarity

In [38]:
# Query the /hyperbloom/sim endpoint for two keys and show the raw reply.
body = {
    "key_1": "2023-11-07 00:00:00",
    "key_2": "2023-11-08 00:00:00",
}
res = requests.post(
    "http://127.0.0.1:5000/hyperbloom/sim",
    json=body,    # serializes the dict and sets Content-Type: application/json
    timeout=10,   # seconds; do not block forever if the service stalls
)
print(res.text)

Jaccard similarity = 0.422801


Check chained existence

In [20]:
# Query the /hyperbloom/exists/chaining endpoint: one value against several keys,
# combined with the AND operator. Prints the raw reply.
body = {
    "keys": [
        "2023-01-07 00:00:00",
        "2023-01-08 00:00:00",
        "2023-01-06 00:00:00",
    ],
    "value": "43914221",
    "operator": "AND",
}
res = requests.post(
    "http://127.0.0.1:5000/hyperbloom/exists/chaining",
    json=body,    # serializes the dict and sets Content-Type: application/json
    timeout=10,   # seconds; do not block forever if the service stalls
)
print(res.text)

AND chaining exists = false


In [21]:
# Query the /hyperbloom/exists/chaining endpoint: one value against several keys,
# combined with the OR operator. Prints the raw reply.
body = {
    "keys": [
        "2023-01-07 00:00:00",
        "2023-01-08 00:00:00",
        "2023-01-06 00:00:00",
    ],
    "value": "148186785",
    "operator": "OR",
}
res = requests.post(
    "http://127.0.0.1:5000/hyperbloom/exists/chaining",
    json=body,    # serializes the dict and sets Content-Type: application/json
    timeout=10,   # seconds; do not block forever if the service stalls
)
print(res.text)

OR chaining exists = true


Check bitwise existence

In [34]:
# Query the /hyperbloom/exists/bitwise endpoint: one value against several keys,
# combined with the AND operator. Prints the raw reply.
body = {
    "keys": [
        "2023-01-07 00:00:00",
        "2023-01-08 00:00:00",
        "2023-01-06 00:00:00",
    ],
    "value": "43914221",
    "operator": "AND",
}
res = requests.post(
    "http://127.0.0.1:5000/hyperbloom/exists/bitwise",
    json=body,    # serializes the dict and sets Content-Type: application/json
    timeout=10,   # seconds; do not block forever if the service stalls
)
print(res.text)

AND bitwise exists = false


In [35]:
# Query the /hyperbloom/exists/bitwise endpoint: one value against several keys,
# combined with the OR operator. Prints the raw reply.
body = {
    "keys": [
        "2023-01-07 00:00:00",
        "2023-01-08 00:00:00",
        "2023-01-06 00:00:00",
    ],
    "value": "148186785",
    "operator": "OR",
}
res = requests.post(
    "http://127.0.0.1:5000/hyperbloom/exists/bitwise",
    json=body,    # serializes the dict and sets Content-Type: application/json
    timeout=10,   # seconds; do not block forever if the service stalls
)
print(res.text)

OR bitwise exists = true


Archive data

In [36]:
# Dump the `hyperblooms` and `hyperblooms_metadata` tables to local text files
# using the cursor's copy_to.
#
# Connection settings are read from the libpq-style PG* environment variables
# when they are set; the fallbacks are the values this cell used to hard-code.
host     = os.environ.get("PGHOST", "127.0.0.1")
port     = int(os.environ.get("PGPORT", "5432"))
user     = os.environ.get("PGUSER", "admin")
password = os.environ.get("PGPASSWORD", "123")  # local-dev fallback only; set PGPASSWORD elsewhere
dbname   = os.environ.get("PGDATABASE", "postgres")

# Keyword arguments instead of a hand-built URI: a password containing
# characters such as "@" or "/" would corrupt an interpolated connection string.
conn = psycopg2.connect(
    host=host,
    port=port,
    user=user,
    password=password,
    dbname=dbname,
)
try:
    with (
            conn.cursor() as cursor,
            open("hyperblooms.txt", "w") as hb_file,
            open("hyperblooms_metadata.txt", "w") as hb_meta_file
        ):
        cursor.copy_to(hb_file, "hyperblooms")
        cursor.copy_to(hb_meta_file, "hyperblooms_metadata")
finally:
    # Release the connection even if one of the copies fails.
    conn.close()


