In [10]:
import json
import os

import pandas as pd
import psycopg2
import requests

Ingest

In [None]:
# Where the input workbook lives, and an optional cap on how many rows to
# read (passed to pandas as `nrows`; None applies no limit).
DATA_DIR = "../../data"
NUMBER_OF_ROWS = None

# Load the daily revenue workbook into a DataFrame.
daily_revenue_path = f"{DATA_DIR}/Doanh _thu_ngay.xlsx"
daily_revenue_df = pd.read_excel(daily_revenue_path, parse_dates=True, nrows=NUMBER_OF_ROWS)

# Make sure ISSUE_DATE holds pandas datetimes regardless of how the sheet
# stored it; the ingest step later stringifies these values.
daily_revenue_df["ISSUE_DATE"] = pd.to_datetime(daily_revenue_df["ISSUE_DATE"])

In [None]:
# Ingest (ISSUE_DATE, SUB_ID) pairs into the hyperbloom service: one POST to
# /hyperbloom/hash per row, with the stringified date as "key" and the
# stringified subscriber id as "value".

# First index label to send; rows with a smaller label are skipped.
# NOTE(review): presumably the point where an earlier run stopped — set to 0
# to ingest the whole frame.
RESUME_FROM_INDEX = 217950
HASH_URL = "http://127.0.0.1:5000/hyperbloom/hash"
REQUEST_TIMEOUT_SECONDS = 10  # without a timeout a stalled service hangs the cell forever
PROGRESS_EVERY = 50           # print a progress line every N index labels

# Select only the rows still to be sent and only the two columns that are
# used, instead of walking (and printing progress for) every skipped row.
pending_rows = daily_revenue_df.loc[
    daily_revenue_df.index >= RESUME_FROM_INDEX,
    ["ISSUE_DATE", "SUB_ID"],
]

# A single Session reuses the underlying connection across all requests
# rather than opening a new one per row.
with requests.Session() as session:
    for idx, issue_date, sub_id in pending_rows.itertuples(index=True, name=None):
        body = {
            "key": str(issue_date),
            "value": str(sub_id),
        }

        try:
            # `json=` serialises the dict and sets Content-Type: application/json.
            res = session.post(HASH_URL, json=body, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            # Connection failure or timeout: report where we stopped so the
            # run can be resumed from this record.
            print("Got the error:", exc)
            print("Stopped at record", idx)
            break

        if res.status_code >= 400:
            print("Got the error:", res.text)
            print("Stopped at record", idx)
            break

        if idx % PROGRESS_EVERY == 0:
            print("Record", idx, issue_date, sub_id)

Check existence

In [3]:
# Ask the hyperbloom service whether the value "40751872" was recorded under
# the key "2023-06-06 00:00:00", then print the service's raw text reply.
body = {"key": "2023-06-06 00:00:00", "value": "40751872"}
res = requests.post(
    "http://127.0.0.1:5000/hyperbloom/exists",
    json=body,   # serialises the dict and sets Content-Type: application/json
    timeout=30,  # fail with an exception instead of hanging if the service is unreachable
)
print(res.text)

(40751872) ⪽ (2023-06-06 00:00:00) = true



Check cardinality

In [4]:
# Fetch the cardinality reported by the hyperbloom service for one key and
# print the service's raw text reply.
body = {"key": "2023-11-07 00:00:00"}
res = requests.get(
    "http://127.0.0.1:5000/hyperbloom/card",
    # Let requests build and URL-encode the query string; pasting the raw key
    # into the URL breaks as soon as a key contains '&', '#' or '+'.
    params=body,
    timeout=30,  # fail with an exception instead of hanging if the service is unreachable
)
print(res.text)

Cardinality (bloom, hyperloglog) = (1510, 1504)


Check similarity

In [7]:
# Ask the hyperbloom service for the similarity between the sets stored under
# two keys and print the service's raw text reply.
body = {
    "key_1": "2023-11-07 00:00:00",
    "key_2": "2023-11-08 00:00:00",
}
res = requests.post(
    "http://127.0.0.1:5000/hyperbloom/sim",
    json=body,   # serialises the dict and sets Content-Type: application/json
    timeout=30,  # fail with an exception instead of hanging if the service is unreachable
)
print(res.text)

Jaccard similarity = 0.423215


Check chained existence

In [5]:
# Chained existence check with the "AND" operator: send one value together
# with several keys to /hyperbloom/exists/chaining and print the raw reply.
body = {
    "keys": [
        "2023-01-07 00:00:00",
        "2023-01-08 00:00:00",
        # "2023-01-06 00:00:00",
    ],
    "value": "43914221",
    "operator": "AND",
}
res = requests.post(
    "http://127.0.0.1:5000/hyperbloom/exists/chaining",
    json=body,   # serialises the dict and sets Content-Type: application/json
    timeout=30,  # fail with an exception instead of hanging if the service is unreachable
)
print(res.text)

AND chaining exists = true


In [6]:
# Chained existence check with the "OR" operator: send one value together
# with several keys to /hyperbloom/exists/chaining and print the raw reply.
body = {
    "keys": [
        "2023-01-07 00:00:00",
        "2023-01-08 00:00:00",
        # "2023-01-06 00:00:00",
    ],
    "value": "43914221",
    "operator": "OR",
}
res = requests.post(
    "http://127.0.0.1:5000/hyperbloom/exists/chaining",
    json=body,   # serialises the dict and sets Content-Type: application/json
    timeout=30,  # fail with an exception instead of hanging if the service is unreachable
)
print(res.text)

OR chaining exists = true


Archive data

In [13]:
# Dump the `bloom_filters` table from PostgreSQL into the local file
# blooms.txt.

# NOTE(review): this statement is never executed. The export below uses
# cursor.copy_to(), which takes a table name rather than a query and, with its
# default arguments, writes tab-separated text without a header row. If a CSV
# file with a header is what is wanted, run this statement through
# cursor.copy_expert(query, file) instead — confirm the intended format.
query = "COPY (SELECT * FROM bloom_filters) TO stdout DELIMITER ',' CSV HEADER"

host     = "127.0.0.1"
port     = 5432
user     = "admin"
# Prefer a password supplied through the environment so real credentials do
# not have to live in the notebook; the literal is only the local fallback.
password = os.environ.get("PGPASSWORD", "123")
dbname   = "postgres"

# Pass the settings as keyword arguments instead of formatting them into a
# URL: a password containing '@', '/' or ':' would corrupt a hand-built URL.
conn = psycopg2.connect(
    host=host,
    port=port,
    user=user,
    password=password,
    dbname=dbname,
)
try:
    # The cursor and the output file are both closed when the block exits.
    with conn.cursor() as cursor, open("blooms.txt", "w", encoding="utf-8") as file:
        cursor.copy_to(file, "bloom_filters")
finally:
    # Leaving a psycopg2 `with conn:` block does not close the connection,
    # so close it explicitly to release the server-side session.
    conn.close()
