## setup

In [8]:
import redis
import json
from multiprocessing import Process, cpu_count
import json
import pandas as pd
import numpy as np

from typing import List

In [9]:
# Name of the metric used as the last segment of the comparison keys below.
metric = "pitch_histogram"

# JSON file that is loaded into `properties` in the setup cell.
properties_path = "../data/outputs/careful.json"
# Dataset location (not read by any cell visible in this notebook section).
dataset_path = "../data/datasets/careful"

In [10]:
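# RediSearch index definition over the `cmp:`-prefixed JSON documents
# (kept here for reference only; this notebook does not execute it):
# FT.CREATE idx:table ON JSON PREFIX 1 cmp: SCHEMA $.sim AS sim NUMERIC $.row_file AS row_file TEXT $.col_file AS col_file TEXT $.metric AS metric TEXT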

In [12]:
# redis setup
redis_url = "redis://localhost:6379"
# redis.Redis() treats its first positional argument as a bare host name, so a
# full URL has to be parsed with from_url() to get the right host and port.
r = redis.Redis.from_url(redis_url)


# load from fs: `properties` is the parsed JSON file; its top-level keys are
# used as the list of names to split across worker processes.
with open(properties_path, "r") as f:
    properties = json.load(f)

names = sorted(properties.keys())

# Split the names evenly over the available CPUs; `extra_rows` is the remainder
# that does not divide evenly.
num_processes = cpu_count()
rows_per_process, extra_rows = divmod(len(names), num_processes)
print(f"{len(names)} & {num_processes} -> {rows_per_process} + {extra_rows}")

3868 & 12 -> 322 + 4


## general tests

In [None]:
# Fetch the stored JSON document for one (row file, column file, metric) key.
name1 = "20240121-70-06_0096-0104.mid"  # names[0]
name2 = "20240227-76-05_0128-0136.mid"  # names[-1]
print(f"{name1} {name2} {metric}")

comparison_key = f"cmp:{name1}:{name2}:{metric}"
# Bare last expression so the notebook displays the fetched document.
r.json().get(comparison_key)

In [None]:
def scan_keys(r, pattern):
    """Collect every key matching ``pattern`` by iterating SCAN to completion.

    Args:
        r: Client object exposing ``scan(cursor, match=...)`` that returns a
            ``(next_cursor, keys)`` pair.
        pattern: Glob-style match pattern forwarded to ``scan``.

    Returns:
        A list with the keys from every page, in the order they were returned.
    """
    matched = []

    # The first call always starts from cursor 0.
    position, batch = r.scan(0, match=pattern)
    matched.extend(batch)

    # A returned cursor of 0 signals that the iteration is complete.
    while position != 0:
        position, batch = r.scan(position, match=pattern)
        matched.extend(batch)

    return matched


# Pattern to match: every key whose first segment is this file and whose last
# segment is the pitch_histogram metric.
# NOTE(review): this pattern has no `cmp:` prefix, unlike the key fetched with
# r.json().get(...) earlier in the notebook — confirm which key layout is current.
pattern = "20231220-80-01_0000-0008.mid:*:pitch_histogram"

# Get all keys matching the pattern (left disabled)
# matching_keys = scan_keys(r, pattern)
# print(f"Keys matching pattern '{pattern}': {matching_keys}")

In [None]:
def process_json_keys(redis_conn):
    """Add ``row_file``/``col_file``/``metric`` fields to every JSON document.

    Walks the whole keyspace with SCAN. For each key whose type is
    ``ReJSON-RL`` and whose name has the form ``row_file:col_file:metric``,
    the three name segments are written into the stored document and the
    document is saved back at the root path.

    Keys of other types, empty or non-object documents, and keys that do not
    split into exactly three ``:``-separated segments are left untouched.

    Args:
        redis_conn: Redis client used for both reading and writing.

    Returns:
        None. Progress is printed after each SCAN page.
    """
    json_client = redis_conn.json()
    cursor = 0
    while True:
        cursor, keys = redis_conn.scan(cursor=cursor, count=1000)
        for key in keys:
            key_type = redis_conn.execute_command("TYPE", key)
            # The reply is bytes unless the client decodes responses.
            if isinstance(key_type, bytes):
                key_type = key_type.decode("utf-8")
            if key_type != "ReJSON-RL":
                # Ignore non-JSON objects
                continue

            value = json_client.get(key)
            if not value or not isinstance(value, dict):
                continue

            # Decode instead of str(key): str() on bytes yields "b'...'",
            # which leaks a quote character into the parsed fields.
            key_text = key.decode("utf-8") if isinstance(key, bytes) else str(key)
            parts = key_text.split(":")
            if len(parts) != 3:
                # Not a row:col:metric key (e.g. a scratch key); skip it.
                continue
            row_file, col_file, metric = parts

            value["row_file"] = row_file
            value["col_file"] = col_file
            value["metric"] = metric

            # Write through the connection that was passed in, not a global.
            json_client.set(key, "$", value)
        print(f"finished section {cursor}")

        # SCAN returns cursor 0 once the full keyspace has been visited.
        if cursor == 0:
            break


# Call the function
# NOTE: this scans the connected Redis instance and rewrites JSON documents in
# place, so only run it against a database you intend to modify.
process_json_keys(r)
print("DONE")

In [6]:
import redis
import pandas as pd
import numpy as np
import time

In [10]:
# Rebinds the notebook-wide client `r` to database 0 on the local Redis server.
r = redis.Redis(host="localhost", port=6379, db=0)

In [5]:
# Seed a tiny JSON document under the key "test"; nx=True is passed so the
# write only happens when the key does not exist yet.
seed_document = {"sim": 0.5}
r.json().set("test", "$", seed_document, nx=True)

stored_document = r.json().get("test")
print(stored_document)

{'sim': 0.5}


In [9]:
# Benchmark: read the same "$.sim" value n*n times from the "test" key through
# pipelines and collect the results into an n x n float16 matrix.
n = 4000
data = np.zeros((n, n), dtype=np.float16)

# Measure the time taken to populate the DataFrame
# (the timed region also covers DataFrame construction and the memory query).
start_time = time.time()

for i in range(n):
    # One pipeline per matrix row: queue n JSON.GET commands, then run them
    # together with a single execute() call.
    pipeline = r.pipeline()
    for j in range(n):
        pipeline.execute_command("JSON.GET", "test", "$.sim")

    results = pipeline.execute()

    # Each reply is indexed at [0]; presumably it is a one-element list produced
    # by the JSONPath query — TODO confirm against the client's reply parsing.
    data[i, :] = [float(result[0]) for result in results]

df = pd.DataFrame(data)
# Memory is measured before the cast below; `data` is already float16, so the
# cast should not change the dtype.
memory_usage = df.memory_usage(index=True).sum()
df = df.astype(np.float16)

end_time = time.time()
elapsed_time = end_time - start_time

# Verify the DataFrame
print(df.head())
print(f"Time taken to generate DataFrame: {elapsed_time:.2f} seconds")
print(f"Memory usage of DataFrame: {memory_usage / (1024 * 1024):.2f} MB")
# Release the large frame; kernel memory otherwise persists across cells.
del df

   0     1     2     3     4     5     6     7     8     9     ...  3990  \
0   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5  ...   0.5   
1   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5  ...   0.5   
2   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5  ...   0.5   
3   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5  ...   0.5   
4   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5  ...   0.5   

   3991  3992  3993  3994  3995  3996  3997  3998  3999  
0   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5  
1   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5  
2   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5  
3   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5  
4   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5   0.5  

[5 rows x 4000 columns]
Time taken to generate DataFrame: 79.40 seconds
Memory usage of DataFrame: 30.52 MB
