In [16]:
import json
import os
import time

import cudf
import pandas as pd
from tqdm import trange

In [17]:
# Column dtypes passed to cudf.read_csv for the stock code and the four price columns.
dtypes = dict(
    Stkcd="int32",
    Opnprc="float32",
    Hiprc="float32",
    Loprc="float32",
    Clsprc="float32",
)

# Read the six CSV shards (TRD_Dalyr0 ... TRD_Dalyr5) and stack them row-wise
# into a single frame with a fresh index.
df_list = [
    cudf.read_csv(f"../../data/TRD_Dalyr{i}.csv", dtype=dtypes)
    for i in range(6)
]
df = cudf.concat(df_list, axis=0, ignore_index=True)

# Convert the Trddt column to datetime in place on the combined frame.
df["Trddt"] = cudf.to_datetime(df["Trddt"])

# Number of repetitions each benchmark loop below runs.
test_round = 100

In [18]:
# Run-level information stored alongside the timings.
run_metadata = {
    "test_rounds": test_round,
    "data_size": len(df),
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
}

# Result container: one metadata entry followed by an (initially empty) dict per
# benchmark family, filled in by the cells below. Key order is kept because it
# determines the layout of the JSON file written at the end.
benchmark_results = {"metadata": run_metadata}
for section in ("where", "apply", "sort", "join", "groupby"):
    benchmark_results[section] = {}

In [19]:

# Filter ("where") benchmarks: boolean-mask row selection on an integer column and
# on a float column. Each result is the mean time per round in milliseconds.
#
# time.perf_counter() is used instead of time.time(): it is monotonic and has the
# highest available resolution, so the measurement cannot be skewed by system
# clock adjustments during the run.
# NOTE(review): the timed region includes tqdm's per-iteration overhead. Also
# confirm whether cuDF has finished the GPU work when each statement returns;
# if not, an explicit device sync is needed for these numbers to be accurate.
start_time = time.perf_counter()
for round_idx in trange(test_round, desc="Where on Stkcd"):
    t = df[df["Stkcd"] <= 20000]
benchmark_results["where"]["Stkcd"] = (time.perf_counter() - start_time) * 1000 / test_round

start_time = time.perf_counter()
for round_idx in trange(test_round, desc="Where on Clsprc"):
    t = df[df["Clsprc"] > 15.0]
benchmark_results["where"]["Clsprc"] = (time.perf_counter() - start_time) * 1000 / test_round


Where on Stkcd: 100%|██████████| 100/100 [00:00<00:00, 107.84it/s]
Where on Clsprc: 100%|██████████| 100/100 [00:00<00:00, 106.97it/s]


In [20]:
# Column-arithmetic ("apply") benchmarks. Each result is the mean time per round
# in milliseconds, measured with the monotonic time.perf_counter() clock.
#
# The progress-bar labels name the result key being measured; the previous labels
# ("Apply on Stkcd" / "Apply on Opnprc") did not describe the expressions timed here.

# "Cls-Opn": (close - open) divided by (high - low), element-wise.
start_time = time.perf_counter()
for round_idx in trange(test_round, desc="Apply Cls-Opn"):
    t = (df["Clsprc"] - df["Opnprc"]) / (df["Hiprc"] - df["Loprc"])
benchmark_results["apply"]["Cls-Opn"] = (time.perf_counter() - start_time) * 1000 / test_round

# "Delta": (close - open) scaled by 100, element-wise.
start_time = time.perf_counter()
for round_idx in trange(test_round, desc="Apply Delta"):
    t = (df["Clsprc"] - df["Opnprc"]) * 100
benchmark_results["apply"]["Delta"] = (time.perf_counter() - start_time) * 1000 / test_round

Apply on Stkcd: 100%|██████████| 100/100 [00:00<00:00, 133.48it/s]
Apply on Opnprc: 100%|██████████| 100/100 [00:00<00:00, 208.14it/s]


In [None]:
# NOTE(review): disabled benchmark of Series.apply with a Python function
# (double_value). The whole cell is commented out, yet the saved outputs still show
# its progress bars and its "Stkcd"/"Opnprc" entries in benchmark_results["apply"];
# a fresh Restart & Run All will not reproduce them. Either re-enable this cell or
# clear the stale outputs.

# def double_value(x):
#     return x * 2

# start_time = time.time()
# for i in trange(test_round, desc="Apply on Stkcd"):
#     t = df["Stkcd"].apply(double_value)
# benchmark_results["apply"]["Stkcd"] = (time.time() - start_time) * 1000 / test_round

# start_time = time.time()
# for i in trange(test_round, desc="Apply on Opnprc"):
#     t = df["Opnprc"].apply(double_value)
# benchmark_results["apply"]["Opnprc"] = (time.time() - start_time) * 1000 / test_round

Apply on Stkcd: 100%|██████████| 100/100 [00:00<00:00, 142.89it/s]
Apply on Opnprc: 100%|██████████| 100/100 [00:02<00:00, 47.78it/s]


In [22]:
# Display the "apply" timings collected so far (mean milliseconds per round).
benchmark_results["apply"]

{'Cls-Opn': 7.505254745483398,
 'Delta': 4.813425540924072,
 'Stkcd': 7.009472846984863,
 'Opnprc': 20.93878746032715}

In [None]:
# Sort operations: time a full-frame sort_values on each listed column.
# Each result is the mean time per round in milliseconds and is stored under the
# column name. time.perf_counter() is used because it is monotonic and
# high-resolution, unlike the wall-clock time.time().
for sort_col in ("Stkcd", "Clsprc", "Hiprc"):
    start_time = time.perf_counter()
    for round_idx in trange(test_round, desc=f"Sort on {sort_col}"):
        t = df.sort_values(sort_col)
    benchmark_results["sort"][sort_col] = (time.perf_counter() - start_time) * 1000 / test_round

In [None]:
# Join benchmark: left-merge two Stkcd slices of df on the trading date (Trddt).
#
# NOTE(review): the "0_20" / "20_40" in the variable names and result key do not
# match the ranges actually used, which are 0..stkcd_span and
# stkcd_span..2*stkcd_span (i.e. 0..1000 and 1000..2000). The names are kept so the
# result key stays stable for downstream consumers.
# NOTE(review): pandas' Series.between is inclusive on both ends by default;
# presumably cuDF matches, in which case Stkcd == stkcd_span falls in both
# slices. Confirm whether that overlap is intended.
stkcd_span = 1000
df_subset_0_20 = df[df["Stkcd"].between(0, stkcd_span)]
df_subset_20_40 = df[df["Stkcd"].between(stkcd_span, stkcd_span * 2)]

# Mean time per round in milliseconds, measured with the monotonic
# time.perf_counter() clock rather than wall-clock time.time().
start_time = time.perf_counter()
for round_idx in trange(test_round, desc="Join on Stkcd"):
    t = df_subset_0_20.merge(df_subset_20_40, on="Trddt", how="left")
benchmark_results["join"]["Stkcd_0_20_vs_20_40"] = (time.perf_counter() - start_time) * 1000 / test_round

In [None]:
# Groupby benchmark: group by Stkcd and compute max/min/mean of every price column.
# Each column gets its own list of aggregation names.
price_cols = ("Clsprc", "Opnprc", "Hiprc", "Loprc")
agg_map = {col: ["max", "min", "mean"] for col in price_cols}

# Mean time per round in milliseconds, measured with the monotonic
# time.perf_counter() clock rather than wall-clock time.time().
# NOTE(review): the label and result key mention only Clsprc, but all four price
# columns are aggregated; the key is kept unchanged for downstream consumers.
start_time = time.perf_counter()
for round_idx in trange(test_round, desc="Groupby on Stkcd (Clsprc)"):
    t = df.groupby("Stkcd").agg(agg_map)
benchmark_results["groupby"]["Stkcd_Clsprc"] = (time.perf_counter() - start_time) * 1000 / test_round

In [None]:
# Persist all collected timings as pretty-printed JSON.
results_path = "../../results/cudf_benchmark_results.json"

# Create the output directory first: without it, open() raises FileNotFoundError
# and the results of the whole benchmark run are lost.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, "w") as f:
    json.dump(benchmark_results, f, indent=4)