In [8]:
import json
import os
import time

import pandas as pd
from tqdm import trange

In [9]:
# Read-time dtypes: Stkcd as a 32-bit integer and the four price columns as
# 32-bit floats.
price_columns = ("Opnprc", "Hiprc", "Loprc", "Clsprc")
dtypes = {"Stkcd": "int32", **{column: "float32" for column in price_columns}}

# The input is split across six CSV files (TRD_Dalyr0 .. TRD_Dalyr5); read
# each one and stack them row-wise into a single frame with a fresh index.
df_list = [
    pd.read_csv(f"../../data/TRD_Dalyr{shard}.csv", dtype=dtypes)
    for shard in range(6)
]
df = pd.concat(df_list, axis=0, ignore_index=True)

# Convert Trddt to datetime64 so it can be used for joins and resampling.
df["Trddt"] = pd.to_datetime(df["Trddt"])

# Number of repetitions each benchmark is averaged over.
test_round = 100

In [10]:
# Result container: run metadata plus one (initially empty) dict per benchmark
# category. Each category is later filled with mean per-iteration timings.
benchmark_results = dict(
    metadata=dict(
        test_rounds=test_round,
        data_size=len(df),
        timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
    ),
    **{
        category: {}
        for category in ("where", "apply", "sort", "join", "groupby", "interval")
    },
)

In [11]:
# Benchmark boolean-mask row filtering. Each stored value is the mean time per
# iteration in milliseconds.
# time.perf_counter() is used instead of time.time(): it is monotonic and uses
# the highest-resolution clock available, so system clock adjustments cannot
# distort the measured interval.

# Filter on the integer stock-code column.
start_time = time.perf_counter()
for i in trange(test_round, desc="Where on Stkcd"):
    t = df[df["Stkcd"] <= 20000]
benchmark_results["where"]["Stkcd"] = (time.perf_counter() - start_time) * 1000 / test_round

# Filter on the float closing-price column.
start_time = time.perf_counter()
for i in trange(test_round, desc="Where on Clsprc"):
    t = df[df["Clsprc"] > 15.0]
benchmark_results["where"]["Clsprc"] = (time.perf_counter() - start_time) * 1000 / test_round

Where on Stkcd: 100%|██████████| 100/100 [00:02<00:00, 47.24it/s]
Where on Clsprc: 100%|██████████| 100/100 [00:03<00:00, 32.87it/s]


In [12]:
# Benchmark vectorised column arithmetic. Each stored value is the mean time
# per iteration in milliseconds, measured with the monotonic
# time.perf_counter() clock.
# The progress-bar labels now name the expression actually being measured
# (they previously read "Apply on Stkcd" / "Apply on Opnprc", which belong to
# the Series.apply benchmark and made the two cells indistinguishable).

# (close - open) / (high - low)
start_time = time.perf_counter()
for i in trange(test_round, desc="Apply Cls-Opn"):
    t = (df["Clsprc"] - df["Opnprc"]) / (df["Hiprc"] - df["Loprc"])
benchmark_results["apply"]["Cls-Opn"] = (time.perf_counter() - start_time) * 1000 / test_round

# (close - open) * 100
start_time = time.perf_counter()
for i in trange(test_round, desc="Apply Delta"):
    t = (df["Clsprc"] - df["Opnprc"]) * 100
benchmark_results["apply"]["Delta"] = (time.perf_counter() - start_time) * 1000 / test_round

Apply on Stkcd: 100%|██████████| 100/100 [00:01<00:00, 94.52it/s]
Apply on Opnprc: 100%|██████████| 100/100 [00:00<00:00, 166.78it/s]


In [13]:
def double_value(x):
    """Return ``x`` multiplied by 2.

    Used as the per-element callback for the ``Series.apply`` benchmarks, so
    the body is kept deliberately trivial.
    """
    return x * 2

# Benchmark element-wise Series.apply with a Python callback (double_value).
# Each stored value is the mean time per iteration in milliseconds, measured
# with time.perf_counter(), which is monotonic and high-resolution, unlike
# time.time().

# Integer column.
start_time = time.perf_counter()
for i in trange(test_round, desc="Apply on Stkcd"):
    t = df["Stkcd"].apply(double_value)
benchmark_results["apply"]["Stkcd"] = (time.perf_counter() - start_time) * 1000 / test_round

# Float column.
start_time = time.perf_counter()
for i in trange(test_round, desc="Apply on Opnprc"):
    t = df["Opnprc"].apply(double_value)
benchmark_results["apply"]["Opnprc"] = (time.perf_counter() - start_time) * 1000 / test_round

Apply on Stkcd: 100%|██████████| 100/100 [01:22<00:00,  1.21it/s]
Apply on Opnprc: 100%|██████████| 100/100 [00:53<00:00,  1.87it/s]


In [14]:
# Display the apply timings collected so far (mean milliseconds per iteration).
benchmark_results["apply"]

{'Cls-Opn': 10.591812133789062,
 'Delta': 6.005873680114746,
 'Stkcd': 824.2635464668274,
 'Opnprc': 533.8800764083862}

In [6]:
# Sort benchmark: full-frame sort_values on one integer and two float columns.
# Each stored value is the mean time per iteration in milliseconds, measured
# with time.perf_counter() (monotonic, high-resolution) rather than
# time.time().
for sort_column in ("Stkcd", "Clsprc", "Hiprc"):
    start_time = time.perf_counter()
    for i in trange(test_round, desc=f"Sort on {sort_column}"):
        t = df.sort_values(sort_column)
    benchmark_results["sort"][sort_column] = (time.perf_counter() - start_time) * 1000 / test_round

Sort on Stkcd: 100%|██████████| 100/100 [00:17<00:00,  5.57it/s]
Sort on Clsprc: 100%|██████████| 100/100 [00:29<00:00,  3.40it/s]
Sort on Hiprc: 100%|██████████| 100/100 [00:29<00:00,  3.39it/s]


In [7]:
# Join benchmark: left-merge two Stkcd slices of the frame on the trading date.
#
# NOTE(review): the "0_20" / "20_40" suffixes in the variable names and in the
# result key do not match the ranges actually selected, which are
# [0, stkcd_span] and [stkcd_span, 2 * stkcd_span]. The names are kept
# unchanged because the result key is written to the JSON output.
# NOTE(review): Series.between is inclusive on both ends by default, so rows
# with Stkcd == stkcd_span fall into both subsets. Left as-is so the measured
# workload stays the same; confirm whether the overlap is intended.
stkcd_span = 1000
df_subset_0_20 = df[df["Stkcd"].between(0, stkcd_span)]
df_subset_20_40 = df[df["Stkcd"].between(stkcd_span, stkcd_span * 2)]

# Mean time per iteration in milliseconds, measured with time.perf_counter()
# (monotonic, high-resolution) rather than time.time().
start_time = time.perf_counter()
for i in trange(test_round, desc="Join on Stkcd"):
    t = df_subset_0_20.merge(df_subset_20_40, on="Trddt", how="left")
benchmark_results["join"]["Stkcd_0_20_vs_20_40"] = (time.perf_counter() - start_time) * 1000 / test_round

Join on Stkcd: 100%|██████████| 100/100 [01:37<00:00,  1.02it/s]


In [8]:
# Aggregations computed per group: max, min and mean of each of the four price
# columns. agg_dict is also reused by the interval (resample) benchmarks.
agg_dict = {
    column: ["max", "min", "mean"]
    for column in ("Clsprc", "Opnprc", "Hiprc", "Loprc")
}

# Group by stock code and aggregate. The stored value is the mean time per
# iteration in milliseconds, measured with time.perf_counter() (monotonic,
# high-resolution) rather than time.time().
start_time = time.perf_counter()
for i in trange(test_round, desc="Groupby on Stkcd (Clsprc)"):
    t = df.groupby("Stkcd").agg(agg_dict)
benchmark_results["groupby"]["Stkcd_Clsprc"] = (time.perf_counter() - start_time) * 1000 / test_round

Groupby on Stkcd (Clsprc): 100%|██████████| 100/100 [00:21<00:00,  4.65it/s]


In [9]:
# The interval benchmarks call resample(), which needs a datetime index, so
# move Trddt into the index. The guard keeps this cell idempotent: re-running
# it after Trddt is already the index would otherwise raise a KeyError.
if df.index.name != "Trddt":
    df = df.set_index("Trddt")

# The interval benchmarks run a tenth of the usual rounds, but always at least
# one so the per-round average never divides by zero when test_round < 10.
interval_test_round = max(1, test_round // 10)

In [10]:
# Interval benchmark: per-stock resampling into 30-day bins, aggregated with
# agg_dict. The stored value is the mean time per iteration in milliseconds,
# measured with time.perf_counter() (monotonic, high-resolution) rather than
# time.time().
start_time = time.perf_counter()
for i in trange(interval_test_round, desc="Interval 30D"):
    t = df.groupby("Stkcd").resample("30D").agg(agg_dict)
benchmark_results["interval"]["30D"] = (time.perf_counter() - start_time) * 1000 / interval_test_round

Interval 30D: 100%|██████████| 10/10 [04:20<00:00, 26.01s/it]


In [11]:
# Interval benchmark: per-stock resampling into 90-day bins, aggregated with
# agg_dict. The stored value is the mean time per iteration in milliseconds,
# measured with time.perf_counter() (monotonic, high-resolution) rather than
# time.time().
start_time = time.perf_counter()
for i in trange(interval_test_round, desc="Interval 90D"):
    t = df.groupby("Stkcd").resample("90D").agg(agg_dict)
benchmark_results["interval"]["90D"] = (time.perf_counter() - start_time) * 1000 / interval_test_round

Interval 90D: 100%|██████████| 10/10 [04:18<00:00, 25.83s/it]


In [12]:
# Interval benchmark: per-stock resampling into 365-day bins, aggregated with
# agg_dict. The stored value is the mean time per iteration in milliseconds,
# measured with time.perf_counter() (monotonic, high-resolution) rather than
# time.time().
start_time = time.perf_counter()
for i in trange(interval_test_round, desc="Interval 365D"):
    t = df.groupby("Stkcd").resample("365D").agg(agg_dict)
benchmark_results["interval"]["365D"] = (time.perf_counter() - start_time) * 1000 / interval_test_round

Interval 365D: 100%|██████████| 10/10 [04:18<00:00, 25.86s/it]


In [13]:
# Persist the collected timings as JSON.
# The output directory is created first so that a missing "results" folder
# cannot throw away a long benchmark run with a FileNotFoundError, and the
# encoding is set explicitly so the file does not depend on the platform
# default.
results_path = "../../results/pandas_benchmark_results.json"
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, "w", encoding="utf-8") as f:
    json.dump(benchmark_results, f, indent=4)

# Print the results
print("Benchmark results saved to 'pandas_benchmark_results.json'")
print(benchmark_results)

Benchmark results saved to 'pandas_benchmark_results.json'
{'metadata': {'test_rounds': 100, 'data_size': 5897417, 'timestamp': '2025-03-25 18:38:22'}, 'where': {'Stkcd': 19.680538177490234, 'Clsprc': 31.834053993225098}, 'apply': {'Stkcd': 807.4906301498413, 'Opnprc': 548.7237620353699}, 'sort': {'Stkcd': 179.55297708511353, 'Clsprc': 294.1118812561035, 'Hiprc': 295.24622917175293}, 'join': {'Stkcd_0_20_vs_20_40': 976.5320324897766}, 'groupby': {'Stkcd_Clsprc': 215.29688358306885}, 'interval': {'30D': 26009.100246429443, '90D': 25825.72753429413, '365D': 25861.178874969482}}
