In [1]:
import pandas as pd
import time
from tqdm import trange
import json

In [2]:
dtypes = {
    "Stkcd": "int32",
    "Opnprc": "float32",
    "Hiprc": "float32",
    "Loprc": "float32",
    "Clsprc": "float32",
    "PrevClsprc": "float32",
}

df = pd.read_csv(f"../../data/TRD_Dalyr_with_PrevClsprc.csv", dtype=dtypes)

df["Trddt"] = pd.to_datetime(df["Trddt"])
test_round = 100

In [3]:
benchmark_results = {
    "metadata": {
        "test_rounds": test_round,
        "data_size": len(df),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    },
    "Result": {},
}

In [4]:

start_time = time.time()
for i in trange(test_round, desc="Where on Stkcd"):
    df["DailyReturn"] = (df["Clsprc"] - df["PrevClsprc"]) / df["PrevClsprc"] * 100.0
    t = df[df["DailyReturn"] > 0.0]
    agg_dict = {
        "DailyReturn": ["max", "mean"],
        "Trddt": ["count",]
    }
    t = t.groupby("Stkcd").agg(agg_dict)
    t = t.sort_values(("DailyReturn", "mean"), ascending=False)
benchmark_results["Result"]["DailyReturn"] = (time.time() - start_time) * 1000 / test_round
benchmark_results["Result"]["DailyReturn"]

Where on Stkcd: 100%|██████████| 100/100 [00:11<00:00,  8.85it/s]


113.02371263504028

In [5]:
df = df.drop("DailyReturn", axis=1)
# df.groupby("Stkcd").resample("90D").agg(agg_dict)
df.head()

Unnamed: 0,Stkcd,Trddt,Opnprc,Hiprc,Loprc,Clsprc,PrevClsprc
0,1,2020-02-19,15.1,15.37,15.08,15.24,
1,1,2020-02-20,15.27,15.62,15.1,15.59,15.24
2,1,2020-02-21,15.49,15.72,15.45,15.58,15.59
3,1,2020-02-24,15.46,15.46,15.15,15.23,15.58
4,1,2020-02-25,15.0,15.13,14.78,15.04,15.23


In [6]:
agg_dict = {
    "Clsprc": ["mean"],
    "Opnprc": ["mean"],
    "Hiprc": ["mean"],
    "Loprc": ["mean"],
    "PrevClsprc": ["mean"]
}

In [7]:
df.head()

Unnamed: 0,Stkcd,Trddt,Opnprc,Hiprc,Loprc,Clsprc,PrevClsprc
0,1,2020-02-19,15.1,15.37,15.08,15.24,
1,1,2020-02-20,15.27,15.62,15.1,15.59,15.24
2,1,2020-02-21,15.49,15.72,15.45,15.58,15.59
3,1,2020-02-24,15.46,15.46,15.15,15.23,15.58
4,1,2020-02-25,15.0,15.13,14.78,15.04,15.23


In [8]:
df = df.set_index("Trddt")
test_round = 10

In [9]:
start_time = time.time()
for i in trange(test_round, desc="Where on Stkcd"):
    t = df.groupby("Stkcd").resample("7D").agg(agg_dict)
    t["WeeklyReturn"] = (t[("Clsprc", "mean")] - t[("PrevClsprc", "mean")]) / t[("PrevClsprc", "mean")] * 100.0
    t = t[t["WeeklyReturn"] > 0.0]
    agg_dict2 = {
        ("WeeklyReturn", ""): ["max", "mean"],
        ('Clsprc', 'mean'): ["count",]
    }
    t = t.groupby("Stkcd").agg(agg_dict2)
    t = t.sort_values(("WeeklyReturn", "", "mean"), ascending=False)
benchmark_results["Result"]["WeeklyReturn"] = (time.time() - start_time) * 1000 / test_round
benchmark_results["Result"]["WeeklyReturn"]

Where on Stkcd: 100%|██████████| 10/10 [01:52<00:00, 11.21s/it]


11214.395880699158

In [10]:
start_time = time.time()
for i in trange(test_round, desc="Where on Stkcd"):
    t = df.groupby("Stkcd").resample("30D").agg(agg_dict)
    t["MonthlyReturn"] = (t[("Clsprc", "mean")] - t[("PrevClsprc", "mean")]) / t[("PrevClsprc", "mean")] * 100.0
    t = t[t["MonthlyReturn"] > 0.0]
    agg_dict2 = {
        ("MonthlyReturn", ""): ["max", "mean"],
        ('Clsprc', 'mean'): ["count",]
    }
    t = t.groupby("Stkcd").agg(agg_dict2)
    t = t.sort_values(("MonthlyReturn", "", "mean"), ascending=False)
benchmark_results["Result"]["MonthlyReturn"] = (time.time() - start_time) * 1000 / test_round
benchmark_results["Result"]["MonthlyReturn"]

Where on Stkcd: 100%|██████████| 10/10 [01:50<00:00, 11.01s/it]


11006.831574440002

In [11]:
start_time = time.time()
for i in trange(test_round, desc="Where on Stkcd"):
    t = df.groupby("Stkcd").resample("365D").agg(agg_dict)
    t["YearlyReturn"] = (t[("Clsprc", "mean")] - t[("PrevClsprc", "mean")]) / t[("PrevClsprc", "mean")] * 100.0
    t = t[t["YearlyReturn"] > 0.0]
    agg_dict2 = {
        ("YearlyReturn", ""): ["max", "mean"],
        ('Clsprc', 'mean'): ["count",]
    }
    t = t.groupby("Stkcd").agg(agg_dict2)
    t = t.sort_values(("YearlyReturn", "", "mean"), ascending=False)
benchmark_results["Result"]["YearlyReturn"] = (time.time() - start_time) * 1000 / test_round
benchmark_results["Result"]["YearlyReturn"]

Where on Stkcd: 100%|██████████| 10/10 [01:49<00:00, 10.94s/it]


10940.002036094666

In [12]:
with open("../../results/pandas_real_benchmark_results.json", "w") as f:
    json.dump(benchmark_results, f, indent=4)

# 打印结果
print("Benchmark results saved to 'pandas_real_benchmark_results.json'")
print(benchmark_results)

Benchmark results saved to 'pandas_real_benchmark_results.json'
{'metadata': {'test_rounds': 100, 'data_size': 5897417, 'timestamp': '2025-05-08 21:09:44'}, 'Result': {'DailyReturn': 113.02371263504028, 'WeeklyReturn': 11214.395880699158, 'MonthlyReturn': 11006.831574440002, 'YearlyReturn': 10940.002036094666}}
