In [1]:
import cudf
import time
from tqdm import trange
import pandas as pd
import json

In [2]:
# Number of timed repetitions used by the benchmark cells.
test_round = 100

# Explicit column dtypes for the CSV reader: the stock code is read as int32
# and every price column as float32.
dtypes = {"Stkcd": "int32"}
for price_column in ("Opnprc", "Hiprc", "Loprc", "Clsprc", "PrevClsprc"):
    dtypes[price_column] = "float32"

df = cudf.read_csv("../../data/TRD_Dalyr_with_PrevClsprc.csv", dtype=dtypes)

# Convert the trade-date column to a datetime column.
df["Trddt"] = cudf.to_datetime(df["Trddt"])

In [3]:
# Description of this benchmark run: repetition count, number of rows in the
# loaded frame, and the local time at which the run was set up.
run_metadata = dict(
    test_rounds=test_round,
    data_size=len(df),
    timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
)

# Container for all results; each benchmark stores its timing under "Result".
benchmark_results = {"metadata": run_metadata, "Result": {}}

In [4]:

# Benchmark: derive a daily return column, keep the rows with a positive
# return, aggregate them per stock code and sort by the mean return.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
for i in trange(test_round, desc="Where on Stkcd"):
    # Percentage change of the close versus the previous close. The column is
    # (re)written on df every round, so df carries it after this cell.
    df["DailyReturn"] = (df["Clsprc"] - df["PrevClsprc"]) / df["PrevClsprc"] * 100.0
    t = df[df["DailyReturn"] > 0.0]
    agg_dict = {
        "DailyReturn": ["max", "mean"],
        "Trddt": ["count",]
    }
    t = t.groupby("Stkcd").agg(agg_dict)
    # The aggregated columns are addressed by (column, aggregation) tuples.
    t = t.sort_values(("DailyReturn", "mean"), ascending=False)
# Average elapsed time per round in milliseconds; the loop (including the
# progress bar updates) is timed as a whole.
benchmark_results["Result"]["DailyReturn"] = (time.perf_counter() - start_time) * 1000 / test_round
benchmark_results["Result"]["DailyReturn"]

Where on Stkcd: 100%|██████████| 100/100 [00:01<00:00, 81.61it/s]


12.270216941833496

In [5]:
# Remove the helper column written by the benchmark loop so df is back to its
# loaded columns. errors="ignore" keeps this cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
df = df.drop(columns=["DailyReturn"], errors="ignore")
# NOTE(review): leftover, never-executed experiment kept for reference:
#   df.groupby("Stkcd").resample("90D").agg(agg_dict)
df.head()

Unnamed: 0,Stkcd,Trddt,Opnprc,Hiprc,Loprc,Clsprc,PrevClsprc
0,1,2020-02-19,15.1,15.37,15.08,15.24,
1,1,2020-02-20,15.27,15.620001,15.1,15.59,15.23999977
2,1,2020-02-21,15.49,15.72,15.45,15.58,15.59000015
3,1,2020-02-24,15.46,15.46,15.150001,15.23,15.57999992
4,1,2020-02-25,15.0,15.13,14.78,15.04,15.22999954


In [6]:
# Persist the collected benchmark results as JSON indented by four spaces.
results_path = "../../results/cudf_real_benchmark_results.json"
with open(results_path, mode="w") as results_file:
    json.dump(benchmark_results, fp=results_file, indent=4)