In [7]:
import json
import os
import time

import cudf
import pandas as pd
from tqdm import trange

In [8]:
# Benchmark configuration: number of float/int value columns in the CSV and
# how many times each operation is repeated when timing it.
float_count = 4
int_count = 4
test_round = 100

# Explicit column dtypes for the reader: float_1..float_N as float32 followed by
# int_1..int_N as int32 (insertion order is kept so the printed map is stable).
dtypes = {
    **{f'float_{i}': 'float32' for i in range(1, float_count + 1)},
    **{f'int_{i}': 'int32' for i in range(1, int_count + 1)},
}

print(dtypes)

# Load the dataset onto the GPU with the dtypes declared above.
df = cudf.read_csv("../../data/synthesis_data.csv", dtype=dtypes)

# The timestamp column is not covered by `dtypes`, so convert it to datetime explicitly.
df["timestamp"] = cudf.to_datetime(df["timestamp"])

# Preview the first rows together with the total row count.
print(df.head(), len(df))

{'float_1': 'float32', 'float_2': 'float32', 'float_3': 'float32', 'float_4': 'float32', 'int_1': 'int32', 'int_2': 'int32', 'int_3': 'int32', 'int_4': 'int32'}
            timestamp    float_1    float_2   float_3    float_4  int_1  \
0 2020-01-01 00:00:00  14.830987  15.723351  7.764221   9.139441      8   
1 2020-01-01 00:01:00  15.277333  14.383782  8.050470   9.241591      9   
2 2020-01-01 00:02:00  14.407417  14.133512  7.785035  11.045417      8   
3 2020-01-01 00:03:00  14.416102  14.438385  7.607110   8.510004      9   
4 2020-01-01 00:04:00  15.331453  13.149238  8.170056   8.315616      8   

   int_2  int_3  int_4  
0      7      8      8  
1      7      8      8  
2      7      9      9  
3      7      9      9  
4      7      9      8   3680641


In [9]:
# Container for every measurement taken in this notebook: a "metadata" entry
# describing the run, followed by one initially empty dict per benchmarked
# operation (filled in by the cells below, keyed by column name).
benchmark_results = {
    "metadata": dict(
        test_rounds=test_round,
        data_size=len(df),
        timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
    ),
    **{operation: {} for operation in ("where", "apply", "sort", "join", "groupby")},
}

In [10]:

# "Where" benchmark: boolean-mask row filtering on each int and float column.
# Each result is the mean time per filter in milliseconds over `test_round` runs.
# time.perf_counter() is used because it is a monotonic, high-resolution clock
# intended for measuring durations, unlike the adjustable wall clock time.time().

for i in range(1, int_count + 1):
    start_time = time.perf_counter()
    for _ in trange(test_round, desc=f"Where on int_{i}"):
        t = df[df[f"int_{i}"] <= 100]
    benchmark_results["where"][f"int_{i}"] = (time.perf_counter() - start_time) * 1000 / test_round

for i in range(1, float_count + 1):
    start_time = time.perf_counter()
    for _ in trange(test_round, desc=f"Where on float_{i}"):
        t = df[df[f"float_{i}"] > -55.0]
    benchmark_results["where"][f"float_{i}"] = (time.perf_counter() - start_time) * 1000 / test_round

print(benchmark_results["where"])

Where on int_1: 100%|██████████| 100/100 [00:00<00:00, 284.28it/s]
Where on int_2: 100%|██████████| 100/100 [00:00<00:00, 285.83it/s]
Where on int_3: 100%|██████████| 100/100 [00:00<00:00, 284.32it/s]
Where on int_4: 100%|██████████| 100/100 [00:00<00:00, 285.29it/s]
Where on float_1: 100%|██████████| 100/100 [00:00<00:00, 241.26it/s]
Where on float_2: 100%|██████████| 100/100 [00:00<00:00, 285.48it/s]
Where on float_3: 100%|██████████| 100/100 [00:00<00:00, 210.06it/s]
Where on float_4: 100%|██████████| 100/100 [00:00<00:00, 248.45it/s]

{'int_1': 3.5241293907165527, 'int_2': 3.5050010681152344, 'int_3': 3.5245347023010254, 'int_4': 3.511991500854492, 'float_1': 4.152727127075195, 'float_2': 3.5090255737304688, 'float_3': 4.767255783081055, 'float_4': 4.031405448913574}





In [None]:
def double_value(x):
    """Return twice the input value; the element-wise UDF used by the apply benchmark."""
    return x * 2


# "Apply" benchmark: element-wise UDF over each column. Float columns are
# processed before int columns so the result keys keep their original order.
apply_columns = [f"float_{i}" for i in range(1, float_count + 1)]
apply_columns += [f"int_{i}" for i in range(1, int_count + 1)]

for column in apply_columns:
    # Untimed warm-up: cuDF JIT-compiles the UDF the first time it is applied to
    # a given dtype, and that one-off cost must not be averaged into the result.
    df[column].apply(double_value)

    # Mean time per apply in milliseconds, measured with a monotonic clock.
    start_time = time.perf_counter()
    for _ in trange(test_round, desc=f"Apply on {column}"):
        t = df[column].apply(double_value)
    benchmark_results["apply"][column] = (time.perf_counter() - start_time) * 1000 / test_round

print(benchmark_results["apply"])

Apply on float_1: 100%|██████████| 100/100 [00:00<00:00, 605.90it/s]
Apply on float_2: 100%|██████████| 100/100 [00:00<00:00, 618.82it/s]
Apply on float_3: 100%|██████████| 100/100 [00:00<00:00, 620.50it/s]
Apply on float_4: 100%|██████████| 100/100 [00:00<00:00, 620.33it/s]
Apply on int_1: 100%|██████████| 100/100 [00:00<00:00, 779.64it/s]
Apply on int_2: 100%|██████████| 100/100 [00:00<00:00, 778.43it/s]
Apply on int_3: 100%|██████████| 100/100 [00:00<00:00, 779.00it/s]
Apply on int_4: 100%|██████████| 100/100 [00:00<00:00, 778.61it/s]

{'float_1': 1.660611629486084, 'float_2': 1.6222858428955078, 'float_3': 1.619102954864502, 'float_4': 1.618056297302246, 'int_1': 1.2892913818359375, 'int_2': 1.2910127639770508, 'int_3': 1.2903904914855957, 'int_4': 1.2905073165893555}





In [None]:
# Sort benchmark: sort the whole frame by a single column, for every value column.
# Float columns are processed before int columns so the result keys keep their
# original order.
sort_columns = [f"float_{i}" for i in range(1, float_count + 1)]
sort_columns += [f"int_{i}" for i in range(1, int_count + 1)]

for column in sort_columns:
    # Mean time per sort in milliseconds; perf_counter() is monotonic and
    # high-resolution, which time.time() does not guarantee.
    start_time = time.perf_counter()
    for _ in trange(test_round, desc=f"Sort on {column}"):
        t = df.sort_values(column)
    benchmark_results["sort"][column] = (time.perf_counter() - start_time) * 1000 / test_round
print(benchmark_results["sort"])

Sort on float_1: 100%|██████████| 100/100 [00:00<00:00, 114.42it/s]
Sort on float_2: 100%|██████████| 100/100 [00:00<00:00, 112.45it/s]
Sort on float_3: 100%|██████████| 100/100 [00:00<00:00, 111.90it/s]
Sort on float_4: 100%|██████████| 100/100 [00:00<00:00, 110.97it/s]
Sort on int_1: 100%|██████████| 100/100 [00:00<00:00, 137.14it/s]
Sort on int_2: 100%|██████████| 100/100 [00:00<00:00, 137.09it/s]
Sort on int_3: 100%|██████████| 100/100 [00:00<00:00, 137.01it/s]
Sort on int_4: 100%|██████████| 100/100 [00:00<00:00, 136.97it/s]

{'float_1': 8.754463195800781, 'float_2': 8.899462223052979, 'float_3': 8.944213390350342, 'float_4': 9.023559093475342, 'int_1': 7.304623126983643, 'int_2': 7.300686836242676, 'int_3': 7.306938171386719, 'int_4': 7.312383651733398}





In [None]:
# Join benchmark: select two row subsets by int_1 range, then time a left join
# between them.
int_1_span = 7
# NOTE(review): between() is inclusive on both ends by default, so rows with
# int_1 == int_1_span fall into both subsets — confirm the overlap is intended.
df_subset_l = df[df["int_1"].between(0, int_1_span)]
df_subset_r = df[df["int_1"].between(int_1_span, int_1_span * 2)]

# Report the subset sizes so the join timing can be interpreted.
print(len(df_subset_l))
print(len(df_subset_r))

# The subsets are selected on int_1 but merged on int_2; the progress label and
# the result key keep the "int_1" name so the saved JSON layout is unchanged.
# Mean time per join in milliseconds, measured with a monotonic clock.
start_time = time.perf_counter()
for _ in trange(test_round, desc="Join on int_1"):
    t = df_subset_l.merge(df_subset_r, on="int_2", how="left")
benchmark_results["join"]["int_1"] = (time.perf_counter() - start_time) * 1000 / test_round

print(benchmark_results["join"])

6676
190860


Join on int_1: 100%|██████████| 100/100 [00:11<00:00,  8.35it/s]

{'int_1_0_10_vs_10_20': 119.7813868522644}





In [None]:
# Groupby benchmark: group by int_1 and compute four aggregations on every
# other value column (int_1 is excluded because it is the grouping key).
agg_funcs = ["max", "min", "mean", "sum"]
agg_map = {
    **{f"float_{i}": list(agg_funcs) for i in range(1, float_count + 1)},
    **{f"int_{i}": list(agg_funcs) for i in range(2, int_count + 1)},
}

# Mean time per groupby-aggregate in milliseconds, measured with a monotonic clock.
start_time = time.perf_counter()
# The progress label names the actual grouping column.
for _ in trange(test_round, desc="Groupby on int_1"):
    t = df.groupby("int_1").agg(agg_map)
benchmark_results["groupby"]["int_1"] = (time.perf_counter() - start_time) * 1000 / test_round
print(benchmark_results["groupby"])

Groupby on Stkcd: 100%|██████████| 100/100 [00:01<00:00, 50.03it/s]

{'int_1': 19.998531341552734}





In [15]:
# Persist all collected timings as pretty-printed JSON.
results_path = "../../results/cudf_syn_benchmark_results.json"
# Create the output directory first so a missing folder cannot discard a
# completed benchmark run at the very last step.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, "w") as f:
    json.dump(benchmark_results, f, indent=4)