In [3]:
import pandas as pd
import plotly.express as px

In [4]:
# Load the raw benchmark measurements.
# header=None makes pandas treat the first line of the file as data, so the
# column names are supplied explicitly through `names`.
df = pd.read_csv(
    "./benchmark_results.csv",
    sep=",",
    header=None,
    names=["engine", "file", "size", "run", "time"],
)
df

Unnamed: 0,engine,file,size,run,time
0,pandas,csv,500000,1,0.059160
1,pandas,csv,500000,2,0.046396
2,pandas,csv,500000,3,0.039489
3,pandas,csv,500000,4,0.041445
4,pandas,csv,500000,5,0.038567
...,...,...,...,...,...
1243,cudf,parquet,200000000,8,0.403310
1244,cudf,parquet,200000000,9,0.426259
1245,cudf,parquet,200000000,10,1.072460
1246,cudf,parquet,200000000,11,2.461838


In [5]:
# For every (engine, file, size) group, take the smallest and the largest
# `time` and tag each of those rows with min_max = 1. The flag is what the
# following merge uses to recognise the extreme runs.
df_min = df.groupby(['engine', 'file', 'size'])[['time']].min().assign(min_max=1)
df_max = df.groupby(['engine', 'file', 'size'])[['time']].max().assign(min_max=1)

# Stack minima first, then maxima, and turn the group keys back into columns.
df_min_max = pd.concat([df_min, df_max]).reset_index()
df_min_max

Unnamed: 0,engine,file,size,time,min_max
0,cudf,csv,500000,0.004747,1
1,cudf,csv,1000000,0.007611,1
2,cudf,csv,5000000,0.024487,1
3,cudf,csv,10000000,0.046887,1
4,cudf,csv,15000000,0.070954,1
...,...,...,...,...,...
203,pandas,parquet,100000000,1.612192,1
204,pandas,parquet,125000000,1.863088,1
205,pandas,parquet,150000000,2.222078,1
206,pandas,parquet,175000000,2.577395,1


In [6]:
# Left-join the per-group extremes onto the raw runs. A run whose `time`
# equals its group's min or max picks up min_max = 1; every other run gets
# NaN in that column. Keeping only the NaN rows therefore drops each run that
# matches (or ties) the fastest or slowest time of its group.
df_filtered = (
    pd.merge(df, df_min_max, how='left', on=['engine', 'file', 'size', 'time'])
    .loc[lambda merged: merged['min_max'].isna()]
    .drop(columns='min_max')  # helper flag is no longer needed
)
df_filtered

Unnamed: 0,engine,file,size,run,time
1,pandas,csv,500000,2,0.046396
2,pandas,csv,500000,3,0.039489
3,pandas,csv,500000,4,0.041445
4,pandas,csv,500000,5,0.038567
5,pandas,csv,500000,6,0.041425
...,...,...,...,...,...
1242,cudf,parquet,200000000,7,0.403584
1243,cudf,parquet,200000000,8,0.403310
1244,cudf,parquet,200000000,9,0.426259
1245,cudf,parquet,200000000,10,1.072460


In [7]:
# Average the remaining run times per (engine, file, size) and add a combined
# "<engine>-<file>" label, which the plot uses to colour one line per pair.
df_mean = (
    df_filtered
    .groupby(['engine', 'file', 'size'])['time']
    .mean()
    .reset_index()
    .assign(engine_file=lambda means: means['engine'] + '-' + means['file'])
)
df_mean

Unnamed: 0,engine,file,size,time,engine_file
0,cudf,csv,500000,0.005047,cudf-csv
1,cudf,csv,1000000,0.008387,cudf-csv
2,cudf,csv,5000000,0.027677,cudf-csv
3,cudf,csv,10000000,0.051828,cudf-csv
4,cudf,csv,15000000,0.072990,cudf-csv
...,...,...,...,...,...
99,pandas,parquet,100000000,1.476023,pandas-parquet
100,pandas,parquet,125000000,1.686946,pandas-parquet
101,pandas,parquet,150000000,1.946875,pandas-parquet
102,pandas,parquet,175000000,2.278523,pandas-parquet


In [13]:
# Line chart of mean time against row count, one line (with point markers)
# per engine/file combination.
fig = px.line(
    data_frame=df_mean,
    x='size',
    y='time',
    color='engine_file',
    markers=True,
    title='Média de tempo de execução - 10 runs',
    # Display names shown on the axes and legend instead of the raw column names.
    labels=dict(
        time='Tempo(s)',
        size='Quantidade de Linhas',
        engine_file='Engine - Arquivo',
    ),
    width=1200,
    height=800,
)

fig.show()