# Benchmark: pandas vs cuDF
- Using *timeit*

### System details

#### GPU

In [1]:
!nvidia-smi -q



Timestamp                           : Wed Jun  5 11:18:00 2019
Driver Version                      : 410.79
CUDA Version                        : 10.0

Attached GPUs                       : 1
GPU 00000000:00:1E.0
    Product Name                    : Tesla V100-SXM2-16GB
    Product Brand                   : Tesla
    Display Mode                    : Enabled
    Display Active                  : Disabled
    Persistence Mode                : Disabled
    Accounting Mode                 : Disabled
    Accounting Mode Buffer Size     : 4000
    Driver Model
        Current                     : N/A
        Pending                     : N/A
    Serial Number                   : 0323818196295
    GPU UUID                        : GPU-7ea818f2-ecd9-c372-11fe-643f104cc2e6
    Minor Number                    : 0
    VBIOS Version                   : 88.00.4F.00.09
    MultiGPU Board                  : No
    Board ID                        : 0x1e
    GPU Part Number                 : 900-2

#### CPU

In [2]:
!less /proc/cpuinfo

processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 79
model name      : Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
stepping        : 1
microcode       : 0xb000036
cpu MHz         : 2702.949
cache size      : 46080 KB
physical id     : 0
siblings        : 8
core id         : 0
cpu cores       : 4
apicid          : 0
initial apicid  : 0
fpu             : yes
fpu_exception   : yes
cpuid level     : 13
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand

## Preparations
- Imports
- Variables

In [4]:
import cudf
import pandas as pd

In [5]:
# Inputs shared by every benchmark cell below.
nlargest = 300  # row count passed as the first argument to DataFrame.nlargest()
parquet_file = "logs_sample/haproxy_logs_10k.parquet"  # Parquet file read by both libraries

## Benchmark

### Flow
- Read parquet
- Compute aggregations
- Get the `nlargest()` rows

#### cuDF

In [6]:
%%timeit
gdf = cudf.read_parquet(parquet_file)
gdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })
nlarge = gdf.nlargest(nlargest, 'time_backend_response')

219 ms ± 585 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Pandas

In [7]:
%%timeit
pdf = pd.read_parquet(parquet_file)
pdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })
nlarge = pdf.nlargest(nlargest, 'time_backend_response')

13.6 ms ± 81.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Test loading times

#### cuDF

In [8]:
%%timeit
# Times only the Parquet read through cuDF (no aggregation), to separate
# loading cost from compute cost.
gdf = cudf.read_parquet(parquet_file)

29.9 ms ± 109 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### Pandas

In [9]:
%%timeit
# Times only the Parquet read through pandas (no aggregation), to separate
# loading cost from compute cost.
pdf = pd.read_parquet(parquet_file)

6.48 ms ± 13.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Test aggregation

In [10]:
# Load each frame once, outside the timed cells, so the aggregation timings
# below do not include file I/O.
pdf = pd.read_parquet(parquet_file)
gdf = cudf.read_parquet(parquet_file)

#### cuDF

In [11]:
%%timeit
gdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })
nlarge = gdf.nlargest(nlargest, 'time_backend_response')

184 ms ± 221 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


#### Pandas

In [12]:
%%timeit
pdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })
nlarge = pdf.nlargest(nlargest, 'time_backend_response')

6.59 ms ± 17.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
