# Benchmark Pandas vs Cudf
- Using *timeit*

### System details

#### GPU

In [1]:
!nvidia-smi -q



Timestamp                           : Wed Jul  3 11:02:34 2019
Driver Version                      : 418.56
CUDA Version                        : 10.1

Attached GPUs                       : 1
GPU 00000000:00:1E.0
    Product Name                    : Tesla V100-SXM2-16GB
    Product Brand                   : Tesla
    Display Mode                    : Enabled
    Display Active                  : Disabled
    Persistence Mode                : Enabled
    Accounting Mode                 : Disabled
    Accounting Mode Buffer Size     : 4000
    Driver Model
        Current                     : N/A
        Pending                     : N/A
    Serial Number                   : 0420219038723
    GPU UUID                        : GPU-11bfb470-bf49-6b8e-8d36-de2470d215dc
    Minor Number                    : 0
    VBIOS Version                   : 88.00.4F.00.09
    MultiGPU Board                  : No
    Board ID                        : 0x1e
    GPU Part Number                 : 900-2G

#### CPU

In [2]:
!less /proc/cpuinfo

processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 79
model name      : Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz
stepping        : 1
microcode       : 0xb000037
cpu MHz         : 2699.804
cache size      : 46080 KB
physical id     : 0
siblings        : 8
core id         : 0
cpu cores       : 4
apicid          : 0
initial apicid  : 0
fpu             : yes
fpu_exception   : yes
cpuid level     : 13
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f1[7m/proc/cpuinfo[m[K

## Preperations
- Imports
- Variables

In [6]:
import os

In [4]:
import cudf
import pandas as pd

In [68]:
# Source files
raw_parquet_file = 'logs_sample/haproxy_logs_10k.parquet'
raw_json_file = 'logs_sample/haproxy_json_logs_raw_original.txt'

# Benchmark configuration
nlargest = 300

## Target file size validation
Set target size (in MB) for the test file

In [70]:
# Source to be used
source_file = raw_json_file 

# Target file for multiplication
target_file = 'logs_sample/1gb.file' 

In [71]:
target_test_size_in_mb = 1000

In [72]:
multiplication_factor = target_test_size_in_mb // (os.path.getsize(source_file) >> 20)
print(f'Multiply target file by {multiplication_factor}')

Multiply target file by 200


In [73]:
with open(source_file, 'r') as source:
    with open(target_file, 'w') as target:
        [target.write(line) 
         for line in source.readlines() 
         for m in range(multiplication_factor)]

In [76]:
!ls -lah logs_sample/

total 0
drwxr-xr-x 2 50 nogroup    0 Jul  3 11:02 .ipynb_checkpoints
-rw-r--r-- 1 50 nogroup 1.2G Jul  3 11:22 1gb.file
-rw-r--r-- 1 50 nogroup 5.9M Jul  3 11:02 haproxy_json_logs_raw_original.txt
-rw-r--r-- 1 50 nogroup  89K Jul  3 11:00 haproxy_logs_10k.parquet


## Benchmark

### Flow
- Read file
- Compute aggregations
- get nlargest()

In [77]:
benchmark_file = target_file

#### cuDF

In [79]:
%%timeit

# Read file
# gdf = cudf.read_parquet(benchmark_file)
gdf = cudf.read_json(benchmark_file, lines=True)

# Perform aggregation
gdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })

# Select top N
nlarge = gdf.nlargest(nlargest, 'time_backend_response')

4.98 s ± 88.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Pandas

In [80]:
%%timeit

# Read file
# gdf = pd.read_parquet(benchmark_file)
gdf = pd.read_json(benchmark_file, lines=True)

# Perform aggregation
gdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })

# Select top N
nlarge = gdf.nlargest(nlargest, 'time_backend_response')

54.5 s ± 692 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Test loading times

#### cudf

In [81]:
%%timeit
# gdf = cudf.read_parquet(benchmark_file)
gdf = cudf.read_json(benchmark_file, lines=True)

4.76 s ± 34.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Pandas

In [82]:
%%timeit
# gdf = pd.read_parquet(benchmark_file)
gdf = pd.read_json(benchmark_file, lines=True)

52.6 s ± 187 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Test aggregation

In [83]:
# gdf = cudf.read_parquet(benchmark_file)
gdf = cudf.read_json(benchmark_file, lines=True)

# pdf = pd.read_parquet(benchmark_file)
pdf = pd.read_json(benchmark_file, lines=True)

#### cudf

In [84]:
%%timeit
gdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })
nlarge = gdf.nlargest(nlargest, 'time_backend_response')

264 ms ± 21.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### Pandas

In [None]:
%%timeit
pdf.groupby(['log_ip']).agg({'feconn':'mean',
                                'beconn':'mean',
                                'time_backend_response':'max',
                                'time_backend_response':'mean',
                                'time_queue':'mean',
                                'time_duration': 'mean',
                                'time_request': 'mean',
                                'time_backend_connect':'mean'
                               })
nlarge = pdf.nlargest(nlargest, 'time_backend_response')