# Benchmarking Read Parquet

In [None]:
import modin.dataframe as md

import pandas as pd
import numpy as np

import time
import json
from datetime import datetime
import os
from itertools import product

In [None]:
# Table shapes to benchmark; the sweep pairs every row count with every
# column count.
N_ROWS = [10_000, 100_000, 1_000_000]
N_COLS = [10, 20, 50]

In [None]:
def benchmark(rows: int, cols: int, fname: str, n_trials: int = 5) -> None:
    """Time ``md.read_parquet`` on a generated float table and save the results.

    A ``rows`` x ``cols`` frame of standard-normal floats is written to
    ``data.pq`` in the working directory and read back ``n_trials`` times.
    The per-trial durations, together with their mean and standard deviation,
    are written to ``fname`` as a single JSON object.

    Args:
        rows: Number of rows in the generated table.
        cols: Number of columns in the generated table.
        fname: Path of the JSON result file; truncated if it already exists.
        n_trials: Number of timed reads to perform. Must be at least 1.

    Raises:
        ValueError: If ``n_trials`` is smaller than 1 (the summary statistics
            would be undefined).
    """
    if n_trials < 1:
        raise ValueError(f"n_trials must be >= 1, got {n_trials}")

    generated_data = pd.DataFrame({
        # parquet must have string column names
        str(name): np.random.randn(rows) for name in range(cols)
    })
    generated_data.to_parquet('data.pq')

    durations = {}
    for i in range(n_trials):
        # perf_counter is monotonic and high resolution; time.time() can jump
        # if the system clock is adjusted while a trial is running.
        start = time.perf_counter()
        df = md.read_parquet('data.pq')
        partitions = list(df._block_partitions.flatten())
        if partitions:
            # ray.wait returns as soon as `num_returns` objects are ready and
            # that defaults to 1, so ask for every partition explicitly;
            # otherwise the timer stops when the first partition is ready.
            # NOTE(review): md.ray is presumably the ray module exposed by
            # modin -- confirm.
            md.ray.wait(partitions, num_returns=len(partitions))
        end = time.perf_counter()
        durations[i] = end - start
        # Request eviction of 2 MiB from the plasma store between trials.
        # NOTE(review): presumably meant to limit caching effects across
        # trials -- confirm that this size is what was intended.
        md.ray.worker.global_worker.plasma_client.evict(2*1024*1024)

    timings = list(durations.values())
    # Build the full record before opening the file so that a failure while
    # computing it does not leave an empty result file behind.
    record = {
        'time': str(datetime.now()),
        'params': {
            'rows': rows,
            'cols': cols,
            'type': 'floats',
            'size': int(generated_data.memory_usage().sum())
        },
        'data': durations,
        'summary': {
            'mean': float(np.mean(timings)),
            'std': float(np.std(timings))
        }
    }
    with open(fname, 'w') as f:
        json.dump(record, f)

In [None]:
# RESULT_DIR = 'old' # Running on master

RESULT_DIR = 'new' # Running on new branch

# Create the output directory from Python rather than shelling out to `mkdir`:
# this runs outside IPython as well, and re-running the cell when the
# directory already exists is a silent no-op instead of an error message.
os.makedirs(RESULT_DIR, exist_ok=True)

In [None]:
# Run the benchmark once per (row count, column count) combination, writing
# each result to its own JSON file under RESULT_DIR.
for rows in N_ROWS:
    for cols in N_COLS:
        print(f"Working on {rows} x {cols}")
        result_path = f"{RESULT_DIR}/{rows}_{cols}.json"
        benchmark(rows, cols, result_path)