### NanoCube Benchmark - Pandas DataFrame vs. Cube on OLAP-style point queries
This benchmark compares the execution time of Pandas vs. NanoCube for a single semi-complex point query.
Use your own DataFrames and see what performance improvements you will gain.  

In [5]:
import random
import timeit
import string
from datetime import datetime
import pandas as pd
from nanocube import Cube

rows = 1_000_000   # number of records in the synthetic DataFrame
loops = 100        # how many times each query is repeated by timeit in the timing cells
seed = 42          # fixed RNG seed so Restart & Run All regenerates the same data

# Every column below is drawn from the stdlib `random` module. Seeding it makes
# the generated table, and therefore the query results printed by the later
# cells, reproducible from run to run.
random.seed(seed)

# Create a synthetic sales DataFrame with `rows` records and report the build time
print(f"Creating Dataframe with {rows:,} rows and 5 columns ", end="")
start = datetime.now()
customers = string.ascii_uppercase
# Descending weights skew the draw: 'A' is the most frequent customer and 'P0'
# the most frequent product; later labels become progressively rarer.
df = pd.DataFrame({'customer': random.choices(customers, weights=range(len(customers), 0, -1), k=rows),  # 'A', 'B', 'C', ..., 'Z'
                   'product':  random.choices([f'P{i}' for i in range(100)], weights=range(100, 0, -1), k=rows),  # 'P0', 'P1', ..., 'P99'
                   'promo':    random.choices([True, False], k=rows),
                   'sales':    [int(random.random()*100) for _ in range(rows)],  # integers 0..99
                   'cost':     [int(random.random()*100) for _ in range(rows)]})  # integers 0..99
print(f"in {(datetime.now() - start).total_seconds():.5f} sec.")
print(df.head())

Creating Dataframe with 1,000,000 rows and 5 columns in 0.75192 sec.
  customer product  promo  sales  cost
0        C     P27  False      5    38
1        H     P66   True     42    10
2        F     P26   True     52    54
3        N     P47   True     98    25
4        H     P71   True     63     3


In [6]:
# Build a Cube from the DataFrame and report how long construction took
print(f"\nCreating and preparing Cube from Dataframe ", end="")
build_started = datetime.now()
cube = Cube(df)
build_seconds = (datetime.now() - build_started).total_seconds()
print(f"in {build_seconds:.5f} sec.")



Creating and preparing Cube from Dataframe in 0.90003 sec.


In [7]:
# Pandas side of the benchmark: the point query as a boolean-mask filter plus sum
q1 = 'df[(df["customer"] == "A") & (df["product"] == "P1")][["sales", "cost"]].sum()'
print(f"\nRunning OLAP-Queries with Pandas. Please wait...")
print(f"\tQuery 1: {q1}")

# Evaluate the query once outside the timer so its result can be displayed
is_customer_a = df['customer'] == 'A'
is_product_p1 = df['product'] == 'P1'
pandas_totals = df[is_customer_a & is_product_p1][['sales', 'cost']].sum()
print(f"\tResult: {dict(pandas_totals.items())}")

# timeit executes the q1 string `loops` times against the notebook's globals
q1_pd = timeit.timeit(q1, globals=globals(), number=loops)
pandas_avg = q1_pd / loops
print(f"\t{loops}x queries executed in {q1_pd:.5f} sec, avg. {pandas_avg:.5f} sec/query")


Running OLAP-Queries with Pandas. Please wait...
	Query 1: df[(df["customer"] == "A") & (df["product"] == "P1")][["sales", "cost"]].sum()
	Result: {'sales': 73996, 'cost': 73007}
	100x queries executed in 5.50388 sec, avg. 0.05504 sec/query


In [8]:

# Cube side of the benchmark: the same customer/product point query via cube.get
q1 = 'cube.get(customer="A", product="P1")'
print(f"\nRunning OLAP-Queries with Cube. Don't wait...")
print(f"\tQuery 1: {q1}")

# Evaluate the query once outside the timer so its result can be displayed
cube_totals = cube.get(customer='A', product='P1')
print(f"\tResult: {cube_totals}")

# timeit executes the q1 string `loops` times against the notebook's globals
q1_cube = timeit.timeit(q1, globals=globals(), number=loops)
cube_avg = q1_cube / loops
print(f"\t{loops}x queries executed in {q1_cube:.5f} sec, avg. {cube_avg:.5f} sec/query")

# Speed-up factor = total Pandas time / total Cube time (q1_pd is set by the Pandas timing cell)
speedup = q1_pd / q1_cube
print(f"\nBelieve it or not: Cube is {speedup:.0f}x times faster than Pandas DataFrame for OLAP queries.")


Running OLAP-Queries with Cube. Don't wait...
	Query 1: cube.get(customer="A", product="P1")
	Result: {'sales': 73996, 'cost': 73007}
	100x queries executed in 0.01581 sec, avg. 0.00016 sec/query

Believe it or not: Cube is 348x times faster than Pandas DataFrame for OLAP queries.
