## {insert names of team members}
*
*
*

In [None]:
# !uv pip list

In [None]:
import os
import pandas as pd
import polars as pl
import numpy as np
from timeit import Timer

In [None]:
# Look up and print the CPU count reported by the operating system, as a
# reference point for the Polars thread-pool size inspected further down.
num_cores = os.cpu_count()
print(num_cores)

# limit the total number of threads used with POLARS_MAX_THREADS
#### first, start with the default (commented out)
#### then, set it to 1

In [None]:
# os.environ['POLARS_MAX_THREADS'] = '1'  # must be set before polars is imported: restart the kernel and run this line before the `import polars` cell

In [None]:
# Query the size of the Polars thread pool currently in effect.
pl.thread_pool_size()

# Generate dummy data

In [None]:
def generate_data(number_of_rows, *, seed=None):
    """Build a column-oriented dict of dummy sales data.

    Args:
        number_of_rows: Number of rows to generate. ``0`` yields empty columns.
        seed: Optional seed forwarded to ``np.random.default_rng``. With the
            default ``None`` every call draws fresh random labels; pass a
            fixed value to get identical data on every call, which keeps
            benchmark runs comparable.

    Returns:
        A dict mapping column name to column values:

        * ``order_id`` - ``range(1, number_of_rows + 1)``
        * ``region``, ``sales_person``, ``product`` - NumPy arrays of labels
          drawn at random from fixed lists
        * ``sales_income`` - NumPy array of ones
    """
    rng = np.random.default_rng(seed)

    regions = ["North", "South", "East", "West"]
    sales_people = ["Armstrong", "Aldrin", "Collins"]
    products = ["Helmet", "Oxygen", "Boots", "Gloves"]

    return {
        "order_id": range(1, number_of_rows + 1),
        "region": rng.choice(regions, size=number_of_rows),
        "sales_person": rng.choice(sales_people, size=number_of_rows),
        "product": rng.choice(products, size=number_of_rows),
        # Constant income: the sum of each group equals its row count.
        "sales_income": np.ones(number_of_rows),
    }

# Define DataFrame Creation Functions

In [None]:
def create_pandas_dataframe(test_data):
    """Return a pandas DataFrame built from ``test_data``."""
    frame = pd.DataFrame(data=test_data)
    return frame

def create_polars_dataframe(test_data):
    """Return an eager Polars DataFrame built from ``test_data``."""
    frame = pl.DataFrame(data=test_data)
    return frame

def create_polars_lazyframe(test_data):
    """Return a Polars LazyFrame built from ``test_data``."""
    lazy_frame = pl.LazyFrame(data=test_data)
    return lazy_frame

# Display First 10 rows

In [None]:
# Generate a small 10-row sample and turn it into a pandas DataFrame so the
# generated columns can be inspected.
display_data = generate_data(10)
create_pandas_dataframe(display_data)

# Create DataFrames & Print Memory Consumption 

In [None]:
# Build one shared dataset so every frame below holds exactly the same rows.
test_data = generate_data(number_of_rows=100_000)

# pandas
pandas_df = create_pandas_dataframe(test_data)
# deep=True makes pandas inspect object-dtype (string) columns and include
# the memory of their contents; the default shallow measurement leaves that
# out and would under-report the frame in this comparison.
size_bytes = pandas_df.memory_usage(index=True, deep=True).sum()
size_mb = size_bytes / (1024 ** 2)
print(f"Pandas DataFrame size: {size_mb:.2f} MB")

# polars eager dataframe
polars_df = create_polars_dataframe(test_data)
size_bytes = polars_df.estimated_size()
size_mb = size_bytes / (1024 ** 2)
print(f"Polars Eager DataFrame size: {size_mb:.2f} MB")

# polars lazyframe (created for the benchmarks; no size is printed for it)
polars_lf = create_polars_lazyframe(test_data)


# Define groupby & agg functions (can be parallelized)

In [None]:
def groupby_pandas_dataframe(pandas_df):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    Returns the pandas Series produced by the grouped sum.
    """
    group_keys = ["region", "product", "sales_person"]
    grouped = pandas_df.groupby(group_keys)
    return grouped["sales_income"].sum()

def groupby_polars_dataframe(polars_df):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    The aggregated column in the result is named ``total_sales``.
    """
    group_keys = ["region", "product", "sales_person"]
    income_sum = pl.col("sales_income").sum()
    return polars_df.group_by(group_keys).agg(total_sales=income_sum)

def groupby_polars_lazyframe(polars_lf):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    Builds the grouped aggregation on the lazy frame, then calls
    ``collect()`` and returns its result. The aggregated column is named
    ``total_sales``.
    """
    group_keys = ["region", "product", "sales_person"]
    income_sum = pl.col("sales_income").sum()
    lazy_query = polars_lf.group_by(group_keys).agg(total_sales=income_sum)
    return lazy_query.collect()

# Run GroupBy Benchmarking (can be parallelized)

In [None]:
# Each timeit call below runs its group-by 100 times and returns the total
# elapsed time in seconds for all 100 calls (not the per-call mean).

# pandas
pandas_timer = Timer(lambda: groupby_pandas_dataframe(pandas_df))
pandas_elapsed = pandas_timer.timeit(number=100)
print(f"Pandas Analysis Time: {pandas_elapsed:.6f} seconds")

# polars eager dataframe
polars_timer = Timer(lambda: groupby_polars_dataframe(polars_df))
polars_elapsed = polars_timer.timeit(number=100)
print(f"Polars Analysis Time: {polars_elapsed:.6f} seconds")

# polars lazyframe (the timed function includes the collect() call)
polars_lazyframe_timer = Timer(lambda: groupby_polars_lazyframe(polars_lf))
polars_lazyframe_elapsed = polars_lazyframe_timer.timeit(number=100)
print(f"Polars LazyFrame Analysis Time: {polars_lazyframe_elapsed:.6f} seconds")

## {Insert a new operation to benchmark that can also be parallelized}

In [None]:
# your code

## {Insert a new operation to benchmark that can _not_ be parallelized}

In [None]:
# your code

## {Insert benchmark figures comparing mean (of 100 runs)}
### consider adding error bars for std of your 100 runs

In [None]:
# your code