In [1]:
import grizzly

# Load the input data from a CSV file
df = grizzly.read_csv("data.csv")

# Narrow the frame down, then order it by price (lowest first).
# NOTE(review): filter_eq is assumed to keep rows where "category" equals
# "electronics" — confirm against the grizzly documentation.
df_filtered = df.filter_eq("category", "electronics")
df_sorted = df_filtered.sort("price", ascending=True)

# Aggregate results
# NOTE(review): presumably sums "sales" per "brand" — confirm the argument order.
report = df_sorted.groupby_sum("brand", "sales")

# Write the aggregated report to a Parquet file
report.to_parquet("summary.parquet")

In [3]:
# Print a 3-row preview; the table below also lists each column's dtype under the header.
df.show(n=3)

+-------------+---------+-------+-------+
| category    | brand   | price | sales |
+-------------+---------+-------+-------+
| Utf8        | Utf8    | Int64 | Int64 |
| electronics | Apple   | 1000  | 50    |
| electronics | Samsung | 800   | 100   |
| electronics | Apple   | 1200  | 30    |
+-------------+---------+-------+-------+


## Left Join Example
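Left-joining the customers table with the orders table on `customer_id`: every customer is kept, and customers without orders get empty order columns.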

In [2]:
# Load both tables from CSV
customers = grizzly.read_csv("customers.csv")
orders = grizzly.read_csv("orders.csv")

# Left join on customer_id. In the output below, the columns coming from
# `orders` carry a `_right` suffix, and customers with no orders
# (Charlie, David) are kept with those columns left empty.
joined_df = customers.join(orders, on="customer_id", how="left")
joined_df.show()

+-------------+---------+-------------+----------------+--------------+
| customer_id | name    | city        | order_id_right | amount_right |
+-------------+---------+-------------+----------------+--------------+
| Int64       | Utf8    | Utf8        | Int64          | Float64      |
| 1           | Alice   | New York    | 101            | 250.0        |
| 1           | Alice   | New York    | 103            | 300.0        |
| 2           | Bob     | Los Angeles | 102            | 150.0        |
| 3           | Charlie | Chicago     |                |              |
| 4           | David   | Houston     |                |              |
+-------------+---------+-------------+----------------+--------------+


## Performance Comparison: Pandas vs Grizzly vs Polars
Measuring wall-clock time and process memory growth (RSS) for reading a CSV file of 15 million records and 20 columns and sorting it by one column.

In [4]:
import pandas as pd
import numpy as np
import time

# Parameters
n_records = 15_000_000
n_columns = 20
file_name = "perf_test.csv"
random_seed = 42  # fixed seed so every run benchmarks exactly the same data

# perf_counter() is a monotonic, high-resolution clock; time.time() can jump
# if the system clock is adjusted while the cell is running.
start_time = time.perf_counter()

# Generate random integer data in [0, 100) from a seeded generator so the
# fixture (and therefore the benchmark input) is reproducible.
rng = np.random.default_rng(random_seed)
data = rng.integers(0, 100, size=(n_records, n_columns))
columns = [f'col{i}' for i in range(n_columns)]

# Create DataFrame
# NOTE(review): this rebinds `df`, which the first cell uses for a grizzly
# frame; re-running `df.show(...)` after this cell would hit the pandas frame.
df = pd.DataFrame(data, columns=columns)
print(df.shape)

# Export to CSV (index=False keeps the file to the 20 data columns only)
df.to_csv(file_name, index=False)

print(f"CSV generated and saved as '{file_name}' in {time.perf_counter() - start_time:.2f} seconds.")

CSV generated and saved as 'perf_test.csv' in 51.80 seconds.


In [2]:
import pandas as pd
import polars as pl
import grizzly
import time
import os
import psutil

def get_memory_usage():
    """Return the resident set size (RSS) of the current process in MB.

    One MB is taken as 1024 * 1024 bytes.
    """
    bytes_per_mb = 1024 * 1024
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mb

file_path = "perf_test.csv"


def run_benchmark(header, path, read_csv, sort_frame):
    """Time one library reading `path` and sorting the result, and report it.

    Prints `header`, then the elapsed time in seconds and the change in
    process RSS (as reported by `get_memory_usage`) across the read + sort.

    Args:
        header: Line printed before the measurement starts.
        path: CSV file handed to `read_csv`.
        read_csv: Callable taking the path and returning a frame.
        sort_frame: Callable taking that frame and returning the sorted frame.

    Returns:
        Tuple `(frame, sorted_frame)` so the caller can keep both objects.

    NOTE(review): "Memory used" is an RSS delta between two points in time,
    not a peak; frames kept from earlier runs may influence later deltas.
    """
    print(header)
    start_mem = get_memory_usage()
    # perf_counter() is monotonic, unlike time.time(), so the interval cannot
    # be distorted by system clock adjustments.
    start_time = time.perf_counter()

    frame = read_csv(path)
    frame_sorted = sort_frame(frame)

    elapsed = time.perf_counter() - start_time
    end_mem = get_memory_usage()

    print(f"Time: {elapsed:.2f} seconds")
    print(f"Memory used: {end_mem - start_mem:.2f} MB")
    return frame, frame_sorted


# --- Pandas Performance ---
df_pd, df_pd_sorted = run_benchmark(
    "--- Pandas ---",
    file_path,
    pd.read_csv,
    lambda frame: frame.sort_values(by="col0"),
)


# --- Grizzly Performance ---
df_gr, df_gr_sorted = run_benchmark(
    "\n--- Grizzly ---",
    file_path,
    grizzly.read_csv,
    lambda frame: frame.sort("col0"),
)

# ---- Polars ----
df_pl, df_pl_sorted = run_benchmark(
    "\n--- polars ---",
    file_path,
    pl.read_csv,
    lambda frame: frame.sort("col0"),
)

--- Pandas ---
Time: 19.82 seconds
Memory used: 2291.18 MB

--- Grizzly ---
Time: 15.69 seconds
Memory used: 4654.62 MB

--- polars ---
Time: 7.49 seconds
Memory used: 6593.77 MB
