In [1]:
import polars as pl
import time

# Sample large dataset
large_df = pl.DataFrame({
    'id': range(1000000),
    'category': ['A', 'B', 'C', 'D'] * 250000,
    'value': range(1000000),
    'amount': [i * 0.1 for i in range(1000000)]
})

# Eager evaluation (immediate execution)
start_time = time.time()
eager_result = (
    large_df
    .filter(pl.col('category') == 'A')
    .group_by('category')
    .agg(pl.col('amount').sum())
    .sort('amount')
)
eager_time = time.time() - start_time

# Lazy evaluation (deferred execution)
start_time = time.time()
lazy_result = (
    large_df.lazy()
    .filter(pl.col('category') == 'A')
    .group_by('category')
    .agg(pl.col('amount').sum())
    .sort('amount')
    .collect()  # Only executes here
)
lazy_time = time.time() - start_time

print(f"Eager execution time: {eager_time:.4f}s")
print(f"Lazy execution time: {lazy_time:.4f}s")
print(f"Speedup: {eager_time/lazy_time:.2f}x")

Eager execution time: 0.0823s
Lazy execution time: 0.0125s
Speedup: 6.59x


In [2]:
# Projection pushdown: Select only needed columns early
# This reduces memory usage and I/O

# Inefficient: Read all columns, then select
# result = pl.read_csv('large_file.csv').select(['col1', 'col2', 'col3'])

# Efficient: Use lazy API with projection pushdown
efficient_result = (
    pl.scan_csv('large_file.csv')
    .select(['col1', 'col2', 'col3'])  # Only these columns are read
    .filter(pl.col('col1') > 100)
    .collect()
)

# Example with in-memory DataFrame
result = (
    large_df.lazy()
    .select(['id', 'category', 'amount'])  # Project early
    .filter(pl.col('amount') > 500)
    .group_by('category')
    .agg(pl.col('amount').mean())
    .collect()
)

print("Projection pushdown applied automatically in lazy evaluation")
print(result)


FileNotFoundError: The system cannot find the file specified. (os error 2): large_file.csv

This error occurred with the following context stack:
	[1] 'csv scan'
	[2] 'select'
	[3] 'filter'
	[4] 'sink'


In [3]:
# Predicate pushdown: Apply filters as early as possible
# This reduces the amount of data processed in subsequent operations

# Polars automatically pushes predicates down in lazy evaluation
lazy_query = (
    pl.scan_csv('sales_data.csv')
    .join(
        pl.scan_csv('customers.csv'),
        on='customer_id'
    )
    .filter(pl.col('purchase_date') >= '2024-01-01')  # Filter pushed down
    .filter(pl.col('amount') > 100)  # Multiple filters combined
    .group_by('customer_segment')
    .agg([
        pl.col('amount').sum().alias('total_sales'),
        pl.col('amount').count().alias('transaction_count')
    ])
)

# View the optimized query plan
print("Optimized query plan:")
print(lazy_query.explain())

# Execute the optimized query
result = lazy_query.collect()
print(result)

Optimized query plan:


FileNotFoundError: The system cannot find the file specified. (os error 2): sales_data.csv

This error occurred with the following context stack:
	[1] 'csv scan'
	[2] 'join left'
	[3] 'join'
	[4] 'filter'
	[5] 'filter'
	[6] 'group_by'


In [None]:
# Use appropriate data types to reduce memory usage
def optimize_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Optimize data types for memory efficiency."""
    return df.with_columns([
        # Use smaller integer types when possible
        pl.when(pl.col('id').max() < 32767)
        .then(pl.col('id').cast(pl.Int16))
        .when(pl.col('id').max() < 2147483647)
        .then(pl.col('id').cast(pl.Int32))
        .otherwise(pl.col('id'))
        .alias('id'),
        
        # Use categorical for repeated strings
        pl.col('category').cast(pl.Categorical),
        
        # Use Float32 instead of Float64 when precision allows
        pl.col('amount').cast(pl.Float32)
    ])

# Apply optimizations
optimized_df = optimize_dtypes(large_df)

# Compare memory usage
original_memory = large_df.estimated_size('mb')
optimized_memory = optimized_df.estimated_size('mb')

print(f"Original memory usage: {original_memory:.2f} MB")
print(f"Optimized memory usage: {optimized_memory:.2f} MB")
print(f"Memory savings: {((original_memory - optimized_memory) / original_memory * 100):.1f}%")

In [None]:
# For datasets larger than memory, use streaming
# Note: Streaming is available for certain operations

# Example: Process a very large CSV file in chunks
def process_large_file_streaming(file_path: str):
    """
    Process a large file using streaming to handle data larger than memory.
    """
    result = (
        pl.scan_csv(file_path)
        .filter(pl.col('amount') > 1000)
        .group_by('category')
        .agg([
            pl.col('amount').sum().alias('total_amount'),
            pl.col('amount').count().alias('count')
        ])
        .collect(streaming=True)  # Enable streaming
    )
    return result

# Streaming aggregations work well for:
# - Group by operations
# - Simple filters and projections
# - Window functions (in some cases)

print("Streaming enabled for large dataset processing")

In [None]:
# Optimize joins for better performance

# Create sample DataFrames for join examples
left_df = pl.DataFrame({
    'key': range(100000),
    'left_value': range(100000)
})

right_df = pl.DataFrame({
    'key': range(0, 100000, 2),  # Every other key
    'right_value': range(50000)
})

# Efficient join with lazy evaluation
efficient_join = (
    left_df.lazy()
    .join(
        right_df.lazy(),
        on='key',
        how='inner'
    )
    .filter(pl.col('left_value') > 1000)  # Filter after join
    .select(['key', 'left_value', 'right_value'])
    .collect()
)

# For very large joins, consider:
# 1. Sorting both DataFrames on join key
# 2. Using appropriate join strategy
# 3. Filtering before joining when possible

# Pre-filter before join for better performance
optimized_join = (
    left_df.lazy()
    .filter(pl.col('left_value') > 1000)  # Filter before join
    .join(
        right_df.lazy(),
        on='key',
        how='inner'
    )
    .collect()
)

print(f"Efficient join result shape: {efficient_join.shape}")
print(f"Optimized join result shape: {optimized_join.shape}")


In [None]:
# Always prefer vectorized operations over Python loops

# Inefficient: Python loop
def slow_calculation(df):
    results = []
    for row in df.iter_rows(named=True):
        if row['value'] > 500000:
            results.append(row['value'] * 2)
        else:
            results.append(row['value'])
    return results

# Efficient: Vectorized operation
def fast_calculation(df):
    return df.with_columns(
        pl.when(pl.col('value') > 500000)
        .then(pl.col('value') * 2)
        .otherwise(pl.col('value'))
        .alias('calculated_value')
    )

# Time comparison
start_time = time.time()
slow_result = slow_calculation(large_df.head(10000))
slow_time = time.time() - start_time

start_time = time.time()
fast_result = fast_calculation(large_df.head(10000))
fast_time = time.time() - start_time

print(f"Loop-based calculation: {slow_time:.4f}s")
print(f"Vectorized calculation: {fast_time:.4f}s")
print(f"Speedup: {slow_time/fast_time:.2f}x")


In [None]:
# Analyze and understand query execution plans

complex_query = (
    large_df.lazy()
    .filter(pl.col('category').is_in(['A', 'B']))
    .with_columns([
        (pl.col('value') * pl.col('amount')).alias('total_value')
    ])
    .group_by('category')
    .agg([
        pl.col('total_value').sum().alias('sum_total_value'),
        pl.col('total_value').mean().alias('avg_total_value'),
        pl.count().alias('count')
    ])
    .filter(pl.col('count') > 1000)
    .sort('sum_total_value', descending=True)
)

# View the optimized execution plan
print("Query execution plan:")
print(complex_query.explain())

# View the unoptimized plan (for comparison)
print("\nUnoptimized plan:")
print(complex_query.explain(optimized=False))

# Execute the query
result = complex_query.collect()
print(f"\nQuery result shape: {result.shape}")
print(result)


In [None]:
# Configure Polars for optimal parallel processing

# Check current thread configuration
print(f"Available threads: {pl.thread_pool_size()}")

# Set thread pool size (usually auto-detected optimally)
# pl.set_thread_pool_size(8)  # Set to specific number if needed

# Polars automatically parallelizes:
# - Column operations
# - Group by operations  
# - Joins
# - Aggregations
# - I/O operations

# Example of parallel group by
parallel_groupby = (
    large_df.lazy()
    .group_by('category')
    .agg([
        pl.col('value').sum(),
        pl.col('value').mean(),
        pl.col('value').std(),
        pl.col('value').min(),
        pl.col('value').max()
    ])
    .collect()
)

print("Parallel group by completed")
print(parallel_groupby)

In [None]:
# Choose the right file format for performance

# Parquet is generally fastest for analytical workloads
start_time = time.time()
parquet_df = pl.scan_parquet('data.parquet').collect()
parquet_time = time.time() - start_time

# CSV is slower but more universal
start_time = time.time()
csv_df = pl.scan_csv('data.csv').collect()
csv_time = time.time() - start_time

print(f"Parquet read time: {parquet_time:.4f}s")
print(f"CSV read time: {csv_time:.4f}s")
print(f"Parquet speedup: {csv_time/parquet_time:.2f}x")

# For writing, also prefer Parquet
large_df.write_parquet('output.parquet', compression='snappy')
print("Data written to Parquet with Snappy compression")