In [16]:
import polars as pl
import numpy as np  
from datetime import datetime, timedelta

np.random.seed(42)
dates= [datetime(2024, 1, 1) + timedelta(days=i) for i in range(100)]

sales_df= pl.DataFrame({
    'date':np.random.choice(dates, 1000),
    'product_id':np.random.choice(['a','b','c','d','e'], 1000),
    'customer_id':np.random.randint(1, 101, 1000),
    'quantity':np.random.randint(1, 10, 1000),
    'unit_price':np.random.uniform(10,100,1000),
    'region':np.random.choice(['North','South','East','West'], 1000)
})

print(sales_df.head())

customer_df= pl.DataFrame({
    'customer_id':range(1, 101),
    'customer_name':[f'Customer_{i}' for i in range(1, 101)],
    'segment':np.random.choice(['Premium','Standard','Basic'], 100)
})


print(customer_df.head())

shape: (5, 6)
┌─────────────────────┬────────────┬─────────────┬──────────┬────────────┬────────┐
│ date                ┆ product_id ┆ customer_id ┆ quantity ┆ unit_price ┆ region │
│ ---                 ┆ ---        ┆ ---         ┆ ---      ┆ ---        ┆ ---    │
│ object              ┆ str        ┆ i32         ┆ i32      ┆ f64        ┆ str    │
╞═════════════════════╪════════════╪═════════════╪══════════╪════════════╪════════╡
│ 2024-02-21 00:00:00 ┆ b          ┆ 65          ┆ 4        ┆ 93.963238  ┆ East   │
│ 2024-04-02 00:00:00 ┆ a          ┆ 55          ┆ 3        ┆ 84.475569  ┆ East   │
│ 2024-01-15 00:00:00 ┆ d          ┆ 90          ┆ 6        ┆ 72.748844  ┆ East   │
│ 2024-03-12 00:00:00 ┆ c          ┆ 43          ┆ 7        ┆ 74.289397  ┆ North  │
│ 2024-03-01 00:00:00 ┆ b          ┆ 74          ┆ 4        ┆ 51.554453  ┆ North  │
└─────────────────────┴────────────┴─────────────┴──────────┴────────────┴────────┘
shape: (5, 3)
┌─────────────┬───────────────┬─────────┐
│ cust

In [17]:
#filtering data

high_quantity_sales= sales_df.filter(pl.col('quantity') > 5)
print(high_quantity_sales.head())

recent_high_value_sales= sales_df.filter(
    (pl.col('unit_price') > 50)
)
print(recent_high_value_sales.head())

special_products = sales_df.filter(
    pl.col('product_id').is_in(['a', 'c', 'e'])
)
print(special_products.head())
northorsouth = sales_df.filter(
    pl.col('region').is_in(['North', 'South'])
)   




shape: (5, 6)
┌─────────────────────┬────────────┬─────────────┬──────────┬────────────┬────────┐
│ date                ┆ product_id ┆ customer_id ┆ quantity ┆ unit_price ┆ region │
│ ---                 ┆ ---        ┆ ---         ┆ ---      ┆ ---        ┆ ---    │
│ object              ┆ str        ┆ i32         ┆ i32      ┆ f64        ┆ str    │
╞═════════════════════╪════════════╪═════════════╪══════════╪════════════╪════════╡
│ 2024-01-15 00:00:00 ┆ d          ┆ 90          ┆ 6        ┆ 72.748844  ┆ East   │
│ 2024-03-12 00:00:00 ┆ c          ┆ 43          ┆ 7        ┆ 74.289397  ┆ North  │
│ 2024-01-21 00:00:00 ┆ a          ┆ 30          ┆ 7        ┆ 92.889507  ┆ South  │
│ 2024-03-23 00:00:00 ┆ e          ┆ 82          ┆ 9        ┆ 72.513588  ┆ North  │
│ 2024-04-09 00:00:00 ┆ c          ┆ 76          ┆ 8        ┆ 27.571654  ┆ South  │
└─────────────────────┴────────────┴─────────────┴──────────┴────────────┴────────┘
shape: (5, 6)
┌─────────────────────┬────────────┬────────────

In [18]:
#grouping and aggregation

product_summary = sales_df.group_by("product_id").agg([
    pl.col("quantity").sum().alias("total_quantity"),
    pl.col("unit_price").mean().alias("average_unit_price"),
    pl.count().alias("transaction_count"),
    pl.col("unit_price").min().alias("min_unit_price"),
    pl.col("unit_price").max().alias("max_unit_price")
])
print(product_summary)

shape: (5, 6)
┌────────────┬────────────────┬─────────────────┬────────────────┬────────────────┬────────────────┐
│ product_id ┆ total_quantity ┆ average_unit_pr ┆ transaction_co ┆ min_unit_price ┆ max_unit_price │
│ ---        ┆ ---            ┆ ice             ┆ unt            ┆ ---            ┆ ---            │
│ str        ┆ i32            ┆ ---             ┆ ---            ┆ f64            ┆ f64            │
│            ┆                ┆ f64             ┆ u32            ┆                ┆                │
╞════════════╪════════════════╪═════════════════╪════════════════╪════════════════╪════════════════╡
│ c          ┆ 1023           ┆ 55.182486       ┆ 200            ┆ 10.972998      ┆ 99.786547      │
│ b          ┆ 979            ┆ 53.298644       ┆ 201            ┆ 10.478213      ┆ 99.960193      │
│ a          ┆ 1056           ┆ 56.454078       ┆ 201            ┆ 10.619017      ┆ 99.751411      │
│ d          ┆ 962            ┆ 54.202104       ┆ 192            ┆ 10.058805 

(Deprecated in version 0.20.5)
  pl.count().alias("transaction_count"),


In [19]:
sales_with_customer = sales_df.join(
    customer_df,
    on="customer_id",
    how="inner"
)

all_segment_sales = sales_df.join(
    customer_df,
    on="customer_id",
    how="left"
)

customer_stats = sales_df.group_by("customer_id").agg([
    (pl.col("quantity") * pl.col("unit_price")).sum().alias("total_spent"),
    pl.len().alias("transaction_count")
])

print(customer_stats.head())




shape: (5, 3)
┌─────────────┬─────────────┬───────────────────┐
│ customer_id ┆ total_spent ┆ transaction_count │
│ ---         ┆ ---         ┆ ---               │
│ i32         ┆ f64         ┆ u32               │
╞═════════════╪═════════════╪═══════════════════╡
│ 65          ┆ 2835.373152 ┆ 9                 │
│ 77          ┆ 2145.307979 ┆ 8                 │
│ 75          ┆ 3351.582926 ┆ 12                │
│ 54          ┆ 2539.23265  ┆ 8                 │
│ 28          ┆ 2283.102561 ┆ 7                 │
└─────────────┴─────────────┴───────────────────┘


In [20]:
customer_enriched = customer_df.join(
    customer_stats,
    on="customer_id",
    how="left"
).with_columns([
    pl.col("total_spent").fill_null(0),
    pl.col("transaction_count").fill_null(0)
])

print(customer_enriched.head())


shape: (5, 5)
┌─────────────┬───────────────┬─────────┬─────────────┬───────────────────┐
│ customer_id ┆ customer_name ┆ segment ┆ total_spent ┆ transaction_count │
│ ---         ┆ ---           ┆ ---     ┆ ---         ┆ ---               │
│ i64         ┆ str           ┆ str     ┆ f64         ┆ u32               │
╞═════════════╪═══════════════╪═════════╪═════════════╪═══════════════════╡
│ 1           ┆ Customer_1    ┆ Basic   ┆ 1790.495711 ┆ 7                 │
│ 2           ┆ Customer_2    ┆ Basic   ┆ 4980.221253 ┆ 14                │
│ 3           ┆ Customer_3    ┆ Premium ┆ 2353.765093 ┆ 11                │
│ 4           ┆ Customer_4    ┆ Basic   ┆ 4901.1398   ┆ 12                │
│ 5           ┆ Customer_5    ┆ Basic   ┆ 3425.11899  ┆ 12                │
└─────────────┴───────────────┴─────────┴─────────────┴───────────────────┘


In [21]:
sales_with_windows = sales_df.with_columns([
    pl.col("quantity")*pl.col("unit_price").rank(method="dense",descending=True).over("region").alias("revenue_region_in_rank"),
    pl.col("unit_price").rolling_mean(window_size=3).over("customer_id").alias("customer_avg_price_3_transactions")
])

print(sales_with_windows.head())

shape: (5, 7)
┌────────────┬────────────┬─────────────┬──────────┬────────────┬────────┬─────────────────────────┐
│ date       ┆ product_id ┆ customer_id ┆ quantity ┆ unit_price ┆ region ┆ customer_avg_price_3_tr │
│ ---        ┆ ---        ┆ ---         ┆ ---      ┆ ---        ┆ ---    ┆ ansacti…                │
│ object     ┆ str        ┆ i32         ┆ i64      ┆ f64        ┆ str    ┆ ---                     │
│            ┆            ┆             ┆          ┆            ┆        ┆ f64                     │
╞════════════╪════════════╪═════════════╪══════════╪════════════╪════════╪═════════════════════════╡
│ 2024-02-21 ┆ b          ┆ 65          ┆ 84       ┆ 93.963238  ┆ East   ┆ null                    │
│ 00:00:00   ┆            ┆             ┆          ┆            ┆        ┆                         │
│ 2024-04-02 ┆ a          ┆ 55          ┆ 144      ┆ 84.475569  ┆ East   ┆ null                    │
│ 00:00:00   ┆            ┆             ┆          ┆            ┆        ┆   

In [22]:
# Creating calculated columns with complex logic
transformed_sales = sales_df.with_columns([
    # Revenue calculation
    (pl.col('quantity') * pl.col('unit_price')).alias('revenue'),
    
    # Price category
    pl.when(pl.col('unit_price') < 30)
      .then(pl.lit('Low'))
      .when(pl.col('unit_price') < 70)
      .then(pl.lit('Medium'))
      .otherwise(pl.lit('High'))
      .alias('price_category')
])
    
    # Season based on date


# Pivot table equivalent
region_product_pivot = sales_df.with_columns(
    (pl.col('quantity') * pl.col('unit_price')).alias('revenue')
).group_by(['region', 'product_id']).agg(
    pl.col('revenue').sum()
).pivot(
    values='revenue',
    index='region',
    columns='product_id'
).fill_null(0)

print("Region-Product Revenue Pivot:")
print(region_product_pivot)

Region-Product Revenue Pivot:
shape: (4, 6)
┌────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┐
│ region ┆ c            ┆ b            ┆ a            ┆ d            ┆ e            │
│ ---    ┆ ---          ┆ ---          ┆ ---          ┆ ---          ┆ ---          │
│ str    ┆ f64          ┆ f64          ┆ f64          ┆ f64          ┆ f64          │
╞════════╪══════════════╪══════════════╪══════════════╪══════════════╪══════════════╡
│ East   ┆ 15397.692089 ┆ 14070.284181 ┆ 16311.103087 ┆ 15425.098438 ┆ 17358.719308 │
│ North  ┆ 12427.733625 ┆ 11495.515809 ┆ 11205.000192 ┆ 12522.246418 ┆ 11880.225489 │
│ West   ┆ 16771.408385 ┆ 14181.448704 ┆ 16489.173653 ┆ 6604.114635  ┆ 11451.99057  │
│ South  ┆ 10303.875822 ┆ 12410.829157 ┆ 15978.487711 ┆ 16986.675514 ┆ 13431.003858 │
└────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────────┘


  ).pivot(


In [23]:
# String operations on customer names
customer_processed = customers_df.with_columns([
    # Extract customer number
    pl.col('customer_name')
    .str.extract(r'Customer_(\d+)', 1)
    .cast(pl.Int32)
    .alias('customer_number'),
    
    # Create customer code
    (pl.col('segment').str.slice(0, 3).str.to_uppercase() + 
     pl.col('customer_id').cast(pl.Utf8).str.zfill(3))
    .alias('customer_code')
])

# Date operations
date_analysis = sales_df.with_columns([
    pl.col('date').dt.year().alias('year'),
    pl.col('date').dt.month().alias('month'),
    pl.col('date').dt.day().alias('day'),
    pl.col('date').dt.weekday().alias('weekday'),
    pl.col('date').dt.quarter().alias('quarter'),
    (pl.col('date').dt.weekday() >= 6).alias('is_weekend')
])

print("Date analysis sample:")
print(date_analysis.select(['date', 'year', 'month', 'quarter', 'is_weekend']).head())

NameError: name 'customers_df' is not defined

In [24]:
# Using lazy evaluation for complex transformations
lazy_result = (
    pl.scan_csv('large_sales_data.csv')  # Use scan for large files
    .filter(pl.col('date') >= datetime(2024, 1, 1))
    .with_columns([
        (pl.col('quantity') * pl.col('unit_price')).alias('revenue')
    ])
    .group_by(['region', 'product_id'])
    .agg([
        pl.col('revenue').sum().alias('total_revenue'),
        pl.col('quantity').mean().alias('avg_quantity')
    ])
    .filter(pl.col('total_revenue') > 1000)
    .sort('total_revenue', descending=True)
    .collect()  # Execute the lazy query
)

# For in-memory DataFrames, you can also use lazy
lazy_memory_result = (
    sales_df.lazy()
    .filter(pl.col('region') == 'North')
    .group_by('product_id')
    .agg(pl.col('quantity').sum())
    .collect()
)

print("Lazy evaluation result:")
print(lazy_memory_result)



FileNotFoundError: The system cannot find the file specified. (os error 2): large_sales_data.csv

This error occurred with the following context stack:
	[1] 'csv scan'
	[2] 'filter'
	[3] 'with_columns'
	[4] 'group_by'
	[5] 'filter'
	[6] 'select'
	[7] 'sink'
