In [1]:
import numpy as np
import polars as pl

# Synthetic building records. A fixed seed keeps the generated values
# identical on every Restart & Run All.
num_rows = 5000
rng = np.random.default_rng(seed=7)

# The three draws share one generator, so their order determines the values
# each column receives -- keep sqft, year, building_type in this sequence.
buildings_data = dict(
    # Exponentially distributed floor areas with a scale of 1000.
    sqft=rng.exponential(scale=1000, size=num_rows),
    # Integer years; `high` is exclusive, so values run from 1995 to 2022.
    year=rng.integers(low=1995, high=2023, size=num_rows),
    # One of three category labels per row.
    building_type=rng.choice(["A", "B", "C"], size=num_rows),
)
buildings = pl.DataFrame(buildings_data)
buildings

sqft,year,building_type
f64,i64,str
707.529256,1996,"""C"""
1025.203348,2020,"""C"""
568.548657,2012,"""A"""
895.109864,2000,"""A"""
206.532754,2011,"""A"""
3383.637351,2018,"""B"""
9.753627,2007,"""A"""
2809.215763,2004,"""B"""
575.332756,2021,"""A"""
300.534013,2003,"""A"""


In [2]:
# Show each column name together with its Polars dtype.
buildings.schema

OrderedDict([('sqft', Float64), ('year', Int64), ('building_type', String)])

In [3]:
# Preview the first rows of the frame (five rows when no count is passed).
buildings.head()

sqft,year,building_type
f64,i64,str
707.529256,1996,"""C"""
1025.203348,2020,"""C"""
568.548657,2012,"""A"""
895.109864,2000,"""A"""
206.532754,2011,"""A"""


In [4]:
# Summary statistics per column: count, null count, mean, std, min, max and
# quartiles. The string column only reports count, null count, min and max.
buildings.describe()

describe,sqft,year,building_type
str,f64,f64,str
"""count""",5000.0,5000.0,"""5000"""
"""null_count""",0.0,0.0,"""0"""
"""mean""",994.094456,2008.5258,
"""std""",1016.641569,8.062353,
"""min""",1.133256,1995.0,"""A"""
"""25%""",286.807549,2001.0,
"""50%""",669.406964,2009.0,
"""75%""",1342.909782,2015.0,
"""max""",9307.793917,2022.0,"""C"""


In [5]:
# Select the sqft column by passing its name as a plain string.
buildings.select("sqft")

sqft
f64
707.529256
1025.203348
568.548657
895.109864
206.532754
3383.637351
9.753627
2809.215763
575.332756
300.534013


In [6]:
# Select the sqft column again, this time through an explicit pl.col()
# expression; the displayed result matches the string-based selection.
buildings.select(pl.col("sqft"))

sqft
f64
707.529256
1025.203348
568.548657
895.109864
206.532754
3383.637351
9.753627
2809.215763
575.332756
300.534013


In [7]:
# Sort sqft in ascending order, then divide every value by 1000.
buildings.select(pl.col("sqft").sort() / 1000)

sqft
f64
0.001133
0.001152
0.001429
0.001439
0.001505
0.001597
0.001747
0.00195
0.002005
0.002024


In [9]:
# Keep only rows whose year is strictly greater than 2015.
after_2015 = buildings.filter(pl.col("year") > 2015)
# Check the filter: the smallest remaining year should be 2016.
after_2015.select(pl.col("year").min())

year
i64
2016


In [10]:
# Per-building-type summary: mean sqft, median year, and the number of rows
# in each group.
#
# `group_by` is the current spelling; the older `groupby` alias is deprecated
# and emits a DeprecationWarning on recent Polars releases.
# NOTE(review): `pl.count()` is deprecated in favour of `pl.len()` in later
# Polars releases -- switch to `pl.len().alias("count")` once the installed
# version provides it.
(
    buildings.group_by("building_type")
    .agg(
        pl.mean("sqft").alias("mean_sqft"),
        pl.median("year").alias("median_year"),
        pl.count(),
    )
    # Group order is not guaranteed, so sort to keep the table reproducible.
    .sort("building_type")
)


building_type,mean_sqft,median_year,count
str,f64,f64,u32
"""A""",989.539918,2009.0,1653
"""B""",992.754444,2009.0,1655
"""C""",999.854722,2009.0,1692


In [None]:
# Derive price per square foot, then keep rows where it exceeds 100 and the
# year is before 2010.
#
# NOTE(review): `buildings_lazy` is not defined in the cells shown here, and
# the eager `buildings` frame has no "price" column. Confirm that an earlier
# cell creates `buildings_lazy` (presumably a pl.LazyFrame) with sqft, price
# and year columns before running this cell.
lazy_query = (
    buildings_lazy
    .with_columns(
        (pl.col("price") / pl.col("sqft")).alias("price_per_sqft")
    )
    .filter(pl.col("price_per_sqft") > 100)
    .filter(pl.col("year") < 2010)
)
lazy_query