# In [3]:
import polars as pl
from datetime import datetime
import numpy as np

# A Series is a 1-dimensional data structure; all of its elements share the same data type.
s = pl.Series(name="a", values=[1, 2, 3, 4, 5])

# A DataFrame is a 2-dimensional data structure backed by one or more Series,
# and it can be seen as an abstraction of a collection (e.g. a list) of Series. Operations
# that can be executed on a DataFrame are very similar to those of a SQL-like query:
# you can GROUP BY, JOIN, and PIVOT, but also define custom functions.


# Three equally long columns: the integers 1-5, the first five days of
# January 2022, and the floats 4.0-8.0.
df = pl.DataFrame(
    {
        "integer": list(range(1, 6)),
        "date": [datetime(2022, 1, day) for day in range(1, 6)],
        "float": [float(value) for value in range(4, 9)],
    }
)


# Polars has developed its own Domain Specific Language (DSL) for transforming data.
# The language is very easy to use and allows for complex queries that remain human
# readable. The two core components of the language are Contexts and Expressions,
# the latter we will cover in the next section.

# A context, as implied by the name, refers to the context in which an expression needs
# to be evaluated. There are three main contexts:

# - Selection: df.select([..]), df.with_columns([..])
# - Filtering: df.filter()
# - Group by / Aggregation: df.group_by(..).agg([..])
#
# The examples below are performed on the following DataFrame:

# Example frame used by the context demonstrations below. "nrs" and "names"
# each hold one missing value, and "random" is redrawn on every run.
df = pl.DataFrame(
    data={
        "nrs": [1, 2, 3, None, 5],  # integers with a null in the fourth row
        "names": ["foo", "ham", "spam", "egg", None],  # strings with a trailing null
        "random": np.random.rand(5),  # five values from numpy's global generator
        "groups": ["A", "A", "B", "C", "B"],  # grouping key
    },
)

# Select
# In the select context the selection applies expressions over columns.
# The expressions in this context must produce Series that are all the same length or have a length of 1.

# A Series of a length of 1 will be broadcasted to match the height of the DataFrame.
# Note that a select may produce new columns that are aggregations, combinations of expressions, or literals.

# Four independent expressions evaluated in the selection context: the sum of
# "nrs", the sorted "names", the first name, and ten times the mean of "nrs".
out = df.select(
    pl.col("nrs").sum(),
    pl.col("names").sort(),
    pl.col("names").first().alias("first name"),
    (pl.col("nrs").mean() * 10).alias("10xnrs"),
)

# As you can see from the query the select context is very powerful and allows
# you to perform arbitrary expressions independent (and in parallel) of each other.

# Similarly to the select statement there is the with_columns statement which also
# is an entrance to the selection context. The main difference is that with_columns retains
# the original columns and adds new ones while select drops the original columns.
# Append two derived columns to the frame: the sum of "nrs" and the count of
# "random"; df is rebound to the widened frame.
df = df.with_columns(
    pl.col("nrs").sum().alias("nrs_sum"),
    pl.col("random").count().alias("count"),
)
# Filter
# In the filter context you filter the existing DataFrame based on an arbitrary expression that evaluates to the Boolean data type.

# Keep only the rows whose "nrs" value is greater than 2.
out = df.filter(pl.col("nrs").gt(2))

# Group by / aggregation
# In the group_by context, expressions work on groups and thus may yield results of any length (a group may have many members).


# Aggregate per value of "groups"; every expression below is evaluated once per group.
out = df.group_by("groups").agg(
    pl.col("nrs").sum(),  # total of nrs within the group
    pl.col("random").count().alias("count"),  # count of random within the group
    # total of random over the rows whose name is not null
    pl.col("random").filter(~pl.col("names").is_null()).sum().name.suffix("_sum"),
    pl.col("names").reverse().alias("reversed names"),  # names in reverse row order
)

# As you can see from the result, all expressions are applied to the groups defined by the group_by context.
# Besides the standard group_by, group_by_dynamic and rolling (named group_by_rolling in older Polars releases) are also entrances to the group by context.


# Expressions
# Polars has a powerful concept called expressions that is central to its very fast performance.

# Expressions are at the core of many data science operations:

# - taking a sample of rows from a column
# - multiplying values in a column
# - extracting a column of years from dates
# - converting a column of strings to lowercase
# - and so on!


# However, expressions are also used within other operations:

# - taking the mean of a group in a group_by operation
# - calculating the size of groups in a group_by operation
# - taking the sum horizontally across columns
#
# Polars performs these core data transformations very quickly by:

# - automatic query optimization on each expression
# - automatic parallelization of expressions on many columns
#
# Polars expressions are a mapping from a series to a series
# (or mathematically Fn(Series) -> Series). As expressions have a Series as an input
# and a Series as an output, it is straightforward to chain a sequence of
# expressions (similar to method chaining in Pandas).

# Examples
# The following is an expression:

# Building an expression does not evaluate anything; this only describes
# "sort column foo in ascending order, then keep the first two values".
pl.col("foo").sort(descending=False).head(2)

# The snippet above says:

# 1. Select column "foo"
# 2. Then sort the column (not in reversed order)
# 3. Then take the first two values of the sorted output
#
# The power of expressions is that every expression produces
# a new expression, and that they can be piped together. You can run an
# expression by passing it to one of the Polars execution contexts.

# Here we run two expressions with select. The `df` frame used so far has no
# "foo" or "bar" column, so selecting them from it would raise a
# ColumnNotFoundError; the query is run on a small frame that provides both.
foo_bar_df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})

# First expression: the two smallest "foo" values.
# Second expression: the sum of "bar" over the rows where "foo" equals 1.
foo_bar_df.select(
    pl.col("foo").sort().head(2),
    pl.col("bar").filter(pl.col("foo") == 1).sum(),
)

# All expressions are run in parallel, meaning that separate Polars expressions are embarrassingly parallel.
# Note that within an expression there may be more parallelization going on.