Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 35 additions & 35 deletions polars/09_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
app = marimo.App(width="medium")


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -30,7 +30,7 @@ def _(mo):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -43,7 +43,7 @@ def _(mo):
return


@app.cell(hide_code=True)
@app.cell
def _(pl):
pip_metadata_raw_df = pl.DataFrame(
[
Expand All @@ -56,7 +56,7 @@ def _(pl):
return (pip_metadata_raw_df,)


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""We can use the [`json_decode`](https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.str.json_decode.html) expression to parse the raw JSON strings into Polars-native structs and we can use the [unnest](https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.unnest.html) dataframe operation to have a dedicated column per parsed attribute.""")
return
Expand All @@ -69,13 +69,13 @@ def _(pip_metadata_raw_df, pl):
return (pip_metadata_df,)


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""This is already a much friendlier representation of the data we started out with, but note that since the JSON entries had only string attributes, all values are strings, even the temporal `released_at` and numerical `size_mb` columns.""")
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""As we know that the `size_mb` column should have a decimal representation, we go ahead and use [`to_decimal`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_decimal.html#polars.Expr.str.to_decimal) to perform the conversion.""")
return
Expand All @@ -91,7 +91,7 @@ def _(pip_metadata_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand Down Expand Up @@ -127,7 +127,7 @@ def _(pip_metadata_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""Alternatively, instead of using three different functions to perform the conversion to date, we can use a single one, [`strptime`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.strptime.html) which takes the desired temporal data type as its first parameter.""")
return
Expand All @@ -145,7 +145,7 @@ def _(pip_metadata_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""And to wrap up this section on parsing and conversion, let's consider a final scenario. What if we don't want to parse the entire raw JSON string, because we only need a subset of its attributes? Well, in this case we can leverage the [`json_path_match`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.json_path_match.html) expression to extract only the desired attributes using standard [JSONPath](https://goessner.net/articles/JsonPath/) syntax.""")
return
Expand All @@ -163,7 +163,7 @@ def _(pip_metadata_raw_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand Down Expand Up @@ -217,7 +217,7 @@ def list_expr_meta() -> list[dict]:
return expressions_df, list_expr_meta, list_members


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""As the following visualization shows, `str` is one of the richest Polars expression namespaces with multiple dozens of functions in it.""")
return
Expand All @@ -232,7 +232,7 @@ def _(alt, expressions_df):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand Down Expand Up @@ -260,7 +260,7 @@ def _(expressions_df, pl):
return (docstring_length_df,)


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""As the dataframe preview above and the scatterplot below show, the docstring length measured in bytes is almost always bigger than the length expressed in characters. This is due to the fact that the docstrings include characters which require more than a single byte to represent, such as "╞" for displaying dataframe header and body separators.""")
return
Expand All @@ -276,7 +276,7 @@ def _(alt, docstring_length_df):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -298,7 +298,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand Down Expand Up @@ -338,7 +338,7 @@ def _(mo, padded_df, padding):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -362,7 +362,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -388,7 +388,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -412,7 +412,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -434,7 +434,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -460,7 +460,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""For scenarios where we want to combine multiple substrings to check for, we can use the [`contains`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.contains.html) expression to check for the presence of various patterns.""")
return
Expand All @@ -476,7 +476,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand Down Expand Up @@ -506,7 +506,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""A related application example is to *find* the first index where a particular pattern is present, so that it can be used for downstream processing such as slicing. Below we use the [`find`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.find.html) expression to determine the index at which a code example starts in the docstring - identified by the Python shell substring `">>>"`.""")
return
Expand All @@ -522,7 +522,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand Down Expand Up @@ -562,7 +562,7 @@ def _(mo, slice, sliced_df):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -589,7 +589,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""As a more practical example, we can use the `split` expression with some aggregation to count the number of times a particular word occurs in member names across all namespaces. This enables us to create a word cloud of the API members' constituents!""")
return
Expand Down Expand Up @@ -643,7 +643,7 @@ def _(alt, expressions_df, pl, random, wordcloud_height, wordcloud_width):
return wordcloud, wordcloud_df


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand Down Expand Up @@ -677,7 +677,7 @@ def _(expressions_df, pl):
return (descriptions_df,)


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand Down Expand Up @@ -706,7 +706,7 @@ def _(descriptions_df, mo, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand Down Expand Up @@ -734,7 +734,7 @@ def _(expressions_df, pl):
return (url_pattern,)


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -758,7 +758,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -783,7 +783,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -807,7 +807,7 @@ def _(expressions_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand All @@ -830,7 +830,7 @@ def _(expressions_df, pl):
return (encoded_df,)


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""And of course, you can convert back into a human-readable representation using the [`decode`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.decode.html) expression.""")
return
Expand All @@ -845,7 +845,7 @@ def _(encoded_df, pl):
return


@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(
r"""
Expand Down
1 change: 1 addition & 0 deletions polars/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ You can also open notebooks in our online playground by appending marimo.app/ to
Thanks to all our notebook authors!

* [Koushik Khan](https://github.com/koushikkhan)
* [Péter Gyarmati](https://github.com/peter-gy)