In [None]:
import os
from pathlib import Path

import duckdb
import numpy as np
import polars as pl
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds
from tqdm.auto import tqdm, trange

In [None]:
# Debugging switches for the whole session: Polars verbose logging and Rust
# backtraces, both set through process environment variables.
os.environ.update({"POLARS_VERBOSE": "1", "RUST_BACKTRACE": "1"})

# pl.lit in list.eval

In [None]:
# Repro: evaluate a bare literal (no pl.element()) inside list.eval on a
# list-of-strings column that also contains an empty list.
df = pl.DataFrame(
    {
        "col1": [["a", "b", "c"], [], ["a", "d"]],
    },
)
res = df.select(pl.col("col1")).with_columns(
    pl.col("col1").list.eval(pl.lit("c")).alias("col1_alt"),
)
print(res)

# Option::unwrap panic simple

In [None]:
# Repro (see the section heading): inside a StringCache, build a 10-row frame
# with a random List(Categorical) column (1-4 items per row, drawn from
# a/b/c/d) and a random 0/1 Int64 column, then intersect each list with a
# fixed set and cast the result back to List(Categorical).
# NOTE: the data is random and unseeded, so every run sees different rows.
with pl.StringCache():
    df = pl.DataFrame(
        {
            "col1": [
                np.random.choice(["a", "b", "c", "d"], np.random.randint(1, 5))
                for _ in range(10)
            ],
            "col2": np.random.randint(0, 2, size=10),
        },
        schema={"col1": pl.List(pl.Categorical), "col2": pl.Int64},
    )
    res = (
        df.select(pl.col("col1", "col2")).with_columns(
            pl.col("col1")
            # .list.shift(1)
            # .list.eval(pl.lit("c"))
            .list.set_intersection(["a", "b", "c"])
            .cast(
                pl.List(pl.Categorical)
            )  # necessary because https://github.com/pola-rs/polars/issues/11730
            .alias("col1_alt"),
        )
        # .with_columns(
        #     pl.when(pl.col("col2") == 1)
        #     .then(pl.col("col1_alt"))
        #     .when(pl.col("col2") == 0)
        #     .then(pl.col("col1"))
        #     .alias("col1_combined")
        # )
    )
    print(res)

# Old

In [None]:
# Older variant of the repro. Two random 10-row frames are built outside the
# StringCache: `df` (no explicit schema) and `df2` (col1 declared as
# List(Categorical)). Only `df2` feeds the chain below; `df` is not used in
# this cell.
# df = pl.scan_pyarrow_dataset(arrow_ds2)
df = pl.DataFrame(
    {
        "col1": [
            np.random.choice(["a", "b", "c", "d"], np.random.randint(1, 5))
            for _ in range(10)
        ],
        "col2": np.random.randint(0, 2, size=10),
    }
)
df2 = pl.DataFrame(
    {
        "col1": [
            np.random.choice(["a", "b", "c", "d"], np.random.randint(1, 5))
            for _ in range(10)
        ],
        "col2": np.random.randint(0, 2, size=10),
    },
    schema={"col1": pl.List(pl.Categorical), "col2": pl.Int64},
)
# Step 1: intersect with {a, b, c} and cast back to List(Categorical).
# Step 2: remap each remaining element through a fixed dict via list.eval.
with pl.StringCache():
    res = (
        df2.select(pl.col("col1", "col2"))
        .with_columns(
            pl.col("col1")
            .list.set_intersection(["a", "b", "c"])
            .cast(pl.List(pl.Categorical))
            .alias("col1_subset"),
        )
        .with_columns(
            pl.col("col1_subset")
            .list.eval(pl.element().map_dict({"a": "x", "b": "y", "c": "z"}))
            .alias("col1_subset_reversed"),
        )
        # .with_columns(
        #     pl.when(pl.col("col2") == 1)
        #     .then(pl.col("col1_subset"))
        #     # .then(None)
        #     .when(pl.col("col2") == 0)
        #     .then(pl.col("col1_subset_reversed"))
        #     .alias("col1_subset_normalized")
        # )
    )
# res
# res

In [None]:
# Variant using the a2/b2/c2/d2 vocabulary. The frame (List(Categorical) +
# Int64) is created outside the StringCache; the transformations run inside.
df = pl.DataFrame(
    {
        "col1": [
            np.random.choice(["a2", "b2", "c2", "d2"], np.random.randint(1, 5))
            for _ in range(10)
        ],
        "col2": np.random.randint(0, 2, size=10),
    },
    schema={"col1": pl.List(pl.Categorical), "col2": pl.Int64},
)
# Pipeline:
#   col1_subset            - col1 intersected with {a2, b2, c2}, re-cast
#   col1_subset_reversed   - col1 (not col1_subset) remapped through map_dict
#   col1_subset_normalized - col1 where col2 == 1, remapped list where col2 == 0
with pl.StringCache():
    res = (
        df.select(pl.col("col1", "col2"))
        .with_columns(
            pl.col("col1")
            .list.set_intersection(["a2", "b2", "c2"])
            .cast(
                pl.List(pl.Categorical)
            )  # necessary because https://github.com/pola-rs/polars/issues/11730
            .alias("col1_subset"),
        )
        .with_columns(
            pl.col("col1").list.eval(
                pl.element().map_dict({"a2": "x", "b2": "y", "c2": "z"})
            )
            # .list.eval(pl.element().cast(pl.Utf8).str.strip_chars_end("2"))
            # .cast(pl.List(pl.Categorical))
            .alias("col1_subset_reversed"),
        )
        .with_columns(
            pl.when(pl.col("col2") == 1)
            .then(pl.col("col1"))
            .when(pl.col("col2") == 0)
            .then(pl.col("col1_subset_reversed"))
            .alias("col1_subset_normalized")
        )
    )
print(res)

In [None]:
# Convert the result frame into a dict keyed by column name for inspection.
z = res.to_dict()

In [None]:
# Show which columns ended up in the result.
z.keys()

In [None]:
# Pull out the when/then output column on its own.
zz = z["col1_subset_normalized"]

In [None]:
# Materialise that column as a NumPy array (presumably to inspect the raw
# values outside Polars - TODO confirm intent).
zzz = zz.to_numpy()

In [None]:
# Debugging aid: list the attributes available on the result object.
dir(res)

In [None]:
# Debugging aid: index into the result at position 2.
res[2]

# concat_str

In [None]:
# Setup for the concat_str repro. Imports are repeated here, presumably so the
# snippet can be copied out on its own - TODO confirm.
import numpy as np
import polars as pl

# `df`: random lists with no explicit schema.
df = pl.DataFrame(
    {
        "col1": [
            np.random.choice(["a", "b", "c", "d"], np.random.randint(1, 5))
            for _ in range(10)
        ],
    },
)
# `df2`: same shape of data, declared as List(Categorical). The concat_str
# cells that follow read `df`, not `df2`.
df2 = pl.DataFrame(
    {
        "col1": [
            np.random.choice(["a", "b", "c", "d"], np.random.randint(1, 5))
            for _ in range(10)
        ]
    },
    schema={"col1": pl.List(pl.Categorical)},
)

In [None]:
# Variant 1: concat_str of each element with "_" inside list.eval, with no
# StringCache active.
res = df.select(pl.col("col1")).with_columns(
    pl.col("col1").list.eval(pl.concat_str([pl.element(), pl.lit("_")])).alias("col2")
)

In [None]:
# Variant 2: the same expression, evaluated inside a StringCache.
with pl.StringCache():
    res = df.select(pl.col("col1")).with_columns(
        pl.col("col1")
        .list.eval(pl.concat_str([pl.element(), pl.lit("_")]))
        .alias("col2")
    )

In [None]:
# Variant 3: as variant 2, but each element is explicitly cast to Utf8 before
# it is handed to concat_str.
with pl.StringCache():
    res = df.select(pl.col("col1")).with_columns(
        pl.col("col1")
        .list.eval(pl.concat_str([pl.element().cast(pl.Utf8), pl.lit("_")]))
        .alias("col2")
    )

# Option::unwrap panic

In [None]:
# Full repro for this section: intersect -> cast -> map_dict -> when/then.
# `df` has no explicit schema; `df2` is declared List(Categorical) but is not
# used by the chain below.
# df = pl.scan_pyarrow_dataset(arrow_ds2)
df = pl.DataFrame(
    {
        "col1": [
            np.random.choice(["a", "b", "c", "d"], np.random.randint(1, 5))
            for _ in range(10)
        ],
        "col2": np.random.randint(0, 2, size=10),
    }
)
df2 = pl.DataFrame(
    {
        "col1": [
            np.random.choice(["a", "b", "c", "d"], np.random.randint(1, 5))
            for _ in range(10)
        ]
    },
    schema={"col1": pl.List(pl.Categorical)},
)
# col1_subset_normalized picks col1_subset where col2 == 1 and the remapped
# list where col2 == 0.
with pl.StringCache():
    res = (
        df.select(pl.col("col1", "col2"))
        .with_columns(
            pl.col("col1")
            .list.set_intersection(["a", "b", "c"])
            .cast(pl.List(pl.Categorical))
            .alias("col1_subset"),
        )
        .with_columns(
            pl.col("col1_subset")
            .list.eval(pl.element().map_dict({"a": "x", "b": "y", "c": "z"}))
            .alias("col1_subset_reversed"),
        )
        .with_columns(
            pl.when(pl.col("col2") == 1)
            .then(pl.col("col1_subset"))
            # .then(None)
            .when(pl.col("col2") == 0)
            .then(pl.col("col1_subset_reversed"))
            .alias("col1_subset_normalized")
        )
    )
res

In [None]:
# Fresh random input without an explicit schema: a list column (1-4 items
# drawn from a/b/c/d) and a 0/1 integer column, 10 rows.
df = pl.DataFrame(
    {
        "col1": [
            np.random.choice(["a", "b", "c", "d"], np.random.randint(1, 5))
            for _ in range(10)
        ],
        "col2": np.random.randint(0, 2, size=10),
    }
)

In [None]:
# Reduced repro: the intersection/cast and map_dict steps are commented out.
# col1_subset is just an alias of col1, the "reversed" column appends "_" to
# every element, and the when/then at the end is kept unchanged.
with pl.StringCache():
    res = (
        df.select(pl.col("col1", "col2"))
        .with_columns(
            pl.col("col1")
            # .list.set_intersection(["a", "b", "c"])
            # .cast(pl.List(pl.Categorical))
            .alias("col1_subset"),
        )
        .with_columns(
            pl.col("col1_subset")
            # .list.reverse()
            # .list.eval(pl.element().map_dict({"a": "x", "b": "y", "c": "z"}))
            .list.eval(pl.element() + "_").alias("col1_subset_reversed"),
        )
        .with_columns(
            pl.when(pl.col("col2") == 1)
            .then(pl.col("col1_subset"))
            # .then(None)
            .when(pl.col("col2") == 0)
            .then(pl.col("col1_subset_reversed"))
            .alias("col1_subset_normalized")
        )
    )
res

In [None]:
# Further reduction: only the list.eval(concat_str(...)) step remains; the
# when/then stage is commented out and the result is not displayed.
# df = pl.scan_pyarrow_dataset(arrow_ds2)
df = pl.DataFrame(
    {
        "col1": [
            np.random.choice(["a", "b", "c", "d"], np.random.randint(1, 5))
            for _ in range(10)
        ],
        "col2": np.random.randint(0, 2, size=10),
    }
)
with pl.StringCache():
    res = (
        df.select(pl.col("col1", "col2")).with_columns(
            pl.col("col1")
            # .list.reverse()
            # .list.eval(pl.element().map_dict({"a": "x", "b": "y", "c": "z"}))
            .list.eval(pl.concat_str([pl.element(), pl.lit("_")])).alias("col3"),
        )
        # .with_columns(
        #     pl.when(pl.col("col2") == 1)
        #     .then(pl.col("col1_subset"))
        #     # .then(None)
        #     .when(pl.col("col2") == 0)
        #     .then(pl.col("col1_subset_reversed"))
        #     .alias("col1_subset_normalized")
        # )
    )
# res

In [None]:
# Original real-data pipeline that the synthetic repros above were reduced from.
# NOTE(review): this cell depends on state that is not defined above it in the
# notebook:
#   - `df` must be a lazy frame with a "path" column (the chain ends in
#     .collect()); presumably pl.scan_pyarrow_dataset(arrow_ds2), which is only
#     created in a later cell - TODO confirm.
#   - `bc_segments`, `reverse_path_mapping` and `bc_segments_oriented` are not
#     defined anywhere in the visible notebook.
# Run as-is after a kernel restart, it is expected to fail on those names.
# df = pl.scan_pyarrow_dataset(arrow_ds2)
with pl.StringCache():
    res = (
        df.select(pl.col("path"))
        .limit(10)
        .with_columns(
            pl.col("path")
            .list.set_intersection(bc_segments)
            .cast(pl.List(pl.Categorical))
            .alias("path_subset"),
        )
        .with_columns(
            pl.col("path_subset")
            # .list.reverse()
            .list.eval(pl.element().map_dict(reverse_path_mapping))
            # .cast(pl.List(pl.Categorical))
            # .list.reverse()
            .alias("path_subset_reversed"),
            # (
            #     pl.col("path_subset")
            #     .list.set_intersection(bc_segments_oriented[0])
            #     .list.len()
            #     > 0
            # ).alias("is_forward"),
            # (
            #     pl.col("path_subset")
            #     .list.set_intersection(bc_segments_oriented[1])
            #     .list.len()
            #     > 0
            # ).alias("is_reverse"),
        )
        .with_columns(
            # is_forward: the subset shares at least one element with
            # bc_segments_oriented[0].
            (
                pl.col("path_subset")
                .list.set_intersection(bc_segments_oriented[0])
                .list.len()
                > 0
            ).alias("is_forward"),
            # (
            #     pl.col("path_subset")
            #     .list.set_intersection(bc_segments_oriented[1])
            #     .list.len()
            #     > 0
            # ).alias("is_reverse"),
        )
        .with_columns(
            # Keep the subset as-is for forward rows, use the remapped list
            # otherwise.
            pl.when(pl.col("is_forward") == True)
            .then(pl.col("path_subset"))
            # .then(None)
            .when(pl.col("is_forward") == False)
            .then(pl.col("path_subset_reversed"))
            .alias("path_subset_normalized")
        )
    ).collect()
res

# List functions on List(Categorical)

In [None]:
# Directory containing the *.arrow files used by the cells below.
# The original machine-specific location remains the default; set the
# POLARS_REPRO_DATA_DIR environment variable to run the notebook against a
# copy of the data stored elsewhere.
data_dir = Path(
    os.environ.get(
        "POLARS_REPRO_DATA_DIR",
        "/home/jqs1/scratch/jqs1/sequencing/230930_alignment_test/230707_repressilators/",
    )
)

In [None]:
# Dataset over every *.arrow file in data_dir. glob() yields entries in an
# unspecified order, so sort them to get the same file order on every run.
arrow_ds = ds.dataset(sorted(data_dir.glob("*.arrow")), format="arrow")

In [None]:
# Small two-file dataset for quick experiments. Sort before slicing: glob()
# order is unspecified, so an unsorted [:2] could pick different files per run.
arrow_ds2 = ds.dataset(sorted(data_dir.glob("*.arrow"))[:2], format="arrow")

In [None]:
# Grab only the first record batch from the small dataset.
batch = next(arrow_ds2.to_batches())

In [None]:
# The "path" column of that batch, as an Arrow array.
x = batch.column("path")

In [None]:
# Show the Arrow data type of the "path" column.
x.type

In [None]:
%%time
# Wrap the two-file Arrow dataset in a Polars scan; `df` is rebound to it.
df = pl.scan_pyarrow_dataset(arrow_ds2)

In [None]:
%%time
# On the first 5 rows: intersect "path" with two fixed segment names and cast
# the result back to List(Categorical), then collect.
df.select(pl.col("path")).limit(5).with_columns(
    pl.col("path")
    .list.set_intersection(["<UNS9", "<BC:T7_TERM"])
    .cast(pl.List(pl.Categorical))
    .alias("path_subset")
).collect()

In [None]:
%%time
# On the first 5 rows: reverse each "path" list, then remap elements through
# map_dict inside list.eval. Elements missing from the mapping take the
# default=pl.first() expression.
df = pl.scan_pyarrow_dataset(arrow_ds2)
with pl.StringCache():
    res = (
        df.select(pl.col("path"))
        .limit(5)
        .with_columns(
            # pl.col("path").list.set_intersection(["<UNS9", "<BC:T7_TERM"]).cast(pl.List(pl.Categorical)).alias("path_subset"),
            pl.col("path")
            .list.reverse()
            .list.eval(
                pl.element().map_dict(
                    {">BC:JUNCTION": "<BC:BIT0=0"}, default=pl.first()
                )
            )
            .alias("path_reversed")
        )
        .collect()
    )
res

# List functions for categoricals

In [None]:
# Build a 10-row Arrow column of list<dictionary<int16, string>> with random
# a/b/c values (1-4 per row) and wrap it in a single-column table.
col = pa.array(
    [np.random.choice(["a", "b", "c"], np.random.randint(1, 5)) for _ in range(10)],
    type=pa.list_(pa.dictionary(pa.int16(), pa.string())),
)
tbl = pa.Table.from_pydict(dict(col=col))

In [None]:
# Import the Arrow table into Polars and try list.set_union on the
# dictionary-encoded list column.
df = pl.from_arrow(tbl)
res = df.select(pl.col("col")).with_columns(
    pl.col("col").list.set_union(["b", "c"]).alias("col_subset")
)

In [None]:
# list.get(0) after casting the string lists to List(Categorical); the second
# row is an empty list. NOTE: this rebinds `df` to a frame whose only column
# is "foo".
df = pl.DataFrame({"foo": [["x", "y", "z"], [], ["x", "a"]]})
df.select(pl.col("foo").cast(pl.List(pl.Categorical)).list.get(0))

In [None]:
# Baseline for comparison: the same list.get(0) on plain string lists,
# without the Categorical cast.
df = pl.DataFrame({"foo": [["x", "y", "z"], [], ["x", "a"]]})
df.select(pl.col("foo").list.get(0))

In [None]:
# list.get(0) on the dictionary-encoded list column from the Arrow table.
# The preceding cells rebind `df` to a frame whose only column is "foo", so
# selecting "col" from `df` would not resolve; build the input from `tbl`
# explicitly instead of relying on leftover kernel state.
res = pl.from_arrow(tbl).select(pl.col("col")).with_columns(
    pl.col("col").list.get(0).alias("col_subset")
)

In [None]:
# list.first() on the dictionary-encoded list column from the Arrow table.
# `df` was rebound earlier to a frame that only has a "foo" column, so take
# the input straight from `tbl` rather than depending on which cell last
# assigned `df`.
res = pl.from_arrow(tbl).select(pl.col("col")).with_columns(
    pl.col("col").list.first().alias("col_subset")
)

In [None]:
# Display the most recent result.
res

In [None]:
# Re-import the Arrow table and intersect each list with {b, c}, casting the
# result back to List(Categorical).
df = pl.from_arrow(tbl)
res2 = df.select(pl.col("col")).with_columns(
    pl.col("col")
    .list.set_intersection(["b", "c"])
    .cast(pl.List(pl.Categorical))
    .alias("col_subset")
)

In [None]:
# Display the intersection result.
res2

# map_dict

In [None]:
# Fresh random Arrow input for the map_dict test: 10 rows of
# list<dictionary<int16, string>> drawn from a/b/c.
col = pa.array(
    [np.random.choice(["a", "b", "c"], np.random.randint(1, 5)) for _ in range(10)],
    type=pa.list_(pa.dictionary(pa.int16(), pa.string())),
)
tbl = pa.Table.from_pydict(dict(col=col))

In [None]:
# Inside a StringCache: import the table and remap "a" -> "z" per element via
# list.eval + map_dict; other elements take the default=pl.first() expression.
with pl.StringCache():
    df = pl.from_arrow(tbl)
    res = df.select(pl.col("col")).with_columns(
        pl.col("col")
        .list.eval(pl.element().map_dict({"a": "z"}, default=pl.first()))
        .alias("col_mapped")
    )

In [None]:
# Display the map_dict result.
res

# count_matches

In [None]:
# count_matches repro, variant 1: frame built directly in Polars with a
# List(Categorical) schema and no StringCache. Imports are repeated here,
# presumably so the snippet can be copied out on its own - TODO confirm.
import numpy as np
import polars as pl
import pyarrow as pa

df = pl.DataFrame(
    {
        "col": [
            np.random.choice(["a", "b", "c"], np.random.randint(1, 5))
            for _ in range(10)
        ]
    },
    schema={"col": pl.List(pl.Categorical)},
)
# Count how many times "b" occurs in each list.
res = df.select(pl.col("col")).with_columns(
    pl.col("col").list.count_matches("b").alias("count")
)

In [None]:
# count_matches repro, variant 2: two dictionary-encoded list columns built
# in Arrow (col from a/b/c, col2 from a/b/c/d), imported into Polars inside a
# StringCache.
import numpy as np
import polars as pl
import pyarrow as pa

col = pa.array(
    [np.random.choice(["a", "b", "c"], np.random.randint(1, 5)) for _ in range(10)],
    type=pa.list_(pa.dictionary(pa.int16(), pa.string())),
)
col2 = pa.array(
    [
        np.random.choice(["a", "b", "c", "d"], np.random.randint(1, 5))
        for _ in range(10)
    ],
    type=pa.list_(pa.dictionary(pa.int16(), pa.string())),
)
tbl = pa.Table.from_pydict(dict(col=col, col2=col2))
with pl.StringCache():
    df = pl.from_arrow(tbl)
    # Count occurrences of "b" in `col`; `col2` is carried along unchanged.
    res = df.select(pl.col("col", "col2")).with_columns(
        pl.col("col").list.count_matches("b").alias("col_subset")
    )

In [None]:
# Display the count_matches result.
res

In [None]:
# Displays the same `res` again (duplicate of the previous cell).
res

# list.get/.first/.last

In [None]:
# list.get(0) repro on a List(Categorical) column, with both frame creation
# and the expression inside a StringCache. Imports are repeated, presumably
# so the snippet is self-contained - TODO confirm.
import numpy as np
import polars as pl
import pyarrow as pa

with pl.StringCache():
    df = pl.DataFrame(
        {
            "col": [
                np.random.choice(["a", "b", "c"], np.random.randint(1, 5))
                for _ in range(10)
            ]
        },
        schema={"col": pl.List(pl.Categorical)},
    )
    res = df.select(pl.col("col")).with_columns(pl.col("col").list.get(0).alias("col2"))

In [None]:
# Display the list.get result.
res

# list.set_*

In [None]:
# list.set_intersection repro on List(Categorical), inside a StringCache.
# Produces two results for comparison:
#   res  - raw output of set_intersection
#   res2 - the same, cast back to List(Categorical)
import numpy as np
import polars as pl
import pyarrow as pa

# col = pa.array(
#     [np.random.choice(["a", "b", "c"], np.random.randint(1, 5)) for _ in range(10)],
#     type=pa.list_(pa.dictionary(pa.int16(), pa.string())),
# )
# tbl = pa.Table.from_pydict(dict(col=col))
# df = pl.from_arrow(tbl)
with pl.StringCache():
    df = pl.DataFrame(
        {
            "col": [
                np.random.choice(["a", "b", "c"], np.random.randint(1, 5))
                for _ in range(10)
            ]
        },
        schema={"col": pl.List(pl.Categorical)},
    )
    res = df.select(pl.col("col")).with_columns(
        pl.col("col").list.set_intersection(["b", "c"]).alias("col_subset")
    )
    res2 = df.select(pl.col("col")).with_columns(
        pl.col("col")
        .list.set_intersection(["b", "c"])
        .cast(pl.List(pl.Categorical))
        .alias("col_subset")
    )

In [None]:
# Display the un-cast intersection result.
res

In [None]:
# Display the intersection result after the cast to List(Categorical).
res2

In [None]:
# Print Polars and dependency versions (useful to attach to a bug report).
pl.show_versions()