In [1]:
import os
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np

import dask
from dask import dataframe as dd
from dask import array as da

import polars as pl

data_path = Path(os.getcwd()).parent/"data"

In [2]:
pf = pl.scan_csv(data_path/"credit_card_transactions-ibm_v2.csv")

In [6]:
pf.collect_schema()

Schema([('User', Int64),
        ('Card', Int64),
        ('Year', Int64),
        ('Month', Int64),
        ('Day', Int64),
        ('Time', String),
        ('Amount', String),
        ('Use Chip', String),
        ('Merchant Name', Int64),
        ('Merchant City', String),
        ('Merchant State', String),
        ('Zip', Float64),
        ('MCC', Int64),
        ('Errors?', String),
        ('Is Fraud?', String)])

In [51]:
(pf
 .select(pl.col("Amount"),
         pl.col("Amount").str.extract(r"(\d[\d.]*)").cast(pl.Float64).alias("amount"))
 .head()
 ).collect()

Amount,amount
str,f64
"""$134.09""",134.09
"""$38.48""",38.48
"""$120.34""",120.34
"""$128.95""",128.95
"""$104.71""",104.71


In [52]:
(pf
 .with_columns(
      pl.col("Amount").str.extract(r"(\d[\d.]*)").cast(pl.Float64).alias("amount"))
 .group_by("Zip")
 .agg(pl.len(), 
      pl.col("User").n_unique(), 
      pl.col("amount").sum()
      )
 ).collect()

Zip,len,User,amount
f64,u32,u32,f64
95969.0,1610,27,85268.52
4860.0,15,4,394.98
48334.0,152,20,16776.63
66872.0,10,5,522.37
85546.0,144,20,10198.56
…,…,…,…
45682.0,85,6,3697.2
90242.0,551,127,247202.1
32063.0,82,4,3520.68
46750.0,11634,15,682713.01
