# Chapter 12: Testing and Debugging in Polars

## Debugging chained operations

### How it works...

In [None]:
import polars as pl

In [None]:
# Lazy query over the Pokemon CSV; the cells below chain operations onto it
# and call collect() when they need actual rows.
lf = pl.scan_csv('../data/pokemon.csv')

In [None]:
# Add dense ranks for Attack, Defense and Speed, keep the columns of
# interest, then sort by Total and show the first rows.
(
    lf
    .with_columns(
        pl.col('Attack').rank(method='dense').alias('Atk Rank'),
        pl.col('Defense').rank(method='dense').alias('Def Rank'),
        pl.col('Speed').rank(method='dense').alias('Spe Rank'),
    )
    .select(
        'Name',
        'Total',
        'Attack',
        'Defense',
        'Speed',
        # Regex selector for every column whose name ends in "Rank".
        # `.*` is the "anything before Rank" part; the previous pattern
        # applied `*` directly to the `^` anchor.
        pl.col('^.*Rank$'),
    )
    .sort('Total')
    .head()
    .collect()
)

In [None]:
# Same query as the previous cell, but selecting a misspelled column.
# NOTE(review): 'Deffense' looks deliberate -- it is presumably here so the
# query fails and the following cells can demonstrate how to locate the
# failing step. Confirm before "fixing" it.
(
    lf
    .with_columns(
        pl.col('Attack').rank(method='dense').alias('Atk Rank'),
        pl.col('Defense').rank(method='dense').alias('Def Rank'),
        pl.col('Speed').rank(method='dense').alias('Spe Rank'),
    )
    .select(
        'Name',
        'Total',
        'Attack',
        'Deffense',
        'Speed',
        # Regex selector for every column whose name ends in "Rank".
        # `.*` is the "anything before Rank" part; the previous pattern
        # applied `*` directly to the `^` anchor.
        pl.col('^.*Rank$'),
    )
    .sort('Total')
    .head()
    .collect()
)

In [None]:
# Debugging step: the select() call is commented out so the remaining
# steps of the chain can be run on their own.
(
    lf
    .with_columns([
        pl.col(stat).rank(method='dense').alias(rank_name)
        for stat, rank_name in (
            ('Attack', 'Atk Rank'),
            ('Defense', 'Def Rank'),
            ('Speed', 'Spe Rank'),
        )
    ])
    # .select(
    #     'Name',
    #     'Total',
    #     'Attack',
    #     'Deffense',
    #     'Speed',
    #     pl.col('^*Rank$')
    # )
    .sort(by='Total')
    .head(n=5)
    .collect()
)

In [None]:
# Debugging variant: collect() is called first, so every following step
# runs on an eager DataFrame instead of being deferred to the end.
# NOTE(review): 'Deffense' is still misspelled, presumably on purpose so
# the failure can be observed in eager mode -- confirm before changing it.
(
    lf
    .collect()
    .with_columns(
        pl.col('Attack').rank(method='dense').alias('Atk Rank'),
        pl.col('Defense').rank(method='dense').alias('Def Rank'),
        pl.col('Speed').rank(method='dense').alias('Spe Rank'),
    )
    .select(
        'Name',
        'Total',
        'Attack',
        'Deffense',
        'Speed',
        # Regex selector for every column whose name ends in "Rank".
        # `.*` is the "anything before Rank" part; the previous pattern
        # applied `*` directly to the `^` anchor.
        pl.col('^.*Rank$'),
    )
    .sort('Total')
    .head()
)

### There is more...

In [None]:
def add_ranks(lf: pl.LazyFrame) -> pl.LazyFrame:
    """Return `lf` with dense-rank columns for Attack, Defense and Speed.

    The new columns are named 'Atk Rank', 'Def Rank' and 'Spe Rank'.
    """
    # Source stat column -> name of the rank column derived from it.
    rank_names = {
        'Attack': 'Atk Rank',
        'Defense': 'Def Rank',
        'Speed': 'Spe Rank',
    }
    rank_exprs = [
        pl.col(stat).rank(method='dense').alias(rank_name)
        for stat, rank_name in rank_names.items()
    ]
    return lf.with_columns(*rank_exprs)

def keep_cols(lf: pl.LazyFrame) -> pl.LazyFrame:
    """Return `lf` restricted to the name, stat and rank columns.

    Keeps 'Name', 'Total', 'Attack', 'Defense', 'Speed' and every column
    whose name ends in "Rank".
    """
    return (
        lf
        .select(
            'Name',
            'Total',
            'Attack',
            'Defense',
            'Speed',
            # Regex selector for the rank columns. `.*` is the "anything
            # before Rank" part; the previous pattern applied `*` directly
            # to the `^` anchor.
            pl.col('^.*Rank$'),
        )
    )

# The same query, assembled from the reusable steps defined above via pipe().
(
    lf
    .pipe(add_ranks)
    .pipe(keep_cols)
    .sort(by='Total')
    .head(n=5)
    .collect()
)

## Inspecting and optimizing the query plan

### How to do it...

In [None]:
def keep_grass_or_fire(lf):
    """Keep the rows whose 'Type 1' or 'Type 2' is 'Grass' or 'Fire'."""
    wanted = ['Grass', 'Fire']
    primary_matches = pl.col('Type 1').is_in(wanted)
    secondary_matches = pl.col('Type 2').is_in(wanted)
    return lf.filter(primary_matches | secondary_matches)

In [None]:
# Draw the optimized query plan (the default) for rank -> filter -> select.
(
    lf
    .pipe(add_ranks)
    .pipe(keep_grass_or_fire)
    .pipe(keep_cols)
    .show_graph(optimized=True)
)

In [None]:
# Draw the plan for the same pipeline with the optimizer turned off.
query = lf.pipe(add_ranks).pipe(keep_grass_or_fire).pipe(keep_cols)
query.show_graph(optimized=False)

In [None]:
# Text form of the optimized query plan (the default) for the pipeline.
print(
    lf
    .pipe(add_ranks)
    .pipe(keep_grass_or_fire)
    .pipe(keep_cols)
    .explain(optimized=True)
)

In [None]:
# Text form of the query plan with the optimizer turned off.
unoptimized_plan = (
    lf
    .pipe(add_ranks)
    .pipe(keep_grass_or_fire)
    .pipe(keep_cols)
    .explain(optimized=False)
)
print(unoptimized_plan)

In [None]:
# Reordered pipeline: filter and select first, add the rank columns last,
# then draw the optimized plan.
(
    lf
    .pipe(keep_grass_or_fire)
    .pipe(keep_cols)
    .pipe(add_ranks)
    .show_graph(optimized=True)
)

### There is more...

In [None]:
# Text form of the plan for the reordered pipeline with streaming requested.
streaming_plan = (
    lf
    .pipe(keep_grass_or_fire)
    .pipe(keep_cols)
    .pipe(add_ranks)
    .explain(streaming=True)
)
print(streaming_plan)

## Testing data quality with cuallee

### How to do it...

In [None]:
from cuallee import Check, CheckLevel

In [None]:
# Materialize the lazy query; the cuallee checks below validate this frame.
df = lf.collect()
df.head(n=5)

In [None]:
# 'Name' must have no missing values and no duplicates.
check = (
    Check(CheckLevel.WARNING, 'Completeness')
    .is_complete('Name')
    .is_unique('Name')
)
check.validate(df)

In [None]:
# Distinct 'Type 1' values found in the data, as a plain Python list.
accepted_types = lf.select('Type 1').unique().collect()['Type 1'].to_list()

# Every 'Type 1' value must belong to the accepted set.
check = Check(CheckLevel.WARNING, 'Accepted Values')
check = check.is_contained_in('Type 1', set(accepted_types))
check.validate(df).select(['check', 'column', 'rule', 'status'])

In [None]:
check = Check(CheckLevel.WARNING, 'Validation on Stats')
stats_cols = [
    'HP',
    'Attack',
    'Defense',
    'Sp. Atk',
    'Sp. Def',
    'Speed',
]
res_cols = [
    'check',
    'column',
    'rule',
    'rows',
    'violations',
    'pass_rate',
    'status',
]

# `is_complete` and `is_greater_than` are single-column rules, so a list of
# columns is validated with the multi-column `are_complete` rule plus one
# `is_greater_than` rule per stat column.
check = check.are_complete(stats_cols)
for stat_col in stats_cols:
    check = check.is_greater_than(stat_col, 0)

# Show only the result columns of interest.
check.validate(df).select(res_cols)

In [None]:
# Multi-column rules: the listed columns are checked together for
# completeness and uniqueness.
cols = ['Name', 'Type 1', 'Type 2']
check = (
    Check(CheckLevel.WARNING, 'Completeness')
    .are_complete(cols)
    .are_unique(cols)
)
check.validate(df).select([
    'check',
    'column',
    'rule',
    'rows',
    'violations',
    'pass_rate',
    'status',
])

### There is more...

In [None]:
# Turn the validation outcome into a boolean so it can drive an assertion:
# True only when the first rule's status is 'PASS'.
check = Check(CheckLevel.WARNING, 'Completeness').is_complete('Type 2')
result = check.validate(df)['status'][0] == 'PASS'
assert result

## Getting started with Pytest

Please refer to `demo.py` and `test_demo.py`.