In [None]:
%pip install buckaroo
# Best-effort Colab setup: turn on Colab's custom widget manager
# (presumably required for the Buckaroo widget to render there -- TODO confirm).
try:
    from google.colab import output
    output.enable_custom_widget_manager()
except Exception as e:
    # Any failure (e.g. the import failing outside Colab) is only reported,
    # so the notebook keeps running.
    print(e)

In [None]:
import numpy as np
import pandas as pd
import buckaroo

# Demonstrating Buckaroo on Citibike data.
This might take a little time to download

*once the view loads click 0's and 1's on the top left to toggle different parts of the UI*

In [None]:
# Load the 201401 Citibike trip data; pandas reads the zipped CSV directly from the URL.
df = pd.read_csv("https://s3.amazonaws.com/tripdata/201401-citibike-tripdata.zip")
#df = pd.read_csv("/Users/paddy/code/example-notebooks/citibike-trips.csv") #for airplane work
# Last expression of the cell: display the dataframe.
df

## Notice the monospaced numeric formatting in the following dataframe

In [None]:
# Two integer columns whose values span different numbers of digits, so the
# alignment of the numeric formatting is easy to compare row by row.
num_df = pd.DataFrame({'a':[111_111,  77_777, 777_777, 1_000_000, 2_111_111, 1_235_999],
                       'b':[111_111, 555_555,       0,    28_123,   482_388,     5_666]})
num_df

## Histograms

Histograms are built into Buckaroo.  They enable users to quickly identify distributions of data in columns

In [None]:
#these are some utility functions for generating random distributions
#execute and ignore this cell
import numpy as np 
def bimodal(mean_1, mean_2, N, sigma=5):
    """Draw N samples from a mixture of two normal distributions.

    The first ``N // 2`` samples come from ``Normal(mean_1, sigma)`` and the
    remaining samples from ``Normal(mean_2, sigma)``; the two runs are
    concatenated in that order (not shuffled).

    Parameters
    ----------
    mean_1, mean_2 : float
        Centers of the two modes.
    N : int
        Total number of samples to return.
    sigma : float, default 5
        Standard deviation shared by both modes.

    Returns
    -------
    numpy.ndarray
        1-D array of exactly ``N`` samples.
    """
    total = int(N)
    first_count = total // 2
    # Give the second mode the remainder so that an odd N still yields N
    # samples; drawing N/2 twice would silently drop one sample and make the
    # column shorter than the other length-N columns it is combined with.
    second_count = total - first_count
    X1 = np.random.normal(mean_1, sigma, first_count)
    X2 = np.random.normal(mean_2, sigma, second_count)
    return np.concatenate([X1, X2])

def rand_cat(named_p, na_per, N):
    """Sample categorical values, including ``pd.NA``, by relative weight.

    ``named_p`` maps each label to its share and ``na_per`` is the share of
    ``pd.NA``.  The shares are normalized by their sum to form probabilities,
    and ``floor(sum_of_shares * N)`` values are drawn one at a time.

    Returns a plain list of the drawn values, or ``[]`` when the combined
    share is not positive.
    """
    share_sum = sum(named_p.values()) + na_per
    draw_count = int(np.floor(share_sum * N))
    if not share_sum > 0:
        return []

    # pd.NA is always the last candidate, weighted by na_per
    labels = list(named_p.keys()) + [pd.NA]
    weights = [share / share_sum for share in named_p.values()]
    weights.append(na_per / share_sum)

    drawn = []
    for _ in range(draw_count):
        drawn.append(np.random.choice(labels, p=weights))
    return drawn

def random_categorical(named_p, unique_per, na_per, longtail_per, N):
    """Build a shuffled array of categorical test values.

    The array is assembled from three parts:
      * values sampled by ``rand_cat(named_p, na_per, N)``,
      * "long_<i>" labels, each appearing exactly twice, with
        ``floor(longtail_per * N) // 2`` distinct labels,
      * "unique_<i>" labels, one each, for whatever count is still missing
        to reach ``N`` (none if the first two parts already reach ``N``).

    Note: ``unique_per`` is accepted but never read by this function.

    Returns the combined values as a shuffled numpy array.
    """
    sampled = rand_cat(named_p, na_per, N)

    pair_count = int(np.floor(longtail_per * N)) // 2
    # every long-tail label is emitted twice, back to back
    filler = ["long_%d" % idx for idx in range(pair_count) for _ in range(2)]

    missing = N - (len(filler) + len(sampled))
    filler.extend("unique_%d" % idx for idx in range(missing))

    combined = np.concatenate([sampled, filler])
    np.random.shuffle(combined)
    return combined
# Number of rows generated for each synthetic demo column in the cells below.
N = 4000

## Common histogram shapes
The following shows the most common shapes you will see in histograms, allowing you to quickly identify patterns

Notice the three columns on the right.  Those are categorical histograms as opposed to numerical histograms

In [None]:
# One column per histogram shape; every column has N rows.
# The cell's value is the DataFrame itself, so it is displayed when the cell runs.
pd.DataFrame({
    'normal': np.random.normal(25, .3, N),
    'exponential' :  np.random.exponential(1.0, N) * 10 ,
    'increasing':[i for i in range(N)],
    'one': [1]*N,
    'dominant_categories':     random_categorical({'foo': .6, 'bar': .25, 'baz':.15}, unique_per=0, na_per=0, longtail_per=0, N=N),
    # with no named categories, no NA and no longtail, every value is a distinct "unique_<i>" label
    'all_unique_cat': random_categorical({}, unique_per=1, na_per=0, longtail_per=0, N=N)})

## Categorical histograms
Categorical histograms have special colors and patterns for `NA`/`NaN`, `longtail` (values that occur at least twice) and `unique`
Categorical histograms are always arranged from most frequent on the left to least frequent on the right.

When a column is numerical but has fewer than 5 distinct values, it is displayed with a categorical histogram, because the numbers were probably flags.

In [None]:
# Columns exercising the NA / longtail / unique categories of the categorical histogram.
# NOTE: random_categorical never reads `unique_per`; the number of unique values is
# whatever remains after the named, NA and longtail shares have been filled.
pd.DataFrame({
    'all_NA' :          [pd.NA] * N,
    'half_NA' :         random_categorical({1: .5}, unique_per=0,   na_per=.5, longtail_per=.0, N=N),
    'dominant_categories':     random_categorical({'foo': .6, 'bar': .25, 'baz':.15}, unique_per=0, na_per=0, longtail_per=0, N=N),
    'longtail' :        random_categorical({},      unique_per=0,   na_per=.2, longtail_per=.8, N=N),
    'longtail_unique' : random_categorical({},      unique_per=0.5, na_per=.0, longtail_per=.5, N=N)})

## Notice the different distributions of the numeric histograms

In [None]:
# Five numeric columns of N rows each, drawn from different distributions,
# so their histograms can be compared side by side.
pd.DataFrame({
    'bimodal' :  bimodal(20,40, N),
    'exponential' :  np.random.exponential(1.0, N) * 10 ,
    'geometric': np.random.geometric(.2, N) * 10,
    'log_normal': np.random.lognormal(25, .3, N),
    'normal': np.random.normal(25, .3, N),})

# Auto typing
Click over to summary stats and notice the difference between the dtypes on these two tables.
Notice that the birth year for the first table includes `\n`; autoTyping was turned off for this widget.


In [None]:
# Short alias; later cells also refer to the bare name `BuckarooWidget`.
BuckarooWidget = buckaroo.BuckarooWidget
# Widget over the Citibike data with auto typing and the commands UI both switched off.
w = BuckarooWidget(df, autoType=False, showCommands=False)
w

# Turning off autocleaning by column
This widget has been configured to have the low code UI on by default.
Observe that the cleaning operation for each column has been added,
and can be removed with the X

In [None]:
# Same data, this time with the commands (low code) UI shown.
# Relies on the `BuckarooWidget` alias defined in an earlier cell.
w = BuckarooWidget(df, showCommands=True)
w

# Adding a summary stat
Buckaroo is completely customizable.  In the next cells we will add `Variance` to an instance of the BuckarooWidget with the `Pluggable Analysis Framework`.

## Why was the Pluggable Analysis Framework built?
The `Pluggable Analysis Framework` is engineered to allow summary_stats to be built up piecemeal and incrementally.  Traditionally when writing bits of analysis code, the tendency is to have large brittle functions that do a lot at once.  Adding extra stats either requires copying and pasting the existing function with one small addition, writing each stat independently and possibly recomputing existing stats, having a strictly ordered set of analysis functions, or some complex ad hoc argument passing scheme.  I have written ad hoc versions in each of these patterns.  Problems are manifest and the apparatus rarely survives even copy-pasting to the next notebook.

## How does the Pluggable Analysis Framework work?
The `Pluggable Analysis Framework` is built around a DAG of `ColAnalysis` nodes that can depend on other summary stats, and provide one or more summary stats.  Nodes can be added to the DAG with `add_analysis`.  If a class with the same name is inserted into the DAG, the newly inserted node replaces the previous instantiation.  This all facilitates interactive development of analysis functions.  During execution, errors are caught and execution proceeds.  This is important because breaking the default dataframe mechanism is a show-stopping problem for users.



In [None]:
# Widget that the next cell extends with the `Variance` analysis.
w = buckaroo.BuckarooWidget(df, showCommands=False)
w

In [None]:
from buckaroo.pluggable_analysis_framework import (ColAnalysis)
class Variance(ColAnalysis):
    """Column analysis that adds a `variance` summary stat.

    Depends on the `mean` summary stat and provides `variance`, computed as
    the mean squared deviation from that mean (population variance).
    """
    provides_summary = ["variance"]
    requires_summary = ["mean"]

    @staticmethod
    def summary(sampled_ser, summary_ser, ser):
        """Return ``{'variance': value}`` for one column.

        Parameters
        ----------
        sampled_ser : unused here.
        summary_ser : mapping-like of already computed summary stats; only
            ``'mean'`` is read (assumed to be a scalar).
        ser : the column; its dtype decides whether variance is computed.

        Returns
        -------
        dict
            ``variance`` is a number for integer/float columns that have a
            usable mean, otherwise the string ``"NA"``.
        """
        mean = summary_ser.get('mean', False)

        # pd.isna catches pd.NA, None and *every* NaN value; an identity test
        # against np.nan misses NaNs that are not that exact singleton object
        # (e.g. a numpy float64 NaN).
        if mean is False or pd.isna(mean):
            return dict(variance="NA")

        is_int_or_float = (pd.api.types.is_integer_dtype(ser)
                           or pd.api.types.is_float_dtype(ser))
        if not is_int_or_float:
            return dict(variance="NA")

        # A mean of exactly 0 is a valid mean, so it must not be treated as missing.
        arr = ser.to_numpy()
        return dict(variance=np.mean((arr - mean)**2))
    summary_stats_display = [
        'dtype', 'length', 'nan_count', 'distinct_count', 'empty_count',
        'empty_per', 'unique_per', 'nan_per', 
        'is_numeric', 'is_integer', 'is_datetime',
        'mode', 'min', 'max', 'mean', 
        # we must add variance to the list of summary_stats_display, otherwise our new stat won't be displayed
        'variance']
# Register the new analysis on the widget created in the previous cell.
w.add_analysis(Variance)

The analysis is added interactively.  Toggle the summary stats view on the widget above and notice that `variance` has been added.

## Basic Unit testing is built in

Because there are so many corner cases with numerical code, every time a new summary stat is added, a variety of simple tests are run against it.  This lets you discover bugs earlier.

In [None]:
# First 500 rows and first 4 columns of the Citibike data.
small_df = df[:500][df.columns[:4]]
# we are going to create, but not display a BuckarooWidget here, we are looking at the error behavior
w = buckaroo.BuckarooWidget(small_df, showCommands=False)

# Deliberately re-defines `Variance` (same class name as the earlier cell) with a
# switchable bug, to demonstrate how errors in an analysis are reported.
class Variance(ColAnalysis):
    provides_summary = ["variance"]
    requires_summary = ["mean"]
    
    @staticmethod
    def summary(sampled_ser, summary_ser, ser):
        # `mean` falls back to False when the summary has no 'mean' entry
        mean = summary_ser.get('mean', False)
        arr = ser.to_numpy()
        #toggle SIMULATED_BUG to easily see behavior with and without a bug
        SIMULATED_BUG = True
        if SIMULATED_BUG:
            # Buggy on purpose: `in` falls back to `==` comparisons, and comparing
            # against pd.NA does not give a plain bool (the repro line quoted later in
            # the notebook reports "boolean value of NA is ambiguous").
            if mean in [pd.NA, np.nan, False]:
                return dict(variance="NA")
        else:
            # Identity checks never evaluate the truth value of an NA comparison.
            if mean is pd.NA or mean is np.nan or mean is False:
                return dict(variance="NA")
        # Both numeric branches compute the mean squared deviation from `mean`.
        if mean and pd.api.types.is_integer_dtype(ser):
            return dict(variance=np.mean((arr - mean)**2))
        elif mean and pd.api.types.is_float_dtype(ser):
            return dict(variance=np.mean((arr - mean)**2))
        return dict(variance="NA")
    
    summary_stats_display = [
        'dtype', 'length', 'nan_count', 'distinct_count', 'empty_count',
        'empty_per', 'unique_per', 'nan_per', 
        'is_numeric', 'is_integer', 'is_datetime',
        'mode', 'min', 'max', 'mean', 
        # we must add variance to the list of summary_stats_display, otherwise our new stat won't be displayed
        'variance']

# Register the analysis defined just above on the widget built at the top of this cell.
w.add_analysis(Variance)

## Reproducing errors in the notebook
Buckaroo printed reproduction instructions like
```
from buckaroo.analysis_management import PERVERSE_DF
Variance.summary(PERVERSE_DF['all_nan'], pd.Series({'mean': np.nan, }), PERVERSE_DF['all_nan']) # boolean value of NA is ambiguous

```

`PERVERSE_DF` is a DataFrame with all kinds of edge cases that normally trip up numerical code.  You can run the above two lines, and quickly start iterating on your `ColAnalysis` class to fix the error.  Normally ad hoc analysis code that iterates over a list of functions blows up in a stack trace referencing an anonymous function in the middle of a for loop called with opaque variables.  Buckaroo gives you a single line that can reproduce the error, with easily inspectable variables.

## Quiet mode
Sometimes you just want to get on with it.  Buckaroo has a setting for that too: set `quiet=True` and both unit test errors and regular processing errors will be silenced.  Not recommended, but if I didn't add it, users would write their own ad hoc version.

In [None]:
# Fresh widget over the small dataframe for the quiet-mode demonstration.
w = buckaroo.BuckarooWidget(small_df, showCommands=False)
# The functions below contain errors on purpose; MinDigits sets quiet = True so they are ignored.

def int_digits(n):
    """Count the characters needed for the integer part of ``n``.

    NaN and 0 both count as 1.  Negative numbers get one extra character for
    the minus sign.
    """
    if np.isnan(n) or n == 0:
        return 1
    if np.sign(n) != -1:
        return int(np.floor(np.log10(n)+1))
    # negative: digits of the magnitude, plus one for the sign
    magnitude_exponent = int(np.floor(np.log10(np.abs(n))))
    return magnitude_exponent + 2
class MinDigits(ColAnalysis):
    """Analysis providing `min_digits`: the digit count of a column's `min`.

    Non-numeric columns always report 0.  `quiet = True` is set on the class.
    """

    requires_summary = ["min"]
    provides_summary = ["min_digits"]
    quiet = True

    @staticmethod
    def summary(sampled_ser, summary_ser, ser):
        # Only numeric columns have a meaningful digit count for their minimum.
        if not pd.api.types.is_numeric_dtype(sampled_ser.dtype):
            return {'min_digits': 0}
        smallest = summary_ser.loc['min']
        return {'min_digits': int_digits(smallest)}
# Register MinDigits on the widget, then display the widget.
w.add_analysis(MinDigits)
w

# Adding a Command to the Low Code UI

In [None]:
# Widget over the first 500 rows with the commands UI shown; the next cell adds a command to it.
w = buckaroo.BuckarooWidget(df[:500], showCommands=True, autoType=False) #turning autoType=False to reduce clutter in the operations
w

In [None]:
from buckaroo.all_transforms import Command
from buckaroo.lispy import s
#Here we start adding commands to the Buckaroo Widget.  Every call to add_command replaces a command with the same name
@w.add_command
class GroupBy2(Command):
    """Low code UI command: group by one column and aggregate other columns.

    `transform` performs the operation on a dataframe; `transform_to_py`
    returns the equivalent Python source as text.  The two must stay in sync.
    """
    command_default = [s("groupby2"), s('df'), 'col', {}]
    command_pattern = [[3, 'colMap', 'colEnum', ['null', 'sum', 'mean', 'median', 'count']]]

    @staticmethod 
    def transform(df, col, col_spec):
        """Group `df` by `col` and aggregate the columns named in `col_spec`.

        `col_spec` maps column name -> aggregation name.  Only 'sum', 'mean',
        'median' and 'count' produce an output column; any other value
        (such as 'null') is skipped.  Returns a new DataFrame.
        """
        grps = df.groupby(col)
        df_contents = {}
        for k, v in col_spec.items():
            if v in ("sum", "mean", "median", "count"):
                # bind v as a default so the lambda calls this iteration's aggregation
                df_contents[k] = grps[k].apply(lambda x, agg=v: getattr(x, agg)())
        return pd.DataFrame(df_contents)

    @staticmethod 
    def transform_to_py(df, col, col_spec):
        """Return Python source lines (joined with newlines) mirroring `transform`.

        Every generated line is indented by four spaces.  All aggregations
        write into `df_contents`, the dict created by the second generated line.
        """
        commands = [
            "    grps = df.groupby('%s')" % col,
            "    df_contents = {}"
        ]
        for k, v in col_spec.items():
            if v in ("sum", "mean", "median", "count"):
                # The "sum" branch used to emit `paddydf_contents[...]`, a name the
                # generated code never defines; every branch now targets df_contents.
                commands.append(
                    "    df_contents['%s'] = grps['%s'].apply(lambda x: x.%s())" % (k, k, v))
        commands.append("    df = pd.DataFrame(df_contents)")
        return "\n".join(commands)


Note that `groupby2` has been added to the commands

# Buckaroo also works on polars

In [None]:
import polars as pl
# NOTE(review): this is an absolute path on the author's machine, so the cell only
# runs where that file exists -- point it at a local copy of the Citibike CSV.
pl.read_csv('/Users/paddy/code/citibike-play/2014-01 - Citi Bike trip data.csv')

In [None]:
# Presumably reports the installed package versions for troubleshooting -- TODO confirm against buckaroo docs.
buckaroo.debug_packages()

# Making a new default dataframe display function

In [None]:
from buckaroo.widget_utils import disable
from IPython.core.getipython import get_ipython
from IPython.display import display
import warnings

# Presumably turns off buckaroo's own default DataFrame display before the custom one below is installed -- TODO confirm.
disable()
def my_display_as_buckaroo(df):
    """Display `df` as a BuckarooWidget with an extra analysis added.

    Warnings raised while adding the analysis are suppressed only for that
    call; the caller's warning filters are restored afterwards.

    Returns whatever `display(w)` returns.
    """
    w  = BuckarooWidget(df, showCommands=False)
    # NOTE(review): `Skew` is not defined in any cell visible in this notebook;
    # define or import it before enabling this display function.
    #the analysis we added throws warnings, let's muffle that when used as the default display
    # catch_warnings restores the previous filter state on exit, instead of
    # overwriting the user's filters with 'default' after every display.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        w.add_analysis(Skew)
    return display(w)

def my_enable():
    """
    Register `my_display_as_buckaroo` as the IPython display function
    for every pandas DataFrame shown in the notebook.

    Prints a message and does nothing else when not running inside IPython.
    """
    shell = get_ipython()
    if shell is not None:
        formatter = shell.display_formatter.ipython_display_formatter
        formatter.for_type(pd.DataFrame, my_display_as_buckaroo)
    else:
        print("must be running inside ipython to enable default display via enable()")
# Install the custom DataFrame display defined above.
my_enable()