In [None]:
%pip install buckaroo[polars]
try:
    from google.colab import output
    output.enable_custom_widget_manager()
except Exception as e:
    print(e)

In [None]:
import numpy as np
import pandas as pd

## A simple DataFrame in Pandas

In [None]:
# Small two-column integer frame used as the first example.
num_df = pd.DataFrame(dict(
    a=[111_111, 77_777, 777_777, 1_000_000, 2_111_111, 1_235_999],
    b=[111_111, 555_555, 0, 28_123, 482_388, 5_666],
))
num_df

In [None]:
# The only change from the previous cell is the buckaroo import; the same
# frame is then displayed again for comparison.
# NOTE(review): presumably importing buckaroo registers it as the DataFrame
# display in the notebook — confirm against the buckaroo docs.
import buckaroo
num_df

# Demonstrating Buckaroo on Citibike data.
This might take a little while to download.

*Once the view loads, click the 0s and 1s at the top left to toggle different parts of the UI.*

In [None]:
# Read the Citibike sample CSV (2016-04) directly from GitHub; needs network access.
df = pd.read_csv("https://github.com/paddymul/buckaroo-data/raw/main/cb_data/2016-04.csv")
# Disabled alternative: the Parquet file at the sibling path (presumably the same data).
#df = pd.read_parquet("https://github.com/paddymul/buckaroo-data/raw/main/cb_data/2016-04.parq")
# Last expression of the cell, so the frame is displayed.
df

## Histograms

Histograms are built into Buckaroo. They let users quickly identify the distribution of data in each column.

In [None]:
#these are some utility functions for generating random distributions
#execute and ignore this cell
import numpy as np 
def bimodal(mean_1, mean_2, N, sigma=5):
    """Draw ``N`` samples from an equal mix of two normal distributions.

    Args:
        mean_1: Mean of the first normal component.
        mean_2: Mean of the second normal component.
        N: Total number of samples to return.
        sigma: Standard deviation shared by both components.

    Returns:
        A 1-D numpy array of length ``N``: the draws from the first
        component followed by the draws from the second.
    """
    total = int(N)
    # Split with integer arithmetic so that an odd N still yields exactly N
    # samples; the second component takes the extra one.
    first_count = total // 2
    second_count = total - first_count
    X1 = np.random.normal(mean_1, sigma, first_count)
    X2 = np.random.normal(mean_2, sigma, second_count)
    return np.concatenate([X1, X2])

def rand_cat(named_p, na_per, N):
    """Sample a list of named values and ``pd.NA`` according to given weights.

    Args:
        named_p: Mapping of value -> fraction of ``N`` that value should occupy.
        na_per: Fraction of ``N`` that should be ``pd.NA``.
        N: Size of the full column these fractions refer to.

    Returns:
        A list of ``floor((sum(named_p.values()) + na_per) * N)`` items, each
        drawn independently from the keys of ``named_p`` plus ``pd.NA``, with
        probabilities proportional to the requested fractions. Returns an
        empty list when the fractions do not sum to a positive number.
    """
    named_total_per = sum(named_p.values()) + na_per
    if named_total_per <= 0:
        return []
    total_len = int(np.floor(named_total_per * N))

    # pd.NA is always the last choice; its weight is simply 0 when na_per is 0.
    choices = list(named_p) + [pd.NA]
    p = [v / named_total_per for v in named_p.values()]
    p.append(na_per / named_total_per)

    # Draw every index in one vectorised call, then map back to the original
    # Python objects so no value is coerced by a numpy array conversion.
    picks = np.random.choice(len(choices), size=total_len, p=p)
    return [choices[i] for i in picks]

def random_categorical(named_p, unique_per, na_per, longtail_per, N):
    """Build a shuffled Series mixing named, NA, long-tail and unique values.

    Args:
        named_p: Mapping of value -> fraction of ``N``, forwarded to ``rand_cat``.
        unique_per: Accepted for readability at call sites but not used here;
            the number of unique values is whatever remains of ``N`` after the
            other parts are generated.
        na_per: Fraction of ``N`` that should be ``pd.NA``, forwarded to ``rand_cat``.
        longtail_per: Fraction of ``N`` filled with ``long_<i>`` labels, each
            of which appears exactly twice.
        N: Target number of values.

    Returns:
        A ``pd.Series`` with dtype ``UInt64`` when the values convert to it,
        otherwise with ``pd.StringDtype()``.
    """
    choice_arr = rand_cat(named_p, na_per, N)

    # Every long-tail label is emitted twice, hence the halved count.
    longtail_count = int(np.floor(longtail_per * N)) // 2
    extra_arr = []
    for i in range(longtail_count):
        label = "long_%d" % i
        extra_arr.extend([label, label])

    # Pad up to N with labels that occur exactly once (none if already full).
    unique_len = N - (len(extra_arr) + len(choice_arr))
    extra_arr.extend("unique_%d" % i for i in range(unique_len))

    all_arr = np.concatenate([choice_arr, extra_arr])
    np.random.shuffle(all_arr)
    try:
        return pd.Series(all_arr, dtype='UInt64')
    except Exception:
        # Non-numeric values cannot become UInt64; fall back to strings.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return pd.Series(all_arr, dtype=pd.StringDtype())
# Row count used for the synthetic columns generated in the cells below.
N = 4000

## Common histogram shapes
The following shows the most common shapes you will see in histograms, allowing you to quickly identify patterns.

Notice the three columns on the right. Those are categorical histograms, as opposed to numerical histograms.

In [None]:
import buckaroo

# One column per common histogram shape: four numeric columns followed by
# two generated categorical columns.
pd.DataFrame(dict(
    normal=np.random.normal(25, .3, N),
    exponential=np.random.exponential(1.0, N) * 10,
    increasing=list(range(N)),
    one=[1] * N,
    dominant_categories=random_categorical(
        {'foo': .6, 'bar': .25, 'baz': .15},
        unique_per=0, na_per=0, longtail_per=0, N=N),
    all_unique_cat=random_categorical(
        {}, unique_per=1, na_per=0, longtail_per=0, N=N),
))

## Categorical histograms
Categorical histograms have special colors and patterns for `NA`/`NaN`, `longtail` (values that occur at least twice), and `unique`.
Categorical histograms are always arranged from most frequent on the left to least frequent on the right.

When a column is numerical but has fewer than 5 distinct values, it is displayed with a categorical histogram, because the numbers were probably flags.

In [None]:
# Columns covering the categorical histogram cases: all NA, partly NA,
# a few dominant values, long-tail values, and long-tail mixed with uniques.
pd.DataFrame(dict(
    all_NA=pd.Series([pd.NA] * N, dtype='UInt8'),
    half_NA=random_categorical(
        {1: .55}, unique_per=0, na_per=.45, longtail_per=.0, N=N),
    dominant_categories=random_categorical(
        {'foo': .45, 'bar': .2, 'baz': .15},
        unique_per=.2, na_per=0, longtail_per=0, N=N),
    longtail=random_categorical(
        {}, unique_per=0, na_per=.2, longtail_per=.8, N=N),
    longtail_unique=random_categorical(
        {}, unique_per=0.5, na_per=.0, longtail_per=.5, N=N),
))

## Notice the different distributions of the numeric histograms

In [None]:
# Five numeric columns, each drawn from a different distribution.
pd.DataFrame(dict(
    bimodal=bimodal(20, 40, N),
    exponential=np.random.exponential(1.0, N) * 10,
    geometric=np.random.geometric(.2, N) * 10,
    log_normal=np.random.lognormal(25, .3, N),
    normal=np.random.normal(25, .3, N),
))

# Buckaroo also works on Polars

In [None]:
import polars as pl

# Load the Citibike Parquet file (2016-04) from GitHub into a Polars DataFrame.
# Note: this rebinds `df`, which previously held the pandas frame.
df = pl.read_parquet("https://github.com/paddymul/buckaroo-data/raw/main/cb_data/2016-04.parq")
# End the cell with the frame so it is actually displayed, as in the pandas cells.
df

In [None]:
# NOTE(review): presumably reports the installed package versions, for use in
# bug reports — confirm against the buckaroo docs.
buckaroo.debug_packages()