In [None]:
import pandas as pd
import datacompy

In [None]:
# Two small sample frames that share the key column 'a'.
# df_a carries columns a, b, c, e.
df_a = pd.DataFrame(
    dict(
        a=[1, 2, 3],
        b=[4, 5, 6],
        c=['foo', 'foo', 'bar'],
        e=[100, 10, 1],
    )
)

# df_b carries columns a, b, d, f: 'b' differs from df_a only in its last
# row (7 instead of 6), and the remaining columns are named 'd' and 'f'
# where df_a has 'c' and 'e'.
df_b = pd.DataFrame(
    dict(
        a=[1, 2, 3],
        b=[4, 5, 7],
        d=['foo', 'baz', 'baz'],
        f=[100, 10, 1],
    )
)

In [None]:
# Compare the two sample frames, matching rows on the shared key column.
# The key is spelled 'a' exactly as the column is labelled in both frames
# (and as the other Compare calls in this notebook spell it), so the join
# does not depend on datacompy normalising the case of column names.
comparison = datacompy.Compare(
    df_a, df_b,
    join_columns='a',  # Column to join DataFrames on
    abs_tol=0,  # Absolute tolerance
    rel_tol=0)  # Relative tolerance

In [None]:
# Generate the report
#print(comparison.report())
#comparison.column_stats

In [None]:
def col_join_dfs(df1, df2, cmp):
    """Lay the columns of two compared frames side by side in one DataFrame.

    Args:
        df1: first pandas DataFrame.
        df2: second pandas DataFrame.
        cmp: comparison object exposing ``df1_name``, ``df2_name`` and
            ``column_stats`` (an iterable of dicts with at least the keys
            ``'column'`` and ``'unequal_cnt'``).

    Returns:
        A 3-tuple ``(ret_df, column_config_overrides, eqs)``:

        * ``ret_df`` -- every column of ``df1`` followed by the columns that
          only ``df2`` has.  A column that has stats with a non-zero
          ``unequal_cnt`` is immediately followed by ``df2``'s version of it
          under the label ``"<col>|df2"``.
        * ``column_config_overrides`` -- ``{"<col>|df2": {'merge_rule':
          'hidden'}}`` for each such extra column.
        * ``eqs`` -- ``{col: {'unequality': value}}`` where ``value`` is the
          column's ``unequal_cnt`` when it has stats, otherwise
          ``cmp.df1_name`` / ``cmp.df2_name`` depending on which frame the
          column comes from.
    """
    df1_name = cmp.df1_name
    df2_name = cmp.df2_name

    # df1's columns first, then any df2 columns not already listed.
    col_order = df1.columns.to_list()
    for col in df2.columns:
        if col not in col_order:
            col_order.append(col)

    # Index the stats once instead of scanning the list for every column.
    # setdefault keeps the FIRST entry for a column, matching a linear scan.
    stats_by_col = {}
    for stat in cmp.column_stats:
        stats_by_col.setdefault(stat['column'], stat)

    eqs = {}
    for col in col_order:
        col_stat = stats_by_col.get(col)
        if col_stat is not None:
            eqs[col] = {'unequality': col_stat['unequal_cnt']}
        elif col in df1.columns:
            eqs[col] = {'unequality': df1_name}
        else:
            eqs[col] = {'unequality': df2_name}

    ret_df_columns = {}
    column_config_overrides = {}

    for col in col_order:
        eq_col = eqs[col]['unequality']
        if eq_col == df1_name:
            # it's only in df1
            ret_df_columns[col] = df1[col]
        elif eq_col == df2_name:
            # it's only in df2
            ret_df_columns[col] = df2[col]
        elif eq_col == 0:
            # columns are exactly the same
            ret_df_columns[col] = df1[col]
        else:
            ret_df_columns[col] = df1[col]
            # "|df2" is a magic suffix; not ideal, but unlikely to collide
            # with a real column label.  The f-string (rather than `+`) keeps
            # this working for non-string column labels.
            df2_col_name = f"{col}|df2"
            ret_df_columns[df2_col_name] = df2[col]
            column_config_overrides[df2_col_name] = {'merge_rule': 'hidden'}

    ret_df = pd.DataFrame(ret_df_columns)
    return ret_df, column_config_overrides, eqs


In [None]:
from buckaroo.pluggable_analysis_framework.pluggable_analysis_framework import ColAnalysis
from buckaroo import BuckarooWidget
from buckaroo.dataflow.dataflow_extras import (
    merge_sds, exception_protect)
from traitlets import observe

def DatacompyBuckaroo(df1, df2):
    """Build a Buckaroo widget that shows ``df1`` and ``df2`` merged column-wise.

    The two frames are compared with ``datacompy.Compare`` (joined on column
    ``'a'``, zero absolute and relative tolerance).  The comparison drives the
    side-by-side column layout produced by ``col_join_dfs`` and backs five
    post-processing views registered on the widget class: "Df Headers",
    "Column Summary", "Row Summary", "Column Matching" and "Match Stats".

    Args:
        df1: first pandas DataFrame (joined on its ``'a'`` column).
        df2: second pandas DataFrame (joined on its ``'a'`` column).

    Returns:
        An instance of the locally defined ``DatacompyBuckarooWidget``
        wrapping the joined frame.
    """
    cmp = datacompy.Compare(
        df1, df2,
        join_columns='a',  # Column to join DataFrames on
        abs_tol=0,  # Absolute tolerance
        rel_tol=0)  # Relative tolerance

    def get_df_header(cmp):
        """Return a transposed frame of name, column count and row count per input."""
        df_header = pd.DataFrame({
            "DataFrame": [cmp.df1_name, cmp.df2_name],
            "Columns": [cmp.df1.shape[1], cmp.df2.shape[1]],
            "Rows": [cmp.df1.shape[0], cmp.df2.shape[0]]}) #, index=['a', 'b'])
        return df_header.T

    def column_summary(cmp):
        """Return a transposed one-row frame describing shared and unique columns."""
        df1_name = cmp.df1_name
        df2_name = cmp.df2_name
        # NOTE(review): `.items` is read as an attribute, not called --
        # presumably the backing list of an ordered-set type; verify against
        # the installed datacompy version.
        col_df = pd.DataFrame({
            "Columns in common":[len(cmp.intersect_columns())],
            f"Columns in {df1_name} not in {df2_name}":[ cmp.df1_unq_columns().items],
            f"Columns in {df2_name} not in {df1_name}":[ cmp.df2_unq_columns().items]})
        return col_df.T

    def row_summary(cmp):
        """Return a transposed one-row frame of row-level match counts."""
        # write pad arr function to pad array to number of join columns
        match_criteria = "index"
        if not cmp.on_index:
            match_criteria = ", ".join(cmp.join_columns)
        # These are needed for both index and column joins, so they must be
        # bound outside the `if` above; otherwise an index join hits NameError.
        has_dupes = cmp._any_dupes
        df1_name = cmp.df1_name
        df2_name = cmp.df2_name

        row_df = pd.DataFrame({
            "Matched On": [match_criteria],
            "Any Duplicates on match values": [has_dupes],
            "Number of rows in common": [cmp.intersect_rows.shape[0]],
            f"Number of rows in {df1_name} but not in {df2_name}": [cmp.df1_unq_rows.shape[0]],
            f"Number of rows in {df2_name} but not in {df1_name}": [cmp.df2_unq_rows.shape[0]],
            "Number of rows with some compared columns unequal": [cmp.intersect_rows.shape[0] - cmp.count_matching_rows()],
            "Number of rows with all compared columns equal": [cmp.count_matching_rows()]
        })
        return row_df.T

    def column_matching(cmp):
        """Return a transposed one-row frame of column-level match counts."""
        unequal_count = len([col for col in cmp.column_stats if col["unequal_cnt"] > 0])
        equal_count = len([col for col in cmp.column_stats if col["unequal_cnt"] == 0])
        total_unequal_count = sum(col["unequal_cnt"] for col in cmp.column_stats)

        col_df = pd.DataFrame({
            "Number of columns compared with some values unequal": [unequal_count],
            "Number of columns with all values equal": [equal_count],
            "Total number of values which compare unequal": [total_unequal_count]})
        return col_df.T

    def match_stats(cmp, sample_count=10):
        """Return a transposed frame with one column per mismatching compared column.

        ``sample_count`` is currently unused; it is kept so existing callers
        that pass it keep working.
        """
        stat_columns = [
            "Column",
            f"{cmp.df1_name} dtype",
            f"{cmp.df2_name} dtype",
            "# Unequal",
            "Max Diff",
            "# Null Diff"]
        rows = []
        for column in cmp.column_stats:
            if not column["all_match"]:
                rows.append({
                    "Column": column["column"],
                    f"{cmp.df1_name} dtype": column["dtype1"],
                    f"{cmp.df2_name} dtype": column["dtype2"],
                    "# Unequal": column["unequal_cnt"],
                    "Max Diff": column["max_diff"],
                    "# Null Diff": column["null_diff"]})

        # Passing `columns` guarantees the "Column" label exists even when no
        # column mismatches, so sorting an empty result does not raise KeyError.
        df_match_stats = pd.DataFrame(rows, columns=stat_columns)
        df_match_stats = df_match_stats.sort_values("Column")
        return df_match_stats.T

    class DfHeader(ColAnalysis):
        """Post-processing view that replaces the frame with the header summary."""
        @classmethod
        def post_process_df(kls, df):
            ab = get_df_header(cmp)
            # NOTE(review): the key here is the string '0', while the
            # transposed header frame is built without explicit column labels
            # -- confirm which label type buckaroo expects.
            return [ab, {
        '0': {'_type': 'obj', 'column_config_override': {'displayer_args': {'displayer': 'obj'}}},
        #'index': {'_type': 'obj', 'column_config_override': {'displayer_args': {'displayer': 'obj'}}}}
            }
            ]
        post_processing_method = "Df Headers"


    class ColumnSummary(ColAnalysis):
        """Post-processing view that replaces the frame with the column summary."""
        @classmethod
        def post_process_df(kls, df):
            col_summary_df = column_summary(cmp)
            return [col_summary_df, {}]
        post_processing_method = "Column Summary"

    class RowSummary(ColAnalysis):
        """Post-processing view that replaces the frame with the row summary."""
        @classmethod
        def post_process_df(kls, df):
            return [row_summary(cmp), {}]
        post_processing_method = "Row Summary"

    class ColumnMatching(ColAnalysis):
        """Post-processing view that replaces the frame with column match counts."""
        @classmethod
        def post_process_df(kls, df):
            return [column_matching(cmp), {}]
        post_processing_method = "Column Matching"

    class MatchStats(ColAnalysis):
        """Post-processing view that replaces the frame with per-column mismatch stats."""
        @classmethod
        def post_process_df(kls, df):
            return [match_stats(cmp), {}]
        post_processing_method = "Match Stats"

    # write class that automatically re-runs styling analysis on post_processed_df
    # that way if post_processed_df has different column names then the default dataframe
    # the new column names are displayed,  tailor made for this situation
    # ... or these should be different pinned rows
    # nope pinned rows don't work, because then we'd have to change column names still, or have
    # a bunch of empty columns

    datacompy_post_processing_klasses = [
        DfHeader, ColumnSummary, RowSummary, ColumnMatching, MatchStats]

    # Copy so the extension does not mutate BuckarooWidget's own class list.
    base_a_klasses = BuckarooWidget.analysis_klasses.copy()
    base_a_klasses.extend(datacompy_post_processing_klasses)
    class DatacompyBuckarooWidget(BuckarooWidget):
        """BuckarooWidget with the datacompy views and an extra ``init_sd`` layer."""
        analysis_klasses = base_a_klasses


        #the following should move to
        def __init__(self, orig_df, debug=False,
                     column_config_overrides=None,
                     pinned_rows=None, extra_grid_config=None,
                     component_config=None, init_sd=None):
            # init_sd is stored before the parent constructor runs because
            # _merged_sd below reads it.
            if init_sd is None:
                self.init_sd = {}
            else:
                self.init_sd = init_sd
            super().__init__(
                orig_df, debug, column_config_overrides, pinned_rows, extra_grid_config, component_config)

        @observe('summary_sd')
        @exception_protect('merged_sd-protector')
        def _merged_sd(self, change):
            #slightly inconsistent that processed_sd gets priority over
            #summary_sd, given that processed_df is computed first. My
            #thinking was that processed_sd has greater total knowledge
            #and should supersede summary_sd.
            self.merged_sd = merge_sds(
                self.init_sd, self.cleaned_sd, self.summary_sd, self.processed_sd)

    joined_df, column_config_overrides, init_sd = col_join_dfs(df1, df2, cmp)

    # Hand-written sample histogram stats.  NOTE: histogram_sd and full_init_sd
    # are currently not passed to the widget (init_sd is commented out in the
    # constructor call below).
    histogram_sd =  {
        'a': {'h3': {'histogram': [{'name': 'NA', 'NA': 100.0}]}},
        'b': {'histogram': {'histogram': [{'name': 1, 'cat_pop': 44.0}, {'name': 'NA', 'NA': 56.0}]}}}

    full_init_sd = merge_sds(
        {'index':{}}, # we want to make sure index is the first column recognized by buckaroo
        init_sd, histogram_sd)


    dcbw = DatacompyBuckarooWidget(
        joined_df, column_config_overrides=column_config_overrides, # init_sd=full_init_sd,
        pinned_rows=[
        {'primary_key_val': 'dtype',           'displayer_args': {'displayer': 'obj'}},
        {'primary_key_val': 'histogram',       'displayer_args': {'displayer': 'histogram'}},
        #{'primary_key_val': 'histogram',       'displayer_args': {'displayer': 'histogram'}},

        {'primary_key_val': 'unequality',      'displayer_args': {'displayer': 'obj'}},
        {'primary_key_val': 'h3', 'displayer_args': {'displayer': 'histogram'}}]

    )

    return dcbw


In [None]:
# Build the comparison widget for the two sample frames and show it as the
# cell's output (the bare `dcbw` on the last line is the displayed value).
dcbw = DatacompyBuckaroo(df_a, df_b)
dcbw

In [None]:
# Module-level comparison of the two sample frames, joined on column 'a'
# with zero absolute and relative tolerance.
cmp = datacompy.Compare(
        df_a, df_b,
        join_columns='a',  # Column to join DataFrames on
        abs_tol=0,  # Absolute tolerance
        rel_tol=0)  # Relative tolerance

In [None]:
from buckaroo.customizations.styling import (DefaultMainStyling)
class MergingMainStylingAnalysis(DefaultMainStyling):
    """Main styling that omits columns whose ``merge_rule`` is ``'hidden'``."""

    @classmethod
    def style_columns(kls, sd):
        """Build the display config for the columns described by ``sd``.

        Args:
            sd: mapping of column name to that column's metadata dict.  A
                column is skipped when its metadata has ``merge_rule ==
                'hidden'``, or when its style has that value after the
                metadata's ``column_config_override`` (if any) is applied.

        Returns:
            A dict with the keys ``'pinned_rows'``, ``'column_config'``,
            ``'extra_grid_config'`` and ``'component_config'``;
            ``'column_config'`` holds one style dict per visible column and
            the other three are taken from the class attributes of the same
            name.
        """
        ret_col_config = []

        #this is necessary for polars to add an index column, which is
        #required so that summary_stats makes sense
        if 'index' not in sd:
            ret_col_config.append({'col_name': 'index', 'displayer_args': {'displayer': 'obj'}})

        for col in sd.keys():
            col_meta = sd[col]
            if col_meta.get('merge_rule') == 'hidden':
                continue
            base_style = kls.style_column(col, col_meta)
            if 'column_config_override' in col_meta:
                #column_config_override, sent by the instantiation, gets set later
                base_style.update(col_meta['column_config_override'])
            # The override itself may carry the hidden marker, so check again
            # after it has been merged into the style.
            if base_style.get('merge_rule') == 'hidden':
                continue

            ret_col_config.append(base_style)

        return {
            'pinned_rows': kls.pinned_rows,
            'column_config': ret_col_config,
            'extra_grid_config': kls.extra_grid_config,
            'component_config': kls.component_config
        }

In [None]:
from buckaroo.customizations.analysis import (TypingStats, ComputedDefaultSummaryStats, DefaultSummaryStats)
from buckaroo.customizations.histogram import (Histogram)
from buckaroo.customizations.styling import (DefaultSummaryStatsStyling, DefaultMainStyling)
from buckaroo.dataflow.dataflow import StylingAnalysis
# Analysis classes used as the starting list for the comparison widget, with
# MergingMainStylingAnalysis in place of the commented-out DefaultMainStyling.
# NOTE(review): DefaultSummaryStats is listed twice -- confirm whether the
# duplicate is intentional or harmless to buckaroo.
base_analysis_klasses = [
    TypingStats, DefaultSummaryStats,
    Histogram, ComputedDefaultSummaryStats,
    StylingAnalysis, DefaultSummaryStats,
    DefaultSummaryStatsStyling, 
    MergingMainStylingAnalysis
    #DefaultMainStyling
]


In [None]:
def DatacompyBuckaroo(df1, df2):
    """Build a Buckaroo widget for ``df1``/``df2`` with a "Df Headers" view.

    The frames are compared with ``datacompy.Compare`` (joined on column
    ``'a'``, zero absolute and relative tolerance), laid out side by side with
    ``col_join_dfs``, and wrapped in a widget whose analysis classes are
    ``base_analysis_klasses`` plus ``DfHeader`` and
    ``MergingMainStylingAnalysis``.

    Args:
        df1: first pandas DataFrame (joined on its ``'a'`` column).
        df2: second pandas DataFrame (joined on its ``'a'`` column).

    Returns:
        An instance of the locally defined ``DatacompyBuckarooWidget``.
    """
    # Compare the frames that were actually passed in, so the result does not
    # depend on a notebook-global `cmp` left behind by some other cell.
    cmp = datacompy.Compare(
        df1, df2,
        join_columns='a',  # Column to join DataFrames on
        abs_tol=0,  # Absolute tolerance
        rel_tol=0)  # Relative tolerance

    def get_df_header(cmp):
        """Return a transposed frame of name, column count and row count per input."""
        df_header = pd.DataFrame({
            "DataFrame": [cmp.df1_name, cmp.df2_name],
            "Columns": [cmp.df1.shape[1], cmp.df2.shape[1]],
            "Rows": [cmp.df1.shape[0], cmp.df2.shape[0]]}) #, index=['a', 'b'])
        return df_header.T

    class DfHeader(ColAnalysis):
        """Post-processing view that swaps the frame for the header summary."""
        @classmethod
        def post_process_df(kls, df):
            ab = get_df_header(cmp)
            #create re-run styling helper function that removes all old columns and re-adds
            # Mark every column of the incoming frame as hidden, then describe
            # the header frame's columns (labelled 0 and 1) as plain objects.
            base_removes = {k: {'merge_rule': 'hidden'} for k in df.columns}
            base_removes.update(
                {
        0: {'_type': 'obj', 'column_config_override': {'displayer_args': {'displayer': 'obj'}}},
        1: {'_type': 'obj', 'column_config_override': {'displayer_args': {'displayer': 'obj'}}}})

            return [ab, base_removes]
        post_processing_method = "Df Headers"

    # Copy so the module-level list is not mutated on every call.
    # NOTE(review): MergingMainStylingAnalysis is already an element of
    # base_analysis_klasses, so it ends up in the list twice -- confirm intent.
    base_a_klasses = base_analysis_klasses.copy()
    base_a_klasses.extend([DfHeader, MergingMainStylingAnalysis])
    class DatacompyBuckarooWidget(BuckarooWidget):
        """BuckarooWidget using the extended analysis class list."""
        analysis_klasses = base_a_klasses
    joined_df, column_config_overrides, init_sd = col_join_dfs(df1, df2, cmp)
    dcbw = DatacompyBuckarooWidget(joined_df, column_config_overrides=column_config_overrides)
    return dcbw
# Display a widget built by the DatacompyBuckaroo defined just above.
# NOTE(review): the result is shown but not bound to a name; the cells that
# follow read `dcbw`, which was assigned by an earlier call -- confirm whether
# this line should rebind it.
DatacompyBuckaroo(df_a, df_b)


In [None]:
# Inspect the 'main' entry of the widget's df_display_args.
dcbw.df_display_args['main']

In [None]:
# Inspect the 'histogram' entry recorded for column 'a' in the widget's merged_sd.
dcbw.merged_sd['a']['histogram']

In [None]:
# Inspect element 1 of the 'all_stats' entry in the widget's df_data_dict.
dcbw.df_data_dict['all_stats'][1]

In [None]:
# Inspect element 25 of the 'all_stats' entry in the widget's df_data_dict.
# NOTE(review): 25 is a hard-coded position -- confirm it still points at the
# intended entry.
dcbw.df_data_dict['all_stats'][25]

In [None]:
from buckaroo.customizations.styling import DefaultMainStyling
class CompareStylingAnalysis(DefaultMainStyling):
    @classmethod
    def style_columns(kls, sd):
        """
        ret_col_config = []

        #this is necessary for polars to add an index column, which is
        #required so that summary_stats makes sense
        if 'index' not in sd:
            ret_col_config.append({'col_name': 'index', 'displayer_args': {'displayer': 'obj'}})
            
        for col in sd.keys():
            col_meta = sd[col]
            base_style = kls.style_column(col, col_meta)
            if 'column_config_override' in col_meta:
                base_style.update(col_meta['column_config_override'])
            ret_col_config.append(base_style)

        return {
            'pinned_rows': kls.pinned_rows,
            'column_config': ret_col_config,
            'extra_grid_config': kls.extra_grid_config,
            'component_config': kls.component_config
        }
        """
        retval = 
