In [None]:
import pandas as pd
import datacompy

In [None]:
# Sample inputs: two three-row frames that share the key column 'A'.
df_a = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

# df_b is the same as df_a except for the last value of 'B' (7 instead of 6).
df_b = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 7]})

In [None]:
# Compare the two sample frames, joined on column 'A', with both the absolute
# and the relative tolerance set to zero.
comparison = datacompy.Compare(df_a, df_b, join_columns='A', abs_tol=0, rel_tol=0)

In [None]:
# Generate the comparison report and print the text it returns.
print(comparison.report())

In [None]:
def get_df_header(df1, df2, comparison):
    """Build a two-row table describing the frames being compared.

    Args:
        df1: First DataFrame; its shape supplies the first row's counts.
        df2: Second DataFrame; its shape supplies the second row's counts.
        comparison: Object exposing the ``df1_name`` and ``df2_name`` labels.

    Returns:
        DataFrame with the columns "DataFrame", "Columns" and "Rows", holding
        one row per input frame.
    """
    frames = (df1, df2)
    return pd.DataFrame({
        "DataFrame": [comparison.df1_name, comparison.df2_name],
        "Columns": [frame.shape[1] for frame in frames],
        "Rows": [frame.shape[0] for frame in frames]})
# Preview the header table for the sample comparison (shown as the cell output).
get_df_header(comparison.df1, comparison.df2, comparison)

In [None]:
"""
report += render(
            "column_summary.txt",
            len(self.intersect_columns()),
            f"{len(self.df1_unq_columns())} {self.df1_unq_columns().items}",
            f"{len(self.df2_unq_columns())} {self.df2_unq_columns().items}",
            self.df1_name,
            self.df2_name,
        )
Column Summary
--------------

Number of columns in common: {0}
Number of columns in {3} but not in {4}: {1}
Number of columns in {4} but not in {3}: {2}
"""        

In [None]:
def column_summary(cmp):
    """Summarise how the column sets of the two compared frames overlap.

    Args:
        cmp: Comparison object providing ``df1_name``, ``df2_name``,
            ``intersect_columns()``, ``df1_unq_columns()`` and
            ``df2_unq_columns()``; the last two results must expose ``.items``.

    Returns:
        Transposed one-row DataFrame: one labelled row per statistic and a
        single value column ``0``.
    """
    first, second = cmp.df1_name, cmp.df2_name
    shared_count = len(cmp.intersect_columns())
    only_in_first = cmp.df1_unq_columns().items
    only_in_second = cmp.df2_unq_columns().items

    summary = {
        "Columns in common": [shared_count],
        f"Columns in {first} not in {second}": [only_in_first],
        f"Columns in {second} not in {first}": [only_in_second],
    }
    return pd.DataFrame(summary).T
# Preview the column summary for the sample comparison (shown as the cell output).
column_summary(comparison)

In [None]:
#row_summary.txt
"""
Row Summary
-----------

Matched on: {0}
Any duplicates on match values: {10}
Absolute Tolerance: {1}
Relative Tolerance: {2}
Number of rows in common: {3:,}
Number of rows in {8} but not in {9}: {4:,}
Number of rows in {9} but not in {8}: {5:,}

Number of rows with some compared columns unequal: {6:,}
Number of rows with all compared columns equal: {7:,}
"""

In [None]:
def row_summary(cmp):
    """Tabulate row-level match statistics for a comparison.

    Args:
        cmp: Comparison object providing ``on_index``, ``join_columns``,
            ``_any_dupes``, ``df1_name``, ``df2_name``, ``intersect_rows``,
            ``df1_unq_rows``, ``df2_unq_rows`` and ``count_matching_rows()``.

    Returns:
        Transposed one-row DataFrame: one labelled row per statistic and a
        single value column ``0``.
    """
    # TODO (carried over): write a pad-array helper to pad arrays to the
    # number of join columns.
    matched_on = "index" if cmp.on_index else ", ".join(cmp.join_columns)
    duplicates_present = cmp._any_dupes
    first, second = cmp.df1_name, cmp.df2_name

    # Every value is wrapped in a one-element list so the frame has one row.
    summary = {
        "Matched On": [matched_on],
        "Any Duplicates on match values": [duplicates_present],
        "Number of rows in common": [cmp.intersect_rows.shape[0]],
        f"Number of rows in {first} but not in {second}": [cmp.df1_unq_rows.shape[0]],
        f"Number of rows in {second} but not in {first}": [cmp.df2_unq_rows.shape[0]],
        "Number of rows with some compared columns unequal": [
            cmp.intersect_rows.shape[0] - cmp.count_matching_rows()],
        "Number of rows with all compared columns equal": [cmp.count_matching_rows()],
    }
    return pd.DataFrame(summary).T
# Preview the row summary for the sample comparison (shown as the cell output).
row_summary(comparison)
    

In [None]:
        # Column Matching
        #
        # Reference excerpt, kept as comments so this cell no longer raises
        # IndentationError when the notebook is run top to bottom. It appears to
        # be copied from the comparison library's report-building code (it uses
        # `self`, `report` and `render`, none of which exist in this notebook);
        # `column_matching` in this notebook computes the same three counts.
        #
        # report += render(
        #     "column_comparison.txt",
        #     len([col for col in self.column_stats if col["unequal_cnt"] > 0]),
        #     len([col for col in self.column_stats if col["unequal_cnt"] == 0]),
        #     sum(col["unequal_cnt"] for col in self.column_stats),)

In [None]:
def column_matching(cmp):
    """Count matching and mismatching columns in a comparison.

    Args:
        cmp: Comparison object whose ``column_stats`` is an iterable of
            mappings, each carrying an ``"unequal_cnt"`` entry.

    Returns:
        Transposed one-row DataFrame: one labelled row per statistic and a
        single value column ``0``.
    """
    unequal_counts = [stat["unequal_cnt"] for stat in cmp.column_stats]
    columns_with_diffs = sum(1 for count in unequal_counts if count > 0)
    columns_all_equal = sum(1 for count in unequal_counts if count == 0)

    summary = {
        "Number of columns compared with some values unequal": [columns_with_diffs],
        "Number of columns with all values equal": [columns_all_equal],
        "Total number of values which compare unequal": [sum(unequal_counts)],
    }
    return pd.DataFrame(summary).T
# Preview the column-matching counts for the sample comparison (shown as the cell output).
column_matching(comparison)

In [None]:
        # Reference excerpt, kept as comments so this cell no longer raises
        # IndentationError when the notebook is run top to bottom. It appears to
        # be copied from the comparison library's report-building code (it uses
        # `self`, `report` and `sample_count`, none of which exist in this
        # notebook); `match_stats` in this notebook is modelled on it.
        #
        # match_stats = []
        # match_sample = []
        # any_mismatch = False
        # for column in self.column_stats:
        #     if not column["all_match"]:
        #         any_mismatch = True
        #         match_stats.append(
        #             {
        #                 "Column": column["column"],
        #                 f"{self.df1_name} dtype": column["dtype1"],
        #                 f"{self.df2_name} dtype": column["dtype2"],
        #                 "# Unequal": column["unequal_cnt"],
        #                 "Max Diff": column["max_diff"],
        #                 "# Null Diff": column["null_diff"],
        #             }
        #         )
        #         if column["unequal_cnt"] > 0:
        #             match_sample.append(
        #                 self.sample_mismatch(
        #                     column["column"], sample_count, for_display=True
        #                 )
        #             )
        #
        # if any_mismatch:
        #     report += "Columns with Unequal Values or Types\n"
        #     report += "------------------------------------\n"
        #     report += "\n"
        #     df_match_stats = pd.DataFrame(match_stats)
        #     df_match_stats.sort_values("Column", inplace=True)
        #     # Have to specify again for sorting
        #     report += df_match_stats[
        #         [
        #             "Column",
        #             f"{self.df1_name} dtype",
        #             f"{self.df2_name} dtype",
        #             "# Unequal",
        #             "Max Diff",
        #             "# Null Diff",
        #         ]
        #     ].to_string()

In [None]:
def match_stats(cmp, sample_count=10):
    """Tabulate per-column statistics for columns that do not fully match.

    Args:
        cmp: Comparison object providing ``df1_name``, ``df2_name`` and
            ``column_stats``, an iterable of mappings with the keys
            ``"all_match"``, ``"column"``, ``"dtype1"``, ``"dtype2"``,
            ``"unequal_cnt"``, ``"max_diff"`` and ``"null_diff"``.
        sample_count: Unused. Kept so callers that pass it keep working; the
            mismatch samples it used to size were collected but never returned.

    Returns:
        Transposed DataFrame with one row per statistic and one column per
        mismatching source column, ordered by column name. When every column
        matches, the statistic rows are still present and there are no columns.
    """
    stat_labels = [
        "Column",
        f"{cmp.df1_name} dtype",
        f"{cmp.df2_name} dtype",
        "# Unequal",
        "Max Diff",
        "# Null Diff",
    ]
    # Keys of each column_stats entry, in the same order as stat_labels.
    stat_keys = ["column", "dtype1", "dtype2", "unequal_cnt", "max_diff", "null_diff"]

    mismatched = [
        {label: column[key] for label, key in zip(stat_labels, stat_keys)}
        for column in cmp.column_stats
        if not column["all_match"]
    ]
    if not mismatched:
        # A frame built from no records has no "Column" column, so
        # sort_values("Column") would raise KeyError. Return an empty table
        # that still carries the statistic rows.
        return pd.DataFrame(index=stat_labels)
    return pd.DataFrame(mismatched).sort_values("Column").T
# Preview the per-column mismatch statistics for the sample comparison (shown as the cell output).
match_stats(comparison)

In [None]:
import pandas as pd
import datacompy
from buckaroo.pluggable_analysis_framework.pluggable_analysis_framework import ColAnalysis
from buckaroo import BuckarooWidget


def DatacompyBuckaroo(df1, df2, join_columns='A', abs_tol=0, rel_tol=0):
    """Build a Buckaroo widget that exposes a datacompy comparison of two frames.

    The comparison is computed once. Five ``ColAnalysis`` subclasses are then
    appended to a copy of ``BuckarooWidget.analysis_klasses``; each returns one
    summary table of the comparison from its ``post_process_df``.

    Args:
        df1: First DataFrame to compare.
        df2: Second DataFrame to compare.
        join_columns: Passed to ``datacompy.Compare`` as the join key.
            Defaults to ``'A'``, the value that was previously hard-coded.
        abs_tol: Absolute tolerance passed to ``datacompy.Compare``.
        rel_tol: Relative tolerance passed to ``datacompy.Compare``.

    Returns:
        An instance of a ``BuckarooWidget`` subclass, constructed over an
        empty placeholder frame with the columns ``0`` and ``1``.
    """
    cmp = datacompy.Compare(
        df1, df2,
        join_columns=join_columns,  # Column(s) to join DataFrames on
        abs_tol=abs_tol,  # Absolute tolerance
        rel_tol=rel_tol)  # Relative tolerance

    def get_df_header(cmp):
        """Name, column count and row count of each frame, one column per frame."""
        df_header = pd.DataFrame({
            "DataFrame": [cmp.df1_name, cmp.df2_name],
            "Columns": [cmp.df1.shape[1], cmp.df2.shape[1]],
            "Rows": [cmp.df1.shape[0], cmp.df2.shape[0]]})
        return df_header.T

    def column_summary(cmp):
        """Count of shared columns plus the columns unique to each frame."""
        df1_name = cmp.df1_name
        df2_name = cmp.df2_name
        col_df = pd.DataFrame({
            "Columns in common": [len(cmp.intersect_columns())],
            f"Columns in {df1_name} not in {df2_name}": [cmp.df1_unq_columns().items],
            f"Columns in {df2_name} not in {df1_name}": [cmp.df2_unq_columns().items]})
        return col_df.T

    def row_summary(cmp):
        """Row-level match statistics, one labelled row per statistic."""
        # TODO (carried over): write a pad-array helper to pad arrays to the
        # number of join columns.
        match_criteria = "index"
        if not cmp.on_index:
            match_criteria = ", ".join(cmp.join_columns)
        # These three must be set on every path. They used to sit inside the
        # `if` above, which left them unbound for index-based comparisons.
        has_dupes = cmp._any_dupes
        df1_name = cmp.df1_name
        df2_name = cmp.df2_name

        matching_rows = cmp.count_matching_rows()
        common_rows = cmp.intersect_rows.shape[0]
        row_df = pd.DataFrame({
            "Matched On": [match_criteria],
            "Any Duplicates on match values": [has_dupes],
            "Number of rows in common": [common_rows],
            f"Number of rows in {df1_name} but not in {df2_name}": [cmp.df1_unq_rows.shape[0]],
            f"Number of rows in {df2_name} but not in {df1_name}": [cmp.df2_unq_rows.shape[0]],
            "Number of rows with some compared columns unequal": [common_rows - matching_rows],
            "Number of rows with all compared columns equal": [matching_rows]
        })
        return row_df.T

    def column_matching(cmp):
        """Counts of columns with and without unequal values, plus the total."""
        unequal_count = len([col for col in cmp.column_stats if col["unequal_cnt"] > 0])
        equal_count = len([col for col in cmp.column_stats if col["unequal_cnt"] == 0])
        total_unequal_count = sum(col["unequal_cnt"] for col in cmp.column_stats)

        col_df = pd.DataFrame({
            "Number of columns compared with some values unequal": [unequal_count],
            "Number of columns with all values equal": [equal_count],
            "Total number of values which compare unequal": [total_unequal_count]})
        return col_df.T

    def match_stats(cmp, sample_count=10):
        """Per-column statistics for columns that do not fully match.

        ``sample_count`` is unused and kept only for signature compatibility;
        the mismatch samples it used to size were collected but never returned.
        """
        stat_labels = [
            "Column",
            f"{cmp.df1_name} dtype",
            f"{cmp.df2_name} dtype",
            "# Unequal",
            "Max Diff",
            "# Null Diff",
        ]
        # Keys of each column_stats entry, in the same order as stat_labels.
        stat_keys = ["column", "dtype1", "dtype2", "unequal_cnt", "max_diff", "null_diff"]

        mismatched = [
            {label: column[key] for label, key in zip(stat_labels, stat_keys)}
            for column in cmp.column_stats
            if not column["all_match"]
        ]
        if not mismatched:
            # A frame built from no records has no "Column" column, so
            # sort_values("Column") would raise KeyError when the two frames
            # match completely. Return an empty table with the statistic rows.
            return pd.DataFrame(index=stat_labels)
        return pd.DataFrame(mismatched).sort_values("Column").T

    # Each analysis class below ignores the frame handed to post_process_df and
    # returns a two-element list: a comparison table and an empty dict.
    # NOTE(review): what buckaroo does with the second element and with
    # `post_processing_method` is defined by buckaroo -- confirm there.
    class DfHeader(ColAnalysis):
        @classmethod
        def post_process_df(kls, df):
            return [get_df_header(cmp), {}]
        post_processing_method = "Df Headers"

    class ColumnSummary(ColAnalysis):
        @classmethod
        def post_process_df(kls, df):
            return [column_summary(cmp), {}]
        post_processing_method = "Column Summary"

    class RowSummary(ColAnalysis):
        @classmethod
        def post_process_df(kls, df):
            return [row_summary(cmp), {}]
        post_processing_method = "Row Summary"

    class ColumnMatching(ColAnalysis):
        @classmethod
        def post_process_df(kls, df):
            return [column_matching(cmp), {}]
        post_processing_method = "Column Matching"

    class MatchStats(ColAnalysis):
        @classmethod
        def post_process_df(kls, df):
            return [match_stats(cmp), {}]
        post_processing_method = "Match Stats"

    datacompy_post_processing_klasses = [
        DfHeader, ColumnSummary, RowSummary, ColumnMatching, MatchStats]

    # Extend a copy so BuckarooWidget's own class-level list is left untouched.
    base_a_klasses = BuckarooWidget.analysis_klasses.copy()
    base_a_klasses.extend(datacompy_post_processing_klasses)

    class DatacompyBuckarooWidget(BuckarooWidget):
        analysis_klasses = base_a_klasses

    # The widget is built over an empty placeholder frame; the tables come from
    # the analysis classes above rather than from this frame.
    dcbw = DatacompyBuckarooWidget(pd.DataFrame({}, columns=[0, 1]), debug=False)
    return dcbw
    
# Build the comparison widget for the two sample frames and display it as the cell output.
dcbw = DatacompyBuckaroo(df_a, df_b)
dcbw


In [None]:
# Show the widget's `processed_df`. The widget is bound to `dcbw` earlier in the
# notebook; `dcb` was never defined and raised NameError on a fresh kernel.
dcbw.processed_df

In [None]:
# Show the label the comparison uses for the first frame.
comparison.df1_name