In [3]:
import pandas as pd
import numpy as np
import unicodedata
import string
import sqlalchemy as _sql
import sqlalchemy.ext.declarative as _declarative
import sqlalchemy.orm as _orm
from soupsieve.util import lower
from sqlalchemy import create_engine
import re
from rapidfuzz.fuzz import ratio, partial_ratio
from rapidfuzz.distance import JaroWinkler, Levenshtein
from rapidfuzz.process import extractOne
from fuzzywuzzy import fuzz
from jellyfish import soundex
import itertools
import json

# Data reading

In [4]:
class ReadData:
    """
    Thin loader that turns a CSV file into a pandas DataFrame.

    The most recently loaded frame is kept on ``self.data`` (``None`` until
    ``read_data`` has been called).
    """

    def __init__(self, path):
        """
        Remember where the CSV file lives; nothing is read yet.
        :param path: Location of the CSV file, passed as-is to ``pd.read_csv``.
        """
        self.data = None
        self.path = path  # Can be replaced by database connection later

    def read_data(self):
        """
        Load the CSV file and tag every row with its original position.
        :return: pandas DataFrame with an extra 'ID' column equal to the row index.
        """
        frame = pd.read_csv(self.path)
        # The index right after loading is the original row order; keep it as a
        # regular column so it survives later sorting and filtering.
        frame['ID'] = frame.index
        self.data = frame
        return frame

In [5]:
# Load the restaurant CSV. ReadData.read_data() also adds an 'ID' column holding
# each row's original index, which later cells use to refer back to source rows.
path = "data/restaurant-nophone.csv"
rd = ReadData(path)
source_data = rd.read_data()
# Bare last expression so the notebook renders the loaded frame.
source_data

Unnamed: 0,name,address,city,cuisine,ID
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,american,0
1,arnie morton's of chicago,435 s. la cienega blvd.,los angeles,steakhouses,1
2,art's delicatessen,12224 ventura blvd.,studio city,american,2
3,art's deli,12224 ventura blvd.,studio city,delis,3
4,hotel bel-air,701 stone canyon rd.,bel air,californian,4
...,...,...,...,...,...
859,ti couz,3108 16th st.,san francisco,french,859
860,trio cafe,1870 fillmore st.,san francisco,american,860
861,tu lan,8 sixth st.,san francisco,vietnamese,861
862,vicolo pizzeria,201 ivy st.,san francisco,pizza,862


# Data Pre-processing

- Normalization (lowercasing, removing diacritics and punctuation)
- Tokenization (TBD?)
- Drop duplicates

In [8]:
class DataPreprocessing:
    """
    Normalise the text columns of a DataFrame and drop exact duplicate rows.

    Every step works on ``processed_data``, a copy taken at construction time, so
    the frame passed in is never modified. The text steps only touch selected
    columns that hold text (object or pandas string dtype); other selected columns
    are skipped. Errors are raised to the caller instead of being printed, so a
    failed step can never go unnoticed and leave half-processed data behind.
    """

    # Regex character class matching any single character of string.punctuation.
    _PUNCTUATION_PATTERN = f"[{re.escape(string.punctuation)}]"

    def __init__(self, data):
        """
        Initialize the DataPreprocessor with the data.
        :param data: pandas DataFrame containing the data to be processed.
        """
        self.data = data
        self.columns = None
        self.processed_data = data.copy()  # A copy of the data to avoid modifying the original

    def select_columns(self, columns):
        """
        Select the columns to apply preprocessing on.
        :param columns: List of column names, a single column name, or 'all' / ['all']
                        to select every column except 'ID'.
        :raises ValueError: If nothing is selected or a named column does not exist.
        """
        # Accept a bare string ('all' or one column name) as well as a list.
        if isinstance(columns, str):
            columns = [columns]
        else:
            columns = list(columns)

        if not columns:
            raise ValueError("No columns provided. Pass a list of column names or 'all'.")

        if columns[0] == 'all':
            # Select all columns except the 'ID' column
            self.columns = [col for col in self.data.columns if col != 'ID']
            return

        missing = [col for col in columns if col not in self.processed_data.columns]
        if missing:
            raise ValueError(f"Columns not found in data: {missing}")
        self.columns = columns

    @staticmethod
    def _is_text_dtype(dtype):
        """Return True for dtypes that can hold text: object or a pandas string dtype."""
        return pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)

    def _string_columns(self):
        """
        Return the selected columns that hold text, without changing the selection.
        :raises ValueError: If no columns have been selected yet.
        """
        if self.columns is None:
            raise ValueError("No columns selected for preprocessing. Use select_columns method first.")
        return [
            col for col in self.columns
            if self._is_text_dtype(self.processed_data[col].dtype)
        ]

    def _ensure_non_numeric_string_columns(self):
        """
        Narrow ``self.columns`` to the text columns only.
        Kept for backward compatibility; the preprocessing steps no longer call it
        because narrowing the selection as a side effect changed what later steps saw.
        """
        self.columns = self._string_columns()

    def lowercase(self):
        """
        Convert text to lowercase in the selected text columns.
        :raises ValueError: If no columns have been selected yet.
        """
        for col in self._string_columns():
            self.processed_data[col] = self.processed_data[col].str.lower()

    @staticmethod
    def _strip_diacritics(text):
        """Remove combining marks from a string; non-string values are returned unchanged."""
        if isinstance(text, str):
            # NFKD splits accented characters into base character + combining mark
            # (category 'Mn'); dropping the marks leaves the base characters.
            return ''.join(
                c for c in unicodedata.normalize('NFKD', text)
                if unicodedata.category(c) != 'Mn'
            )
        return text

    def remove_diacritics(self):
        """
        Remove diacritics from text in the selected text columns.
        :raises ValueError: If no columns have been selected yet.
        """
        for col in self._string_columns():
            self.processed_data[col] = self.processed_data[col].apply(self._strip_diacritics)

    def remove_punctuation(self):
        """
        Remove ASCII punctuation (string.punctuation) from the selected text columns.
        :raises ValueError: If no columns have been selected yet.
        """
        for col in self._string_columns():
            self.processed_data[col] = self.processed_data[col].str.replace(
                self._PUNCTUATION_PATTERN, '', regex=True
            )

    def drop_duplicates(self):
        """
        Drop exact duplicates across all columns in the DataFrame, except 'ID'.
        The first occurrence of each duplicated row is kept.
        """
        # 'ID' is unique per row, so it must be left out of the comparison or no
        # row would ever count as a duplicate.
        subset = [col for col in self.processed_data.columns if col != 'ID']
        self.processed_data = self.processed_data.drop_duplicates(subset=subset or None)

    def apply_preprocessing(self, lowercase=False, diacritics_removal=False, punctuation_removal=False):
        """
        Apply preprocessing steps based on user selection.
        The order is: lowercase -> diacritics removal -> punctuation removal -> drop exact duplicates.
        :param lowercase: If True, apply lowercasing to the selected columns.
        :param diacritics_removal: If True, remove diacritics from the selected columns.
        :param punctuation_removal: If True, remove punctuation from the selected columns.
        :return: Preprocessed pandas DataFrame.
        :raises ValueError: If a text step is requested before columns were selected.
        """
        if lowercase:
            self.lowercase()

        if diacritics_removal:
            self.remove_diacritics()

        # Punctuation goes last: NFKD normalisation can itself produce punctuation
        # (e.g. the ellipsis character decomposes into three full stops).
        if punctuation_removal:
            self.remove_punctuation()

        # Drop exact duplicates as the mandatory last step
        self.drop_duplicates()

        return self.processed_data

    def get_processed_data(self):
        """
        Return the preprocessed data.
        :return: Preprocessed pandas DataFrame.
        """
        return self.processed_data

    def dataframe_to_jsonb(self):
        """
        Convert the processed data to JSON-compatible Python objects.
        :return: List of dicts, one per row (dates, if any, as ISO strings).
        """
        # Round-trip through pandas' JSON encoder so values become plain JSON types.
        json_data = self.processed_data.to_json(orient='records', date_format='iso')
        return json.loads(json_data)


In [11]:
# Normalise every column except 'ID' (the 'all' selector) and drop exact duplicates.
preprocessor = DataPreprocessing(source_data)
columns=['all']
lowercase=True 
diacritics_removal=True
punctuation_removal=True
preprocessor.select_columns(columns=columns)
# The return value is ignored here; get_processed_data() returns the same frame.
preprocessor.apply_preprocessing(lowercase=lowercase, diacritics_removal=diacritics_removal, punctuation_removal=punctuation_removal)
preprocessed_data = preprocessor.get_processed_data()
preprocessed_data
# preprocessor.dataframe_to_jsonb()

Unnamed: 0,name,address,city,cuisine,ID
0,arnie mortons of chicago,435 s la cienega blv,los angeles,american,0
1,arnie mortons of chicago,435 s la cienega blvd,los angeles,steakhouses,1
2,arts delicatessen,12224 ventura blvd,studio city,american,2
3,arts deli,12224 ventura blvd,studio city,delis,3
4,hotel belair,701 stone canyon rd,bel air,californian,4
...,...,...,...,...,...
859,ti couz,3108 16th st,san francisco,french,859
860,trio cafe,1870 fillmore st,san francisco,american,860
861,tu lan,8 sixth st,san francisco,vietnamese,861
862,vicolo pizzeria,201 ivy st,san francisco,pizza,862


# Block building

- Sorted Neighborhood Method (SNM)
- Standard Blocking Method
- SNM with dynamic sliding window


In [12]:
class BlockBuilding:
    """
    Assign records to blocks so that only records sharing a block are compared later.

    Three methods are supported: 'standard_blocking' (Soundex blocking keys),
    'sorted_neighborhood' (fixed-size windows over records sorted by a key) and
    'dynamic_sorted_neighborhood' (windows that grow while keys stay similar).
    The result is a copy of the input with a key column ('BKV' or 'SKV') and a
    'block_id' column; the input frame itself is never modified.
    """

    def __init__(self, data, method):
        """
        Initialize the BlockBuilder with data and method.
        :param data: pandas DataFrame containing the entity data.
        :param method: The blocking method to use ('sorted_neighborhood', 'dynamic_sorted_neighborhood', or 'standard_blocking').
        """
        self.data = data
        self.method = method
        self.blocks = None
        self.num_blocks = 0
        self.parameters = {}  # To store parameters used for blocking

    def build_blocks(self, columns=None, window_size=None, max_window_size=None, match_threshold=None, n_letters=3, block_index=1):
        """
        Main function to build blocks using the selected method.
        :param columns: List of columns (or a single column name) to generate BKVs or SKVs.
        :param window_size: Window size for sorted neighborhood method.
        :param max_window_size: Maximum window size for dynamic sorted neighborhood.
        :param match_threshold: Match threshold (0..1) for dynamic sorted neighborhood.
        :param n_letters: Controls the prefix length used for SKVs (see ``_sorted_with_skv``).
        :param block_index: Index of the block to display (optional).
        :return: A specific block based on block_index.
        :raises ValueError: If columns or a parameter required by the method is missing or invalid.
        """
        if columns is None:
            raise ValueError("You must specify the columns for generating keys.")

        # A bare string would otherwise select a Series and break key generation.
        if isinstance(columns, str):
            columns = [columns]

        self.parameters = {  # Store parameters for statistics
            'columns': columns,
            'window_size': window_size,
            'max_window_size': max_window_size,
            'match_threshold': match_threshold,
            'n_letters': n_letters,
        }

        if self.method == 'standard_blocking':
            self.standard_blocking(columns)
        elif self.method == 'sorted_neighborhood':
            if window_size is None:
                raise ValueError("Window size must be provided for the sorted neighborhood method.")
            self.sorted_neighborhood(columns, window_size, n_letters)
        elif self.method == 'dynamic_sorted_neighborhood':
            if max_window_size is None or match_threshold is None:
                raise ValueError("Both max_window_size and match_threshold must be provided for the dynamic sorted neighborhood method.")
            self.dynamic_sorted_neighborhood(columns, max_window_size, match_threshold, n_letters)
        else:
            raise ValueError("Invalid method. Use 'standard_blocking', 'sorted_neighborhood', or 'dynamic_sorted_neighborhood'.")

        return self.display_block(block_index)

    def standard_blocking(self, columns):
        """
        Perform standard blocking using Soundex codes for the selected columns.
        Records with the same blocking key value (BKV) end up in the same block.
        :param columns: List of columns to use for generating BKVs.
        """
        self.blocks = self.data.copy()

        # Generate Soundex code for each selected column and join them with a space;
        # non-string values (e.g. NaN) contribute an empty code.
        self.blocks['BKV'] = self.blocks[columns].apply(
            lambda col: col.map(lambda x: soundex(x) if isinstance(x, str) else '')
        ).agg(' '.join, axis=1)

        # Group by BKV and assign block IDs (1-based)
        self.blocks['block_id'] = self.blocks.groupby('BKV').ngroup() + 1

        # Update the number of blocks
        self.num_blocks = self.blocks['block_id'].nunique()

    def _sorted_with_skv(self, columns, n_letters):
        """
        Return a copy of the data with an 'SKV' column, sorted by it and re-indexed from 0.

        The sorting key value (SKV) concatenates a prefix of each selected column.
        NOTE: the prefix is ``n_letters + 1`` characters long (not ``n_letters``);
        this long-standing behaviour is kept so existing blocks stay reproducible.
        Non-string values (e.g. NaN) contribute an empty prefix.
        """
        prefix_length = n_letters + 1  # slicing already clamps to the string length
        keyed = self.data.copy()
        keyed['SKV'] = keyed[columns].apply(
            lambda col: col.map(lambda x: x[:prefix_length] if isinstance(x, str) else '')
        ).agg(''.join, axis=1)
        return keyed.sort_values(by='SKV').reset_index(drop=True)

    def sorted_neighborhood(self, columns, window_size, n_letters):
        """
        Perform sorted neighborhood blocking: sort by SKV, then cut the sorted
        records into consecutive, non-overlapping windows of ``window_size`` rows.
        :param columns: List of columns to use for generating SKVs.
        :param window_size: Number of consecutive sorted records per block (must be > 0).
        :param n_letters: Controls the SKV prefix length (see ``_sorted_with_skv``).
        :raises ValueError: If window_size is missing or not positive.
        """
        # A zero window would divide by zero below; a negative one yields meaningless IDs.
        if window_size is None or window_size <= 0:
            raise ValueError("window_size must be a positive number.")

        self.blocks = self._sorted_with_skv(columns, n_letters)

        # Assign block IDs based on window size (index is 0..n-1 after the sort)
        self.blocks['block_id'] = (self.blocks.index // window_size) + 1

        # Update the number of blocks
        self.num_blocks = self.blocks['block_id'].nunique()

    def dynamic_sorted_neighborhood(self, columns, max_window_size, match_threshold, n_letters):
        """
        Perform dynamic sorted neighborhood blocking using SKVs.

        Records are sorted by SKV. A block starts at the first unassigned record and
        keeps absorbing the following records while their SKV is similar enough to the
        SKV of that first record, up to ``max_window_size`` records.
        :param columns: List of columns to use for generating SKVs.
        :param max_window_size: Maximum number of records in one block.
        :param match_threshold: Minimum similarity (0..1) for a record to join the block.
        :param n_letters: Controls the SKV prefix length (see ``_sorted_with_skv``).
        """
        self.blocks = self._sorted_with_skv(columns, n_letters)

        # Plain list avoids a pandas lookup on every comparison in the loop below.
        skvs = self.blocks['SKV'].tolist()
        total = len(skvs)
        required_score = match_threshold * 100  # fuzz.ratio is on a 0..100 scale

        block_ids = []
        current_block_id = 1
        window_start = 0

        while window_start < total:
            # Start with a single row
            window_end = window_start + 1
            anchor_skv = skvs[window_start]

            # Expand the window while the candidate is similar to the window's first record
            while window_end < total and (window_end - window_start) < max_window_size:
                if fuzz.ratio(anchor_skv, skvs[window_end]) >= required_score:
                    window_end += 1
                else:
                    break

            # Assign the same block ID to all rows in the current window
            block_ids.extend([current_block_id] * (window_end - window_start))

            # The next block starts right after this window
            window_start = window_end
            current_block_id += 1

        # Assign block IDs back to the dataframe
        self.blocks['block_id'] = block_ids

        # Update the number of blocks
        self.num_blocks = current_block_id - 1

    def get_num_blocks(self):
        """
        Return the total number of blocks generated.
        :return: Integer count of blocks.
        """
        return self.num_blocks

    def used_methods(self):
        """
        Return the blocking method used.
        :return: String name of the blocking method.
        """
        return self.method

    def used_parameters(self):
        """
        Return the parameters used for the blocking method.
        :return: Dictionary of parameters.
        """
        return self.parameters

    def display_block(self, block_index=1):
        """
        Display a specific block by block_id.
        :param block_index: The index of the block to display.
        :return: DataFrame containing the specified block (empty if no such block).
        :raises ValueError: If no blocks have been generated yet.
        """
        blocks = self.get_blocks()
        return blocks[blocks['block_id'] == block_index]

    def get_blocks(self):
        """
        Return all generated blocks.
        :return: DataFrame containing all blocks.
        :raises ValueError: If no blocks have been generated yet.
        """
        if self.blocks is None:
            raise ValueError("No blocks have been generated. Run block building first.")

        return self.blocks

    def dataframe_to_jsonb(self):
        """
        Convert the generated blocks to JSON-compatible Python objects.
        :return: List of dicts, one per row (dates, if any, as ISO strings).
        :raises ValueError: If no blocks have been generated yet.
        """
        # Round-trip through pandas' JSON encoder so values become plain JSON types.
        json_data = self.get_blocks().to_json(orient='records', date_format='iso')
        return json.loads(json_data)

In [170]:
# Standard blocking on the Soundex codes of city and name.
# NOTE: `columns` defined here is reused by the two sorted-neighbourhood cells below.
method='standard_blocking'
columns=['city', 'name']
block_builder = BlockBuilding(preprocessed_data, method=method)
# build_blocks returns only block 1 (default block_index); the full table is fetched next.
block_builder.build_blocks(columns=columns)
all_blocks = block_builder.get_blocks()
# block_builder.display_block(100)
all_blocks

Unnamed: 0,name,address,city,cuisine,ID,BKV,block_id
0,arnie mortons of chicago,435 s la cienega blv,los angeles,american,0,425 A655,370
1,arnie mortons of chicago,435 s la cienega blvd,los angeles,steakhouses,1,425 A655,370
2,arts delicatessen,12224 ventura blvd,studio city,american,2,233 A632,28
3,arts deli,12224 ventura blvd,studio city,delis,3,233 A632,28
4,hotel belair,701 stone canyon rd,bel air,californian,4,146 H341,16
...,...,...,...,...,...,...,...
859,ti couz,3108 16th st,san francisco,french,859,251 T220,155
860,trio cafe,1870 fillmore st,san francisco,american,860,251 T621,160
861,tu lan,8 sixth st,san francisco,vietnamese,861,251 T450,158
862,vicolo pizzeria,201 ivy st,san francisco,pizza,862,251 V241,162


In [171]:
# Sorted neighbourhood with fixed windows of 20 records.
# Relies on `columns` from the standard-blocking cell above.
block_builder = BlockBuilding(preprocessed_data, method='sorted_neighborhood')
block_builder.build_blocks(window_size=20, columns=columns, n_letters=4)
# Displayed only; `all_blocks` is not rebound by this cell.
block_builder.get_blocks()
# block_builder.display_block(1)

Unnamed: 0,name,address,city,cuisine,ID,SKV,block_id
0,103 west,103 w paces ferry rd,atlanta,continental,786,atla103 w,1
1,abbey,163 ponce de leon ave,atlanta,international,491,atlaabbey,1
2,abruzzi,2355 peachtree rd ne,atlanta,italian,149,atlaabruz,1
3,abruzzi,2355 peachtree rd peachtree battle shopping ...,atlanta,italian,148,atlaabruz,1
4,alecks barbecue heaven,783 martin luther king jr dr,atlanta,barbecue,492,atlaaleck,1
...,...,...,...,...,...,...,...
853,don antonios,1136 westwood blvd,westwood,italian,661,westdon a,43
854,falafel king,1059 broxton ave,westwood,middle eastern,663,westfalaf,43
855,feast from the east,1949 westwood blvd,west la,chinese,664,westfeast,43
856,john ogroats,10516 w pico blvd,west la,coffee shops,672,westjohn,43


In [172]:
# Dynamic sorted neighbourhood: windows grow while SKVs are at least 70% similar, up to 20 records.
# Relies on `columns` from the standard-blocking cell above.
block_builder = BlockBuilding(preprocessed_data, method='dynamic_sorted_neighborhood')
block_builder.build_blocks(max_window_size=20, match_threshold=0.7, columns=columns, n_letters=4)
# Rebinds `all_blocks`: the comparison and classification cells below read these dynamic blocks.
all_blocks = block_builder.get_blocks()
all_blocks
# block_builder.display_block(1)

Unnamed: 0,name,address,city,cuisine,ID,SKV,block_id
0,103 west,103 w paces ferry rd,atlanta,continental,786,atla103 w,1
1,abbey,163 ponce de leon ave,atlanta,international,491,atlaabbey,2
2,abruzzi,2355 peachtree rd ne,atlanta,italian,149,atlaabruz,2
3,abruzzi,2355 peachtree rd peachtree battle shopping ...,atlanta,italian,148,atlaabruz,2
4,alecks barbecue heaven,783 martin luther king jr dr,atlanta,barbecue,492,atlaaleck,2
...,...,...,...,...,...,...,...
853,don antonios,1136 westwood blvd,westwood,italian,661,westdon a,310
854,falafel king,1059 broxton ave,westwood,middle eastern,663,westfalaf,311
855,feast from the east,1949 westwood blvd,west la,chinese,664,westfeast,311
856,john ogroats,10516 w pico blvd,west la,coffee shops,672,westjohn,312


# Field and Record Comparison:

* Q-gram comparison
* Jaro-Winkler
* Soundex

In [13]:

class Comparison:
    """
    Pairwise field comparison of records that share a block.

    The similarity functions are static and return a score in [0, 1]. Any callable
    taking two field values can be plugged into ``compare_within_blocks``.
    """

    def __init__(self, data):
        """
        Initialize the Comparison class with the data.
        :param data: pandas DataFrame containing the data to be compared
                     (must hold the block column, an 'ID' column and the compared columns).
        """
        self.data = data
        self.comparison_results = None
        self.methods = []
        self.parameters = {}

    @staticmethod
    def levenshtein_similarity(str1, str2):
        """
        Calculate the normalized Levenshtein similarity between two strings
        (rapidfuzz ``Levenshtein.normalized_similarity``).
        Ensures the result is between 0 and 1.
        """
        score = Levenshtein.normalized_similarity(str1, str2)
        return max(0, min(1, score))  # Ensure the value is between 0 and 1

    @staticmethod
    def jaro_winkler_similarity(str1, str2):
        """
        Calculate the Jaro-Winkler similarity between two strings
        (rapidfuzz ``JaroWinkler.similarity``).
        Ensures the result is between 0 and 1.
        """
        score = JaroWinkler.similarity(str1, str2)
        return max(0, min(1, score))  # Ensure the value is between 0 and 1

    @staticmethod
    def qgram_similarity(str1, str2, q=2):
        """
        Calculate the Q-gram similarity between two strings as the Jaccard index of
        their q-gram sets: shared q-grams divided by all distinct q-grams.
        The score is symmetric and always between 0 and 1.
        :param str1: First string.
        :param str2: Second string.
        :param q: Length of the q-grams (default 2, i.e. bigrams).
        :return: Float in [0, 1]; 0.0 if either value is not a string (e.g. NaN/None).
        :raises ValueError: If q is smaller than 1.
        """
        if q < 1:
            raise ValueError("q must be a positive integer.")
        # Missing values cannot be compared; treat them as "no similarity".
        if not isinstance(str1, str) or not isinstance(str2, str):
            return 0.0
        # Identical strings always match, even when they are too short to form a q-gram.
        if str1 == str2:
            return 1.0

        qgrams1 = {str1[i:i + q] for i in range(len(str1) - q + 1)}
        qgrams2 = {str2[i:i + q] for i in range(len(str2) - q + 1)}
        all_qgrams = qgrams1 | qgrams2
        if not all_qgrams:
            return 0.0
        return len(qgrams1 & qgrams2) / len(all_qgrams)

    def compare_within_blocks(self, block_col, column_algorithms):
        """
        Compare all possible pairs within each block for specified columns.
        :param block_col: The column name containing block IDs.
        :param column_algorithms: Dictionary where keys are column names and values are comparison functions.
        :return: DataFrame with one row per pair and the columns 'block_id', 'row1', 'row2'
                 (the 'ID' values of the two records) and '<column>_similarity' for each
                 compared column, sorted by 'block_id'. Empty (but with these columns)
                 when no block holds more than one record.
        :raises ValueError: If the block column, the 'ID' column or a compared column is missing.
        """
        if block_col not in self.data.columns:
            raise ValueError(f"Block column '{block_col}' not found in data.")

        if "ID" not in self.data.columns:
            raise ValueError("Column 'ID' not found in data.")

        for col in column_algorithms:
            if col not in self.data.columns:
                raise ValueError(f"Comparison column '{col}' not found in data.")

        # Store methods and parameters for statistics
        self.methods = list(column_algorithms.keys())
        # functools.partial objects and similar callables have no __name__.
        self.parameters = {
            col: getattr(func, '__name__', repr(func))
            for col, func in column_algorithms.items()
        }

        results = []

        for block_id, group in self.data.groupby(block_col):
            # Plain dicts are much cheaper to pair up than the Series built by iterrows().
            records = group.to_dict('records')

            for row1, row2 in itertools.combinations(records, 2):
                result = {
                    "block_id": block_id,
                    "row1": row1["ID"],  # Use the ID column instead of table index
                    "row2": row2["ID"],  # Use the ID column instead of table index
                }

                # Apply the specified algorithm to each column
                for col, comparison_func in column_algorithms.items():
                    result[f"{col}_similarity"] = comparison_func(row1[col], row2[col])

                results.append(result)

        # Fixing the columns keeps the schema intact even when there are no pairs.
        result_columns = ["block_id", "row1", "row2"] + [
            f"{col}_similarity" for col in column_algorithms
        ]
        # Stable sort keeps the pair order within each block; the stored and the
        # returned frame are the same object so both accessors agree.
        self.comparison_results = (
            pd.DataFrame(results, columns=result_columns)
            .sort_values(by="block_id", kind="mergesort")
            .reset_index(drop=True)
        )
        return self.comparison_results

    def get_comparison_results(self):
        """
        Get the comparison results.
        :return: DataFrame containing the comparison results.
        :raises ValueError: If no comparison has been run yet.
        """
        if self.comparison_results is None:
            raise ValueError("No comparison results available. Run 'compare_within_blocks' first.")
        return self.comparison_results

    def used_methods(self):
        """
        Get the comparison methods used.
        :return: List of column names compared.
        """
        return self.methods

    def used_parameters(self):
        """
        Get the parameters used for the comparison methods.
        :return: Dictionary mapping each compared column to the name of its comparison function.
        """
        return self.parameters

    def dataframe_to_jsonb(self):
        """
        Convert the comparison results to JSON-compatible Python objects.
        :return: List of dicts, one per compared pair.
        :raises ValueError: If no comparison has been run yet.
        """
        # Round-trip through pandas' JSON encoder so values become plain JSON types.
        json_data = self.get_comparison_results().to_json(orient='records', date_format='iso')
        return json.loads(json_data)


In [174]:
# `all_blocks` was last rebound by the dynamic sorted-neighbourhood cell, so pairs
# are compared within those dynamic blocks.
comparison = Comparison(all_blocks)

# Columns available for comparison: name, address, city, cuisine
# Each column is mapped to the similarity function used for it.
column_algorithms = {
    "name": comparison.qgram_similarity,
    "address": comparison.jaro_winkler_similarity,
    "city": comparison.jaro_winkler_similarity,
    "cuisine": comparison.qgram_similarity
}

# Compare within blocks
comparison_results = comparison.compare_within_blocks(
    block_col="block_id",
    column_algorithms=column_algorithms
)
comparison_results



Unnamed: 0,block_id,row1,row2,name_similarity,address_similarity,city_similarity,cuisine_similarity
0,2,491,149,0.111111,0.659729,1.000000,0.117647
1,2,491,148,0.111111,0.574155,1.000000,0.117647
2,2,491,492,0.043478,0.535528,1.000000,0.000000
3,2,149,148,1.000000,0.871245,1.000000,1.000000
4,2,149,492,0.000000,0.586544,1.000000,0.000000
...,...,...,...,...,...,...,...
1962,296,2,3,0.500000,1.000000,1.000000,0.000000
1963,302,670,671,0.035714,0.666886,1.000000,0.000000
1964,307,31,41,0.500000,0.737596,1.000000,0.000000
1965,309,646,647,0.000000,0.516374,0.869118,0.000000


# Classification

* For now only threshold-based classification is demonstrated, but it is trivial to switch to the other supported methods (`weighted`, `cost_based`).

In [14]:
class Classifier:
    def __init__(self, blocked_data, comparison_table):
        """
        Initialize the MatchClassifier with blocked data and comparison table.
        :param blocked_data: DataFrame containing blocked source data with block_ids and SKVs/BKVs.
        :param comparison_table: DataFrame containing pairwise comparisons with similarities.
        """
        self.blocked_data = blocked_data
        self.comparison_table = comparison_table
        self.classification_results = None  # To store classification results

    def classify_matches(self, method='threshold_based', thresholds=None, weights=None, possible_match=False, costs=None, probabilities=None):
        """
        Classify the results based on the selected method.
        :param method: The classification method to use ('threshold_based', 'weighted', 'cost_based').
        :param thresholds: Dictionary with thresholds for classification.
        :param weights: Dictionary with weights for each similarity column (only for 'weighted' method).
        :param possible_match: Boolean indicating whether to include 'Possible Match' as a category.
        :param costs: Dictionary with costs for cost-based classification.
        :param probabilities: Dictionary with prior probabilities for cost-based classification.
        :return: DataFrame with classifications added.
        """
        
        self.method = method
        self.parameters = {
            'thresholds': thresholds,
            'weights': weights,
            'possible_match': possible_match,
            'costs': costs,
            'probabilities': probabilities,
        }
        
        if method == 'threshold_based':
            if thresholds is None:
                raise ValueError("Thresholds must be provided for threshold-based classification.")
            self.classification_results = self._threshold_based_classification(thresholds, possible_match)
        
        elif method == 'weighted':
            if thresholds is None or weights is None:
                raise ValueError("Both thresholds and weights must be provided for weighted classification.")
            self.classification_results = self._weighted_classification(thresholds, weights)
        
        elif method == 'cost_based':
            if costs is None or probabilities is None:
                raise ValueError("Costs and probabilities must be provided for cost-based classification.")
            self.classification_results = self._cost_based_classification(costs, probabilities)
        
        else:
            raise ValueError(f"Unknown classification method: {method}")

        return self.classification_results

    def _threshold_based_classification(self, thresholds, possible_match):
        merged_data = self._merge_row_details()
        similarity_columns = [col for col in self.comparison_table.columns if col.endswith('_similarity')]
        merged_data['average_similarity'] = merged_data[similarity_columns].mean(axis=1)
        
        merged_data['classification'] = merged_data['average_similarity'].apply(
            lambda similarity: self._classify_by_thresholds(similarity, thresholds, possible_match)
        )
        return merged_data

    def _weighted_classification(self, thresholds, weights):
        merged_data = self._merge_row_details()
        similarity_columns = [col for col in self.comparison_table.columns if col.endswith('_similarity')]

        if not set(weights.keys()).issubset(set(similarity_columns)):
            raise ValueError("All keys in weights must match similarity columns.")

        merged_data['weighted_similarity'] = sum(
            merged_data[col] * weight for col, weight in weights.items()
        )
        
        min_similarity = merged_data['weighted_similarity'].min()
        max_similarity = merged_data['weighted_similarity'].max()
        merged_data['normalized_similarity'] = (
            (merged_data['weighted_similarity'] - min_similarity) /
            (max_similarity - min_similarity)
            if max_similarity > min_similarity else 0
        )
        
        merged_data['classification'] = merged_data['normalized_similarity'].apply(
            lambda similarity: 'Match' if similarity >= thresholds['match'] else 'Non-Match'
        )
        
        return merged_data

    def _cost_based_classification(self, costs, probabilities):
        merged_data = self._merge_row_details()
        similarity_columns = [col for col in self.comparison_table.columns if col.endswith('_similarity')]
        merged_data['average_similarity'] = merged_data[similarity_columns].mean(axis=1)
        
        P_M = probabilities['M']
        P_U = probabilities['U']
        merged_data['cost_non_match'] = (
            costs['non_match_true_match'] * merged_data['average_similarity'] * P_M +
            costs['non_match_true_non_match'] * (1 - merged_data['average_similarity']) * P_U
        )
        merged_data['cost_match'] = (
            costs['match_true_match'] * merged_data['average_similarity'] * P_M +
            costs['match_true_non_match'] * (1 - merged_data['average_similarity']) * P_U
        )
        
        merged_data['classification'] = merged_data.apply(
            lambda row: 'Match' if row['cost_match'] < row['cost_non_match'] else 'Non-Match',
            axis=1
        )
        
        return merged_data

    def _merge_row_details(self):
        """
        Merge row1 and row2 details into the comparison table using the `ID` column.
        :return: Merged DataFrame with row details added.
        """
        # Fetch rows based on 'ID' instead of index
        row1_details = self.blocked_data.set_index('ID').loc[self.comparison_table['row1']].reset_index(drop=True)
        row2_details = self.blocked_data.set_index('ID').loc[self.comparison_table['row2']].reset_index(drop=True)
        
        # Merge the details into the comparison table
        merged_data = self.comparison_table.copy()
        for col in self.blocked_data.columns:
            if col not in ['block_id', 'SKV', 'BKV', 'ID']:  # Exclude metadata columns
                merged_data[f'row1_{col}'] = row1_details[col].values
                merged_data[f'row2_{col}'] = row2_details[col].values
        return merged_data

    def _classify_by_thresholds(self, similarity, thresholds, possible_match):
        if possible_match:
            if similarity < thresholds['not_match']:
                return 'Not Match'
            elif thresholds['not_match'] <= similarity < thresholds['match']:
                return 'Possible Match'
            else:
                return 'Match'
        else:
            if similarity < thresholds['match']:
                return 'Not Match'
            else:
                return 'Match'

    def get_classification_results(self):
        """
        Return the stored classification results.
        :return: DataFrame held in `self.classification_results`.
        :raises ValueError: If no results have been stored yet.
        """
        results = self.classification_results
        if results is not None:
            return results
        raise ValueError("No classification results available. Run 'classify_matches' first.")

    def used_methods(self):
        """
        Report which classification method this instance recorded.
        :return: The value stored in `self.method`.
        """
        chosen_method = self.method
        return chosen_method

    def used_parameters(self):
        """
        Report the parameters recorded for the classification method.
        :return: The value stored in `self.parameters`.
        """
        recorded_parameters = self.parameters
        return recorded_parameters
    
    def dataframe_to_jsonb(self):
        """
        Convert the stored classification results into JSONB-compatible records.

        The DataFrame in `self.classification_results` is serialised by pandas
        (`orient='records'`, ISO date format) and parsed back with `json.loads`,
        so the result is made of plain JSON types only.

        :return: List of dictionaries, one per row of the classification results.
        :raises ValueError: If no classification results are available yet.
        """
        # Fail with the same message as get_classification_results instead of
        # an AttributeError on None.
        if self.classification_results is None:
            raise ValueError("No classification results available. Run 'classify_matches' first.")

        json_data = self.classification_results.to_json(orient='records', date_format='iso')
        return json.loads(json_data)


In [176]:
classifier = Classifier(all_blocks, comparison_results)

# Classification method that is actually passed to classify_matches below.
method = 'weighted'
possible_match = False
# With possible_match=True a lower bound is required as well, e.g.
# thresholds = {'not_match': 0.6, 'match': 0.75}
thresholds = {'match': 0.8}
weights = {'name_similarity': 1, 'address_similarity': 1, 'city_similarity': 0.5, 'cuisine_similarity': 0.1}

# Settings for the alternative cost-based method (not used by the weighted run below).
costs = {
    'non_match_true_match': 5,  # Cost of classifying a true match as a non-match
    'non_match_true_non_match': 1,  # Cost of classifying a true non-match as a non-match
    'match_true_match': 1,  # Cost of classifying a true match as a match
    'match_true_non_match': 10  # Cost of classifying a true non-match as a match
}
probabilities = {'M': 0.3, 'U': 0.7}

# Cost-based alternative:
# classified_results = classifier.classify_matches(
#     method='cost_based',
#     costs=costs,
#     probabilities=probabilities
# )

classified_results = classifier.classify_matches(
    method=method,
    thresholds=thresholds,
    possible_match=possible_match,
    weights=weights
)

# Show only the pairs classified as matches.
classified_results[classified_results['classification'] == 'Match']


Unnamed: 0,block_id,row1,row2,name_similarity,address_similarity,city_similarity,cuisine_similarity,row1_name,row2_name,row1_address,row2_address,row1_city,row2_city,row1_cuisine,row2_cuisine,weighted_similarity,normalized_similarity,classification
3,2,149,148,1.0,0.871245,1.0,1.000000,abruzzi,abruzzi,2355 peachtree rd ne,2355 peachtree rd peachtree battle shopping ...,atlanta,atlanta,italian,italian,2.471245,0.926562,Match
12,4,150,151,1.0,0.894444,1.0,0.095238,bacchanalia,bacchanalia,3125 piedmont rd near peachtree rd,3125 piedmont rd,atlanta,atlanta,international,californian,2.403968,0.888189,Match
28,7,154,155,1.0,0.863158,1.0,0.461538,brasserie le coze,brasserie le coze,3393 peachtree rd lenox square mall near ne...,3393 peachtree rd,atlanta,atlanta,french,french bistro,2.409312,0.891236,Match
33,8,156,157,1.0,0.978947,1.0,0.666667,buckhead diner,buckhead diner,3073 piedmont road,3073 piedmont rd,atlanta,atlanta,american,american new,2.545614,0.968980,Match
65,13,161,160,1.0,1.000000,1.0,0.200000,delectables,delectables,1 margaret mitchell sq,1 margaret mitchell sq,atlanta,atlanta,cafeterias,american,2.520000,0.954370,Match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1867,255,212,213,1.0,1.000000,1.0,0.600000,masas,masas,648 bush st,648 bush st,san francisco,san francisco,french,french new,2.560000,0.977185,Match
1900,264,218,219,1.0,1.000000,1.0,0.117647,postrio,postrio,545 post st,545 post st,san francisco,san francisco,american,californian,2.511765,0.949673,Match
1903,264,217,216,1.0,0.951961,1.0,0.200000,plumpjack cafe,plumpjack cafe,3127 fillmore st,3201 fillmore st,san francisco,san francisco,american new,mediterranean,2.471961,0.926970,Match
1955,284,11,10,1.0,1.000000,1.0,0.000000,chinois on main,chinois on main,2709 main st,2709 main st,santa monica,santa monica,pacific new wave,french,2.500000,0.942963,Match


# Evaluation


In [192]:
class Evaluation:
    """
    Summarise the outcome of a deduplication run.

    Record identifiers found in the `row1` / `row2` columns of the classified
    data are resolved against the `ID` column of the source data. When the
    source data has no `ID` column, its index labels are used instead.
    """

    def __init__(self, source_data, classified_data):
        """
        Initialize the Evaluation class with the source data and classified data.
        :param source_data: DataFrame containing the original source data.
        :param classified_data: DataFrame containing classified match results.
        """
        self.source_data = source_data
        self.classified_data = classified_data

    def _record_ids(self):
        """Return the identifiers of the source rows as a pandas Index."""
        if 'ID' in self.source_data.columns:
            return pd.Index(self.source_data['ID'])
        return self.source_data.index

    def _matched_pairs(self):
        """Return the rows of the classified data labelled 'Match'."""
        return self.classified_data[self.classified_data['classification'] == 'Match']

    def _positions_of(self, record_ids):
        """
        Translate record identifiers into row positions of the source data.
        :raises ValueError: If the source identifiers are not unique.
        :raises KeyError: If an identifier is not present in the source data.
        """
        ids = self._record_ids()
        if not ids.is_unique:
            raise ValueError("Record identifiers in source_data must be unique.")
        wanted = np.asarray(record_ids)
        positions = ids.get_indexer(wanted)
        # get_indexer marks unknown identifiers with -1, which iloc would
        # silently interpret as "last row" - so reject them explicitly.
        unknown = positions < 0
        if unknown.any():
            missing = sorted(set(wanted[unknown].tolist()))
            raise KeyError(f"Identifiers not found in source_data: {missing}")
        return positions

    def show_matches_side_by_side(self):
        """
        Show all rows classified as 'Match' in an unflattened format,
        with an additional 'dropped' column ('NO' for row1 and 'YES' for row2)
        and a 'dedup_id' column shared by the two rows of each matched pair.
        :return: DataFrame with matched rows from source_data, 'dedup_id' first.
                 Empty (but with the full set of columns) when nothing matched.
        """
        matches = self._matched_pairs()
        pair_count = len(matches)

        kept_positions = self._positions_of(matches['row1'])
        dropped_positions = self._positions_of(matches['row2'])
        # Interleave so each pair appears as: row1, row2, row1, row2, ...
        interleaved = np.column_stack([kept_positions, dropped_positions]).ravel()

        result_df = self.source_data.iloc[interleaved].copy()
        result_df['dropped'] = ['NO', 'YES'] * pair_count
        result_df['dedup_id'] = np.repeat(np.arange(1, pair_count + 1), 2)

        columns = ['dedup_id'] + [col for col in result_df.columns if col != 'dedup_id']
        return result_df[columns]

    def get_deduplicated_data(self):
        """
        Get deduplicated data by dropping every source row whose identifier
        appears in `row2` of a pair classified as 'Match'.
        :return: Deduplicated DataFrame with a fresh 0..n-1 index.
        """
        to_drop = self._matched_pairs()['row2'].unique()
        keep_mask = ~self._record_ids().isin(to_drop)
        return self.source_data[keep_mask].reset_index(drop=True)

    def get_statistics(self):
        """
        Get statistics about the deduplication process, including:
        - Number of detected duplicates
        - Row count before deduplication
        - Row count after deduplication
        - Duplicate percentage (0.0 when the source data is empty)
        - Average similarity of rows in blocks (block_id); NaN when the
          classified data carries no known overall-similarity column.
        :return: Single-row DataFrame with statistics.
        """
        row_count_before = len(self.source_data)
        row_count_after = len(self.get_deduplicated_data())
        num_duplicates = row_count_before - row_count_after
        if row_count_before:
            duplicate_percentage = (num_duplicates / row_count_before) * 100
        else:
            duplicate_percentage = 0.0

        # The overall score is called 'normalized_similarity' for the weighted
        # method and 'average_similarity' for the cost-based one.
        similarity_column = next(
            (col for col in ('normalized_similarity', 'average_similarity')
             if col in self.classified_data.columns),
            None,
        )
        if similarity_column is None:
            avg_similarity = float('nan')
        else:
            # Mean of the per-block means
            avg_similarity = (
                self.classified_data.groupby('block_id')[similarity_column]
                .mean()
                .mean()
            )

        stats = {
            'Detected duplicates': num_duplicates,
            'Row count before deduplication': row_count_before,
            'Row count after deduplication': row_count_after,
            'Duplicate percentage': round(duplicate_percentage, 2),
            'Average similarity per block': round(avg_similarity, 2)
        }
        return pd.DataFrame([stats])

    @staticmethod
    def used_methods_parameters(*workflow_objects):
        """
        Collect the method and parameters of all workflow steps into a table.
        :param workflow_objects: Instances of classes (e.g., BlockBuilding, Comparison, Classifier)
                                 exposing `used_methods()` and `used_parameters()`.
        :return: DataFrame summarizing methods and parameters for each step.
        """
        stats = [
            {
                'Step': obj.__class__.__name__,
                'Method': obj.used_methods(),
                'Parameters': obj.used_parameters(),
            }
            for obj in workflow_objects
        ]
        return pd.DataFrame(stats)
    


In [193]:
# Build the evaluation helper from the raw source rows and the classifier output,
# then list every matched pair: the row1 record (dropped='NO') followed by the
# row2 record (dropped='YES'), both sharing one dedup_id.
evaluation = Evaluation(source_data, classified_results)
matches_side_by_side = evaluation.show_matches_side_by_side()
matches_side_by_side

Unnamed: 0,dedup_id,name,address,city,cuisine,ID,dropped
149,1,abruzzi,2355 peachtree rd. ne,atlanta,italian,149,NO
148,1,abruzzi,2355 peachtree rd. peachtree battle shopping...,atlanta,italian,148,YES
150,2,bacchanalia,3125 piedmont rd. near peachtree rd.,atlanta,international,150,NO
151,2,bacchanalia,3125 piedmont rd.,atlanta,californian,151,YES
154,3,brasserie le coze,3393 peachtree rd. lenox square mall near n...,atlanta,french,154,NO
...,...,...,...,...,...,...,...
216,68,plumpjack cafe,3201 fillmore st.,san francisco,mediterranean,216,YES
11,69,chinois on main,2709 main st.,santa monica,pacific new wave,11,NO
10,69,chinois on main,2709 main st.,santa monica,french,10,YES
7,70,cafe bizou,14016 ventura blvd.,sherman oaks,french bistro,7,NO


In [194]:
# Source data without the row2 record of every pair classified as 'Match'.
deduplicated_data = evaluation.get_deduplicated_data()
deduplicated_data

Unnamed: 0,name,address,city,cuisine,ID
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,american,0
1,art's delicatessen,12224 ventura blvd.,studio city,american,2
2,art's deli,12224 ventura blvd.,studio city,delis,3
3,bel-air hotel,701 stone canyon rd.,bel air,californian,5
4,cafe bizou,14016 ventura blvd.,sherman oaks,french bistro,7
...,...,...,...,...,...
789,ti couz,3108 16th st.,san francisco,french,859
790,trio cafe,1870 fillmore st.,san francisco,american,860
791,tu lan,8 sixth st.,san francisco,vietnamese,861
792,vicolo pizzeria,201 ivy st.,san francisco,pizza,862


In [195]:
# One-row summary: row counts before/after, duplicate percentage and the
# mean of the per-block average similarities.
statistics = evaluation.get_statistics()
statistics

Unnamed: 0,Detected duplicates,Row count before deduplication,Row count after deduplication,Duplicate percentage,Average similarity per block
0,70,864,794,8.1,0.29


In [196]:
# Tabulate the method and parameters reported by each workflow step object.
workflow_methods = evaluation.used_methods_parameters(block_builder, comparison, classifier)
workflow_methods

Unnamed: 0,Step,Method,Parameters
0,BlockBuilding,dynamic_sorted_neighborhood,"{'columns': ['city', 'name'], 'window_size': N..."
1,Comparison,"[name, address, city, cuisine]","{'name': 'qgram_similarity', 'address': 'jaro_..."
2,Classifier,weighted,"{'thresholds': {'match': 0.8}, 'weights': {'na..."
