In [1]:
import pandas as pd
import numpy as np
import unicodedata
import string
import sqlalchemy as _sql
import sqlalchemy.ext.declarative as _declarative
import sqlalchemy.orm as _orm
from soupsieve.util import lower
from sqlalchemy import create_engine
import re
from rapidfuzz.fuzz import ratio, partial_ratio
from rapidfuzz.distance import JaroWinkler, Levenshtein
from rapidfuzz.process import extractOne
from fuzzywuzzy import fuzz
from jellyfish import soundex
import itertools
import json

# Data reading

In [2]:
class ReadData:
    def __init__(self, path):
        """
        Initialize with the path to the CSV file.
        """
        self.path = path  # Can be replaced by database connection later
        self.data = None

    def read_data(self):
        """
        Read data from the CSV file using pandas.
        Adds an 'ID' column for tracking original indexes.
        Returns a pandas DataFrame.
        """
        self.data = pd.read_csv(self.path)
        self.data['ID'] = self.data.index  # Add an ID column with the original row index
        return self.data

In [3]:
path = "data/restaurant-nophone.csv"
rd = ReadData(path)
source_data = rd.read_data()
source_data

Unnamed: 0,name,address,city,cuisine,ID
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,american,0
1,arnie morton's of chicago,435 s. la cienega blvd.,los angeles,steakhouses,1
2,art's delicatessen,12224 ventura blvd.,studio city,american,2
3,art's deli,12224 ventura blvd.,studio city,delis,3
4,hotel bel-air,701 stone canyon rd.,bel air,californian,4
...,...,...,...,...,...
859,ti couz,3108 16th st.,san francisco,french,859
860,trio cafe,1870 fillmore st.,san francisco,american,860
861,tu lan,8 sixth st.,san francisco,vietnamese,861
862,vicolo pizzeria,201 ivy st.,san francisco,pizza,862


# Data Pre-processing

- Normalization (lowercasing, removing diacritics, punctuations)
- Tokenization (TBD?)
- Drop duplicates

In [4]:
class DataPreprocessing:
    def __init__(self, data):
        """
        Initialize the DataPreprocessor with the data.
        :param data: pandas DataFrame containing the data to be processed.
        """
        self.data = data
        self.columns = None
        self.processed_data = data.copy()  # A copy of the data to avoid modifying the original

    def select_columns(self, columns):
        """
        Select the columns to apply preprocessing on.
        If 'all' is passed, all columns will be selected.
        :param columns: List of columns to be normalized, or 'all' to select all columns.
        """
        if columns[0] == 'all':
            # Select all columns except the 'ID' column
            self.columns = [col for col in self.data.columns if col != 'ID']
        else:
            # Otherwise, use the provided list of columns
            self.columns = columns

    def _ensure_non_numeric_string_columns(self):
        """
        Internal method to ensure that only non-numeric string columns are selected for string operations.
        """
        # Filter out non-string columns (Int64, Float64, etc.)
        self.columns = [
            col for col in self.columns if self.processed_data[col].dtype == 'object'
        ]

    def lowercase(self):
        """
        Convert text to lowercase in the selected columns.
        """
        if self.columns is None:
            raise ValueError("No columns selected for preprocessing. Use select_columns method first.")

        try:
            for col in self.columns:
                self.processed_data[col] = self.processed_data[col].str.lower()
        except Exception as e:
            print(f"Error applying lowercase operation: {e}")

    def remove_diacritics(self):
        """
        Remove diacritics from text in the selected columns.
        """
        if self.columns is None:
            raise ValueError("No columns selected for preprocessing. Use select_columns method first.")

        def _remove_diacritics(text):
            if isinstance(text, str):
                return ''.join(
                    c for c in unicodedata.normalize('NFKD', text)
                    if unicodedata.category(c) != 'Mn'
                )
            return text

        try:
            for col in self.columns:
                self.processed_data[col] = self.processed_data[col].apply(_remove_diacritics)
        except Exception as e:
            print(f"Error removing diacritics: {e}")

    def remove_punctuation(self):
        """
        Remove punctuation from text in the selected columns.
        """
        if self.columns is None:
            raise ValueError("No columns selected for preprocessing. Use select_columns method first.")

        # Ensure only string columns are processed
        self._ensure_non_numeric_string_columns()

        punctuation_pattern = f"[{re.escape(string.punctuation)}]"

        try:
            for col in self.columns:
                self.processed_data[col] = self.processed_data[col].str.replace(
                    punctuation_pattern, '', regex=True
                )
        except Exception as e:
            print(f"Error removing punctuation: {e}")

    def drop_duplicates(self):
        """
        Drop exact duplicates across all columns in the DataFrame, except 'ID'.
        """
        try:
            # Drop duplicates while preserving the 'ID' column
            self.processed_data = self.processed_data.drop_duplicates(subset=self.columns)
        except Exception as e:
            print(f"Error dropping duplicates: {e}")

    def apply_preprocessing(self, lowercase=False, diacritics_removal=False, punctuation_removal=False):
        """
        Apply preprocessing steps based on user selection.
        The order is: lowercase -> diacritics removal -> punctuation removal -> drop exact duplicates.
        :param lowercase: If True, apply lowercasing to the selected columns.
        :param diacritics_removal: If True, remove diacritics from the selected columns.
        :param punctuation_removal: If True, remove punctuation from the selected columns.
        :return: Preprocessed pandas DataFrame.
        """
        try:
            if punctuation_removal:
                self.remove_punctuation()

            # Ensure non-numeric string columns are processed
            self._ensure_non_numeric_string_columns()

            if lowercase:
                self.lowercase()

            if diacritics_removal:
                self.remove_diacritics()

            # Drop exact duplicates as the mandatory last step
            self.drop_duplicates()

        except Exception as e:
            print(f"Error during preprocessing: {e}")

        return self.processed_data

    def get_processed_data(self):
        """
        Return the preprocessed data.
        :return: Preprocessed pandas DataFrame.
        """
        return self.processed_data
    
    
    def dataframe_to_jsonb(self):
        """
        Convert a DataFrame to a JSONB-compatible string.
        :param dataframe: pandas DataFrame
        :return: JSON string
        """
        # Convert the DataFrame to a JSON string
        json_data = self.processed_data.to_json(orient='records', date_format='iso')
        return json.loads(json_data)


In [5]:
preprocessor = DataPreprocessing(source_data)
columns=['all']
lowercase=True 
diacritics_removal=True
punctuation_removal=True
preprocessor.select_columns(columns=columns)
preprocessor.apply_preprocessing(lowercase=lowercase, diacritics_removal=diacritics_removal, punctuation_removal=punctuation_removal)
preprocessed_data = preprocessor.get_processed_data()
preprocessed_data
# preprocessor.dataframe_to_jsonb()

Unnamed: 0,name,address,city,cuisine,ID
0,arnie mortons of chicago,435 s la cienega blv,los angeles,american,0
1,arnie mortons of chicago,435 s la cienega blvd,los angeles,steakhouses,1
2,arts delicatessen,12224 ventura blvd,studio city,american,2
3,arts deli,12224 ventura blvd,studio city,delis,3
4,hotel belair,701 stone canyon rd,bel air,californian,4
...,...,...,...,...,...
859,ti couz,3108 16th st,san francisco,french,859
860,trio cafe,1870 fillmore st,san francisco,american,860
861,tu lan,8 sixth st,san francisco,vietnamese,861
862,vicolo pizzeria,201 ivy st,san francisco,pizza,862


# Block building

- Sorted Neighborhood Method (SNM)
- Standard Blocking Method
- SBM with dynamic sliding window


In [6]:
class BlockBuilding:
    def __init__(self, data, method):
        """
        Initialize the BlockBuilder with data and method.
        :param data: pandas DataFrame containing the entity data.
        :param method: The blocking method to use ('sorted_neighborhood', 'dynamic_sorted_neighborhood', or 'standard_blocking').
        """
        self.data = data
        self.method = method
        self.blocks = None
        self.num_blocks = 0
        self.parameters = {}  # To store parameters used for blocking


    def build_blocks(self, columns=None, window_size=None, max_window_size=None, match_threshold=None, n_letters=3, block_index=1):
        """
        Main function to build blocks using the selected method.
        :param columns: List of columns to generate BKVs or SKVs.
        :param window_size: Window size for sorted neighborhood method.
        :param max_window_size: Maximum window size for dynamic sorted neighborhood.
        :param match_threshold: Match threshold for dynamic sorted neighborhood.
        :param n_letters: Number of letters to concatenate for SKVs.
        :param block_index: Index of the block to display (optional).
        :return: A specific block based on block_index.
        """
        if columns is None:
            raise ValueError("You must specify the columns for generating keys.")
        
        
        self.parameters = {  # Store parameters for statistics
            'columns': columns,
            'window_size': window_size,
            'max_window_size': max_window_size,
            'match_threshold': match_threshold,
            'n_letters': n_letters,
        }
        
        if self.method == 'standard_blocking':
            self.standard_blocking(columns)
        elif self.method == 'sorted_neighborhood':
            if window_size is None:
                raise ValueError("Window size must be provided for the sorted neighborhood method.")
            self.sorted_neighborhood(columns, window_size, n_letters)
        elif self.method == 'dynamic_sorted_neighborhood':
            if max_window_size is None or match_threshold is None:
                raise ValueError("Both max_window_size and match_threshold must be provided for the dynamic sorted neighborhood method.")
            self.dynamic_sorted_neighborhood(columns, max_window_size, match_threshold, n_letters)
        else:
            raise ValueError("Invalid method. Use 'standard_blocking', 'sorted_neighborhood', or 'dynamic_sorted_neighborhood'.")

        return self.display_block(block_index)

    def standard_blocking(self, columns):
        """
        Perform standard blocking using Soundex codes for the selected columns.
        :param columns: List of columns to use for generating BKVs.
        """
        self.blocks = self.data.copy()
        
        # Generate Soundex code for each selected column and concatenate them
        self.blocks['BKV'] = self.blocks[columns].apply(
            lambda col: col.map(lambda x: soundex(x) if isinstance(x, str) else '')
        ).agg(' '.join, axis=1)
        
        # Group by BKV and assign block IDs
        self.blocks['block_id'] = self.blocks.groupby('BKV').ngroup() + 1

        # Update the number of blocks
        self.num_blocks = self.blocks['block_id'].nunique()

    def sorted_neighborhood(self, columns, window_size, n_letters):
        """
        Perform sorted neighborhood blocking using concatenated first `n` letters of selected columns as SKVs.
        :param columns: List of columns to use for generating SKVs.
        :param window_size: Size of the sliding window.
        :param n_letters: Number of letters to concatenate for SKVs.
        """
        self.blocks = self.data.copy()

        # Generate the SKV by concatenating the first `n_letters` of each column
        self.blocks['SKV'] = self.blocks[columns].apply(
            lambda col: col.map(lambda x: x[:min(n_letters + 1, len(x))] if isinstance(x, str) else '')
        ).agg(''.join, axis=1)

        # Sort by SKV
        self.blocks = self.blocks.sort_values(by='SKV').reset_index(drop=True)

        # Assign block IDs based on window size
        self.blocks['block_id'] = (self.blocks.index // window_size) + 1

        # Update the number of blocks
        self.num_blocks = self.blocks['block_id'].nunique()

    def dynamic_sorted_neighborhood(self, columns, max_window_size, match_threshold, n_letters):
        """
        Perform dynamic sorted neighborhood blocking using SKVs.
        :param columns: List of columns to use for generating SKVs.
        :param max_window_size: Maximum size of the sliding window.
        :param match_threshold: Match threshold for window expansion.
        :param n_letters: Number of letters to concatenate for SKVs.
        """
        self.blocks = self.data.copy()
    
        # Generate the SKV by concatenating the first `n_letters + 1` of each column
        self.blocks['SKV'] = self.blocks[columns].apply(
            lambda col: col.map(lambda x: x[:min(n_letters + 1, len(x))] if isinstance(x, str) else '')
        ).agg(''.join, axis=1)
        # Sort by SKV
        self.blocks = self.blocks.sort_values(by='SKV').reset_index(drop=True)
    
        # Initialize variables
        block_ids = []
        current_block_id = 1
        window_start = 0
    
        # Iterate over sorted data to assign dynamic block IDs
        while window_start < len(self.blocks):
            # Start with a single row
            window_end = window_start + 1
    
            # Expand window dynamically
            while window_end < len(self.blocks) and (window_end - window_start) < max_window_size:
                # Check similarity between SKVs of current and next record
                similarity = fuzz.ratio(
                    self.blocks['SKV'].iloc[window_start],
                    self.blocks['SKV'].iloc[window_end]
                )
                if similarity >= match_threshold * 100:  # Convert threshold to percentage
                    window_end += 1
                else:
                    break
    
            # Assign the same block ID to all rows in the current window
            block_ids.extend([current_block_id] * (window_end - window_start))
    
            # Move to the next record
            window_start = window_end
            current_block_id += 1
    
        # Assign block IDs back to the dataframe
        self.blocks['block_id'] = block_ids
    
        # Update the number of blocks
        self.num_blocks = current_block_id - 1


    def get_num_blocks(self):
        """
        Return the total number of blocks generated.
        :return: Integer count of blocks.
        """
        return self.num_blocks

    def used_methods(self):
        """
        Return the blocking method used.
        :return: String name of the blocking method.
        """
        return self.method

    def used_parameters(self):
        """
        Return the parameters used for the blocking method.
        :return: Dictionary of parameters.
        """
        return self.parameters

    def display_block(self, block_index=1):
        """
        Display a specific block by block_id.
        :param block_index: The index of the block to display.
        :return: DataFrame containing the specified block.
        """
        if self.blocks is None:
            raise ValueError("No blocks have been generated. Run block building first.")

        return self.blocks[self.blocks['block_id'] == block_index]

    def get_blocks(self):
        """
        Return all generated blocks.
        :return: DataFrame containing all blocks.
        """
        if self.blocks is None:
            raise ValueError("No blocks have been generated. Run block building first.")

        return self.blocks
    
    def dataframe_to_jsonb(self):
        """
        Convert a DataFrame to a JSONB-compatible string.
        :param dataframe: pandas DataFrame
        :return: JSON string
        """
        # Convert the DataFrame to a JSON string
        json_data = self.blocks.to_json(orient='records', date_format='iso')
        return json.loads(json_data)

In [7]:
method='standard_blocking'
columns=['city', 'name']
block_builder = BlockBuilding(preprocessed_data, method=method)
block_builder.build_blocks(columns=columns)
all_blocks = block_builder.get_blocks()
# block_builder.display_block(100)
all_blocks

Unnamed: 0,name,address,city,cuisine,ID,BKV,block_id
0,arnie mortons of chicago,435 s la cienega blv,los angeles,american,0,425 A655,370
1,arnie mortons of chicago,435 s la cienega blvd,los angeles,steakhouses,1,425 A655,370
2,arts delicatessen,12224 ventura blvd,studio city,american,2,233 A632,28
3,arts deli,12224 ventura blvd,studio city,delis,3,233 A632,28
4,hotel belair,701 stone canyon rd,bel air,californian,4,146 H341,16
...,...,...,...,...,...,...,...
859,ti couz,3108 16th st,san francisco,french,859,251 T220,155
860,trio cafe,1870 fillmore st,san francisco,american,860,251 T621,160
861,tu lan,8 sixth st,san francisco,vietnamese,861,251 T450,158
862,vicolo pizzeria,201 ivy st,san francisco,pizza,862,251 V241,162


In [8]:
block_builder = BlockBuilding(preprocessed_data, method='sorted_neighborhood')
block_builder.build_blocks(window_size=20, columns=columns, n_letters=4)
block_builder.get_blocks()
# block_builder.display_block(1)

Unnamed: 0,name,address,city,cuisine,ID,SKV,block_id
0,103 west,103 w paces ferry rd,atlanta,continental,786,atla103 w,1
1,abbey,163 ponce de leon ave,atlanta,international,491,atlaabbey,1
2,abruzzi,2355 peachtree rd ne,atlanta,italian,149,atlaabruz,1
3,abruzzi,2355 peachtree rd peachtree battle shopping ...,atlanta,italian,148,atlaabruz,1
4,alecks barbecue heaven,783 martin luther king jr dr,atlanta,barbecue,492,atlaaleck,1
...,...,...,...,...,...,...,...
853,don antonios,1136 westwood blvd,westwood,italian,661,westdon a,43
854,falafel king,1059 broxton ave,westwood,middle eastern,663,westfalaf,43
855,feast from the east,1949 westwood blvd,west la,chinese,664,westfeast,43
856,john ogroats,10516 w pico blvd,west la,coffee shops,672,westjohn,43


In [9]:
block_builder = BlockBuilding(preprocessed_data, method='dynamic_sorted_neighborhood')
block_builder.build_blocks(max_window_size=20, match_threshold=0.7, columns=columns, n_letters=4)
all_blocks = block_builder.get_blocks()
all_blocks
# block_builder.display_block(1)

Unnamed: 0,name,address,city,cuisine,ID,SKV,block_id
0,103 west,103 w paces ferry rd,atlanta,continental,786,atla103 w,1
1,abbey,163 ponce de leon ave,atlanta,international,491,atlaabbey,2
2,abruzzi,2355 peachtree rd ne,atlanta,italian,149,atlaabruz,2
3,abruzzi,2355 peachtree rd peachtree battle shopping ...,atlanta,italian,148,atlaabruz,2
4,alecks barbecue heaven,783 martin luther king jr dr,atlanta,barbecue,492,atlaaleck,2
...,...,...,...,...,...,...,...
853,don antonios,1136 westwood blvd,westwood,italian,661,westdon a,310
854,falafel king,1059 broxton ave,westwood,middle eastern,663,westfalaf,311
855,feast from the east,1949 westwood blvd,west la,chinese,664,westfeast,311
856,john ogroats,10516 w pico blvd,west la,coffee shops,672,westjohn,312


# Field and Record Comparison:

* Q-gram comparison
* Jaro-Winkler
* Soundex

In [617]:

class Comparison:
    def __init__(self, data):
        """
        Initialize the Comparison class with the data.
        :param data: pandas DataFrame containing the data to be compared.
        """
        self.data = data
        self.comparison_results = None
        self.methods = {}
        self.parameters = {}

    @staticmethod
    def levenshtein_similarity(str1, str2):
        """
        Calculate the normalized Levenshtein similarity between two strings.
        Ensures the result is between 0 and 1.
        Returns 0 if either string is NaN or empty.
        """
        if not str1 or not str2 or pd.isna(str1) or pd.isna(str2):
            return 0
        str1 = str(str1)
        str2 = str(str2)
        from rapidfuzz.distance import Levenshtein
        score = Levenshtein.normalized_similarity(str1, str2)
        return max(0, min(1, score))  # Ensure the value is between 0 and 1

    @staticmethod
    def jaro_winkler_similarity(str1, str2):
        """
        Calculate the normalized Jaro-Winkler similarity between two strings.
        Ensures the result is between 0 and 1.
        Returns 0 if either string is NaN or empty.
        """
        if not str1 or not str2 or pd.isna(str1) or pd.isna(str2):
            return 0
        str1 = str(str1)
        str2 = str(str2)
        from rapidfuzz.distance import JaroWinkler
        score = JaroWinkler.similarity(str1, str2)
        return max(0, min(1, score))  # Ensure the value is between 0 and 1

    @staticmethod
    def qgram_similarity(str1, str2, q=10):
        """
        Calculate the Q-gram similarity between two strings.
        Ensures the result is between 0 and 1.
        Returns 0 if either string is NaN or empty.
        """
        if not str1 or not str2 or pd.isna(str1) or pd.isna(str2):
            return 0

        def generate_qgrams(s, q):
            return [s[i:i + q] for i in range(len(s) - q + 1)]

        str1 = str(str1)
        str2 = str(str2)
        qgrams1 = generate_qgrams(str1, q)
        qgrams2 = generate_qgrams(str2, q)
        matches = sum(1 for q in qgrams1 if q in qgrams2)
        total_qgrams = len(set(qgrams1 + qgrams2))
        score = matches / total_qgrams if total_qgrams > 0 else 0
        return max(0, min(1, score))  # Ensure the value is between 0 and 1

    def compare_within_blocks(self, block_col, column_algorithms):
        """
        Compare all possible pairs within each block for specified columns.
        :param block_col: The column name containing block IDs.
        :param column_algorithms: Dictionary where keys are column names and values are comparison functions.
        :return: DataFrame with comparison results for all pairs in each block.
        """
        if block_col not in self.data.columns:
            raise ValueError(f"Block column '{block_col}' not found in data.")
    
        for col in column_algorithms:
            if col not in self.data.columns:
                raise ValueError(f"Comparison column '{col}' not found in data.")
    
        # Store methods and parameters for statistics
        self.methods = list(column_algorithms.keys())
        self.parameters = {col: func.__name__ for col, func in column_algorithms.items()}
    
        # Store results in a list
        results = []
    
        # Group by block_id
        grouped = self.data.groupby(block_col)
    
        for block_id, group in grouped:
            # Get all possible pairs within the block
            pairs = list(itertools.combinations(group.iterrows(), 2))
    
            for (idx1, row1), (idx2, row2) in pairs:
                result = {
                    "block_id": block_id,
                    "row1": row1["ID"],  # Use the ID column instead of table index
                    "row2": row2["ID"],  # Use the ID column instead of table index
                }
    
                # Apply the specified algorithm to each column
                for col, comparison_func in column_algorithms.items():
                    result[f"{col}_similarity"] = comparison_func(row1[col], row2[col])
    
                results.append(result)
    
        self.comparison_results = pd.DataFrame(results)
        return self.comparison_results.sort_values(by=block_col).reset_index(drop=True)

    def get_comparison_results(self):
        """
        Get the comparison results.
        :return: DataFrame containing the comparison results.
        """
        if self.comparison_results is None:
            raise ValueError("No comparison results available. Run 'compare_within_blocks' first.")
        return self.comparison_results

    def used_methods(self):
        """
        Get the comparison methods used.
        :return: List of column names compared.
        """
        return self.methods

    def used_parameters(self):
        """
        Get the parameters used for the comparison methods.
        :return: Dictionary of methods and their parameters.
        """
        return self.parameters
    
    def dataframe_to_jsonb(self):
        """
        Convert a DataFrame to a JSONB-compatible string.
        :param dataframe: pandas DataFrame
        :return: JSON string
        """
        # Convert the DataFrame to a JSON string
        json_data = self.comparison_results.to_json(orient='records', date_format='iso')
        return json.loads(json_data)


In [144]:
comparison = Comparison(all_blocks)

# name	address	city	cuisine
column_algorithms = {
    "name": comparison.qgram_similarity,
    "address": comparison.jaro_winkler_similarity,
    "city": comparison.jaro_winkler_similarity,
    "cuisine": comparison.qgram_similarity
}

# Compare within blocks
comparison_results = comparison.compare_within_blocks(
    block_col="block_id",
    column_algorithms=column_algorithms
)
comparison_results



ValueError: Comparison column 'name' not found in data.

# Classification

* for now Threshold based only - but it is trivial to 

In [71]:
class Classifier:
    def __init__(self, blocked_data, comparison_table):
        """
        Initialize the MatchClassifier with blocked data and comparison table.
        :param blocked_data: DataFrame containing blocked source data with block_ids and SKVs/BKVs.
        :param comparison_table: DataFrame containing pairwise comparisons with similarities.
        """
        self.blocked_data = blocked_data
        self.comparison_table = comparison_table
        self.classification_results = None  # To store classification results

    def classify_matches(self, method='threshold_based', thresholds=None, weights=None, possible_match=False, costs=None, probabilities=None):
        """
        Classify the results based on the selected method.
        :param method: The classification method to use ('threshold_based', 'weighted', 'cost_based').
        :param thresholds: Dictionary with thresholds for classification.
        :param weights: Dictionary with weights for each similarity column (only for 'weighted' method).
        :param possible_match: Boolean indicating whether to include 'Possible Match' as a category.
        :param costs: Dictionary with costs for cost-based classification.
        :param probabilities: Dictionary with prior probabilities for cost-based classification.
        :return: DataFrame with classifications added.
        """
        
        self.method = method
        self.parameters = {
            'thresholds': thresholds,
            'weights': weights,
            'possible_match': possible_match,
            'costs': costs,
            'probabilities': probabilities,
        }
        
        if method == 'threshold_based':
            if thresholds is None:
                raise ValueError("Thresholds must be provided for threshold-based classification.")
            self.classification_results = self._threshold_based_classification(thresholds, possible_match)
        
        elif method == 'weighted':
            if thresholds is None or weights is None:
                raise ValueError("Both thresholds and weights must be provided for weighted classification.")
            self.classification_results = self._weighted_classification(thresholds, weights)
        
        elif method == 'cost_based':
            if costs is None or probabilities is None:
                raise ValueError("Costs and probabilities must be provided for cost-based classification.")
            self.classification_results = self._cost_based_classification(costs, probabilities)
        
        else:
            raise ValueError(f"Unknown classification method: {method}")

        return self.classification_results

    def _threshold_based_classification(self, thresholds, possible_match):
        merged_data = self._merge_row_details()
        similarity_columns = [col for col in self.comparison_table.columns if col.endswith('_similarity')]
        merged_data['average_similarity'] = merged_data[similarity_columns].mean(axis=1)
        
        merged_data['classification'] = merged_data['average_similarity'].apply(
            lambda similarity: self._classify_by_thresholds(similarity, thresholds, possible_match)
        )
        return merged_data

    def _weighted_classification(self, thresholds, weights):
        merged_data = self._merge_row_details()
        similarity_columns = [col for col in self.comparison_table.columns if col.endswith('_similarity')]

        if not set(weights.keys()).issubset(set(similarity_columns)):
            raise ValueError("All keys in weights must match similarity columns.")

        merged_data['weighted_similarity'] = sum(
            merged_data[col] * weight for col, weight in weights.items()
        )
        
        min_similarity = merged_data['weighted_similarity'].min()
        max_similarity = merged_data['weighted_similarity'].max()
        merged_data['normalized_similarity'] = (
            (merged_data['weighted_similarity'] - min_similarity) /
            (max_similarity - min_similarity)
            if max_similarity > min_similarity else 0
        )
        
        merged_data['classification'] = merged_data['normalized_similarity'].apply(
            lambda similarity: 'Match' if similarity >= thresholds['match'] else 'Non-Match'
        )
        
        return merged_data

    def _cost_based_classification(self, costs, probabilities):
        merged_data = self._merge_row_details()
        similarity_columns = [col for col in self.comparison_table.columns if col.endswith('_similarity')]
        merged_data['average_similarity'] = merged_data[similarity_columns].mean(axis=1)
        
        P_M = probabilities['M']
        P_U = probabilities['U']
        merged_data['cost_non_match'] = (
            costs['non_match_true_match'] * merged_data['average_similarity'] * P_M +
            costs['non_match_true_non_match'] * (1 - merged_data['average_similarity']) * P_U
        )
        merged_data['cost_match'] = (
            costs['match_true_match'] * merged_data['average_similarity'] * P_M +
            costs['match_true_non_match'] * (1 - merged_data['average_similarity']) * P_U
        )
        
        merged_data['classification'] = merged_data.apply(
            lambda row: 'Match' if row['cost_match'] < row['cost_non_match'] else 'Non-Match',
            axis=1
        )
        
        return merged_data

    def _merge_row_details(self):
        """
        Merge row1 and row2 details into the comparison table using the `ID` column.
        :return: Merged DataFrame with row details added.
        """
        # Fetch rows based on 'ID' instead of index
        row1_details = self.blocked_data.set_index('ID').loc[self.comparison_table['row1']].reset_index(drop=True)
        row2_details = self.blocked_data.set_index('ID').loc[self.comparison_table['row2']].reset_index(drop=True)
        
        # Merge the details into the comparison table
        merged_data = self.comparison_table.copy()
        for col in self.blocked_data.columns:
            if col not in ['block_id', 'SKV', 'BKV', 'ID']:  # Exclude metadata columns
                merged_data[f'row1_{col}'] = row1_details[col].values
                merged_data[f'row2_{col}'] = row2_details[col].values
        return merged_data

    def _classify_by_thresholds(self, similarity, thresholds, possible_match):
        if possible_match:
            if similarity < thresholds['not_match']:
                return 'Not Match'
            elif thresholds['not_match'] <= similarity < thresholds['match']:
                return 'Possible Match'
            else:
                return 'Match'
        else:
            if similarity < thresholds['match']:
                return 'Not Match'
            else:
                return 'Match'

    def get_classification_results(self):
        """
        Get the classification results.
        :return: DataFrame with the classification results.
        """
        if self.classification_results is None:
            raise ValueError("No classification results available. Run 'classify_matches' first.")
        return self.classification_results

    def used_methods(self):
        """
        Get the classification method used.
        :return: String representing the classification method.
        """
        return self.method

    def used_parameters(self):
        """
        Get the parameters used for the classification method.
        :return: Dictionary of parameters.
        """
        return self.parameters
    
    def dataframe_to_jsonb(self):
        """
        Convert a DataFrame to a JSONB-compatible string.
        :param dataframe: pandas DataFrame
        :return: JSON string
        """
        # Convert the DataFrame to a JSON string
        json_data = self.classification_results.to_json(orient='records', date_format='iso')
        return json.loads(json_data)


In [72]:
classifier = Classifier(all_blocks, comparison_results)
method = 'threshold_based'
possible_match = False
# thresholds = {'not_match': 0.6, 'match': 0.75}
thresholds = {'match': 0.8}
weights = {'name_similarity': 1, 'address_similarity': 1, 'city_similarity': 0.5, 'cuisine_similarity': 0.1}
costs = {
    'non_match_true_match': 5,  # Cost of classifying a true match as a non-match
    'non_match_true_non_match': 1,  # Cost of classifying a true non-match as a non-match
    'match_true_match': 1,  # Cost of classifying a true match as a match
    'match_true_non_match': 10  # Cost of classifying a true non-match as a match
}
probabilities = {'M': 0.3, 'U': 0.7}

# classified_results = classifier.classify_matches(
#     method='cost_based',
#     costs=costs,
#     probabilities=probabilities
# )

# not_match = 0.4
classified_results = classifier.classify_matches(
    method='weighted',
    thresholds=thresholds,
    possible_match=possible_match,
    weights=weights
)

classified_results[classified_results['classification'] == 'Match']

# classifier.used_parameters()


Unnamed: 0,block_id,row1,row2,name_similarity,address_similarity,city_similarity,cuisine_similarity,row1_name,row2_name,row1_address,row2_address,row1_city,row2_city,row1_cuisine,row2_cuisine,weighted_similarity,normalized_similarity,classification
3,1,161,160,1.0,1.000000,1.000000,0.066667,delectables,delectables,1 margaret mitchell sq,1 margaret mitchell sq,atlanta,atlanta,cafeterias,american,2.506667,0.959541,Match
6,2,192,193,1.0,1.000000,1.000000,0.636364,boulevard,boulevard,1 mission st,1 mission st,san francisco,san francisco,american,american new,2.563636,0.990972,Match
7,5,55,54,1.0,1.000000,0.928571,0.000000,cafe des artistes,cafe des artistes,1 w 67th st,1 w 67th st,new york city,new york,french classic,continental,2.464286,0.936159,Match
12,5,112,113,1.0,0.875862,1.000000,0.636364,river cafe,river cafe,1 water st at the east river,1 water st,brooklyn,brooklyn,american,american new,2.439498,0.922484,Match
31,37,118,119,1.0,1.000000,0.928571,0.000000,seryna,seryna,11 e 53rd st,11 e 53rd st,new york,new york city,asian,japanese,2.464286,0.936159,Match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378,493,194,195,1.0,0.966667,1.000000,0.416667,cafe claude,cafe claude,7 claude la,7 claude ln,san francisco,san francisco,french,french bistro,2.508333,0.960461,Match
387,512,201,200,1.0,1.000000,1.000000,0.555556,fleur de lys,fleur de lys,777 sutter st,777 sutter st,san francisco,san francisco,french new,french,2.555556,0.986514,Match
389,520,199,198,1.0,0.957895,1.000000,0.000000,chez michel,chez michel,804 north point st,804 northpoint,san francisco,san francisco,californian,french,2.457895,0.932633,Match
398,554,22,23,1.0,1.000000,0.544444,0.384615,lorangerie,lorangerie,903 n la cienega blvd,903 n la cienega blvd,los angeles,w hollywood,french,french classic,2.310684,0.851416,Match


# Evaluation


In [73]:
import pandas as pd
import json


class Evaluation:
    def __init__(self, source_data, classified_data):
        """
        Initialize the Evaluation class with the source data and classified data.
        :param source_data: DataFrame containing the original source data.
        :param classified_data: DataFrame containing classified match results.
        """
        self.json_data = None
        self.source_data = source_data
        self.classified_data = classified_data
        self.evaluated_data = None
        self.matches = None
        self.statistics = None
        self.used_parameters = None

    def show_matches_side_by_side(self):
        """
        Show all rows classified as 'Match' in an unflattened format,
        with an additional 'dropped' column ('NO' for row1 and 'YES' for row2).
        :return: DataFrame with matched rows from source_data and 'dropped' column.
        """
        matches = self.classified_data[self.classified_data['classification'] == 'Match']
        rows = []
        dedup_id = 1

        for _, row in matches.iterrows():
            row1 = self.source_data.iloc[int(row['row1'])].copy()
            row2 = self.source_data.iloc[int(row['row2'])].copy()

            row1['dropped'] = 'NO'
            row2['dropped'] = 'YES'
            row1['dedup_id'] = dedup_id
            row2['dedup_id'] = dedup_id

            dedup_id += 1
            rows.append(row1)
            rows.append(row2)

        result_df = pd.DataFrame(rows)
        columns = ['dedup_id'] + [col for col in result_df.columns if col != 'dedup_id']
        self.matches = result_df[columns]
        return self.matches

    def get_deduplicated_data(self):
        """
        Get deduplicated data by dropping all rows from source_data whose indexes are in row_2 of classified_data.
        :return: Deduplicated DataFrame.
        """
        to_drop = self.classified_data[self.classified_data['classification'] == 'Match']['row2'].unique()
        self.evaluated_data = self.source_data.drop(index=to_drop).reset_index(drop=True)
        return self.evaluated_data

    def get_statistics(self):
        """
        Get statistics about the deduplication process.
        :return: DataFrame with statistics.
        """
        row_count_before = int(self.source_data.shape[0])
        deduplicated_data = self.get_deduplicated_data()
        row_count_after = int(deduplicated_data.shape[0])
        num_duplicates = row_count_before - row_count_after
        duplicate_percentage = (num_duplicates / row_count_before) * 100

        stats = {
            'Detected duplicates': num_duplicates,
            'Row count before deduplication': row_count_before,
            'Row count after deduplication': row_count_after,
            'Duplicate percentage': round(duplicate_percentage, 2),
        }

        self.statistics = pd.DataFrame([stats])
        return self.statistics

    @staticmethod
    def used_methods_parameters(self, *workflow_objects):
        """
        Collect statistics about all workflow steps into a table.
        :param workflow_objects: Instances of classes (e.g., BlockBuilding, Comparison, Classifier).
        :return: DataFrame summarizing methods and parameters for each step.
        """
        stats = []
        for obj in workflow_objects:
            stats.append({
                'Step': obj.__class__.__name__,
                'Method': obj.used_methods(),
                'Parameters': obj.used_parameters(),
            })
        self.used_parameters = pd.DataFrame(stats)
        return self.used_parameters

    def dataframes_to_jsonb(self):
        """
        Convert multiple DataFrames to a JSONB-compatible string.
        :return: JSON-compatible string containing all DataFrames.
        """
        dataframes_dict = {
            'evaluated_data': self.evaluated_data,
            'matches': self.matches,
            'statistics': self.statistics,
        }

        json_data = {}
        for keyword, dataframe in dataframes_dict.items():
            if dataframe is not None:
                json_data[keyword] = json.loads(dataframe.to_json(orient='records', date_format='iso'))

        self.json_data = json.dumps(json_data, indent=4)
        return self.json_data

    def retrieve_dataframe_from_jsonb(self, keyword):
        """
        Retrieve a specific DataFrame from a JSONB-compatible string.
        :param keyword: The keyword of the desired DataFrame.
        :return: The reconstructed pandas DataFrame.
        """
        if not self.json_data:
            raise ValueError("No JSON data available. Call `dataframes_to_jsonb()` first.")

        json_dict = json.loads(self.json_data)
        if keyword not in json_dict:
            raise KeyError(f"Keyword '{keyword}' not found in the JSON data.")

        return pd.DataFrame(json_dict[keyword])


In [74]:
evaluation = Evaluation(source_data, classified_results)
matches_side_by_side = evaluation.show_matches_side_by_side()
matches_side_by_side

Unnamed: 0,dedup_id,name,address,city,cuisine,ID,dropped
161,1,delectables,1 margaret mitchell sq.,atlanta,cafeterias,161,NO
160,1,delectables,1 margaret mitchell sq.,atlanta,american,160,YES
192,2,boulevard,1 mission st.,san francisco,american,192,NO
193,2,boulevard,1 mission st.,san francisco,american (new),193,YES
55,3,cafe des artistes,1 w. 67th st.,new york city,french (classic),55,NO
...,...,...,...,...,...,...,...
198,67,chez michel,804 northpoint,san francisco,french,198,YES
22,68,l'orangerie,903 n. la cienega blvd.,los angeles,french,22,NO
23,68,l'orangerie,903 n. la cienega blvd.,w. hollywood,french (classic),23,YES
109,69,pisces,95 ave. a,new york city,seafood,109,NO


In [75]:
deduplicated_data = evaluation.get_deduplicated_data()
deduplicated_data

Unnamed: 0,name,address,city,cuisine,ID
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,american,0
1,art's delicatessen,12224 ventura blvd.,studio city,american,2
2,art's deli,12224 ventura blvd.,studio city,delis,3
3,hotel bel-air,701 stone canyon rd.,bel air,californian,4
4,bel-air hotel,701 stone canyon rd.,bel air,californian,5
...,...,...,...,...,...
790,ti couz,3108 16th st.,san francisco,french,859
791,trio cafe,1870 fillmore st.,san francisco,american,860
792,tu lan,8 sixth st.,san francisco,vietnamese,861
793,vicolo pizzeria,201 ivy st.,san francisco,pizza,862


In [76]:
statistics = evaluation.get_statistics()
statistics

Unnamed: 0,Detected duplicates,Row count before deduplication,Row count after deduplication,Duplicate percentage
0,69,864,795,7.99


In [77]:
evaluation.dataframes_to_jsonb()
evaluation.retrieve_dataframe_from_jsonb('statistics')
evaluation.retrieve_dataframe_from_jsonb('evaluated_data')
evaluation.retrieve_dataframe_from_jsonb('matches')

Unnamed: 0,dedup_id,name,address,city,cuisine,ID,dropped
0,1,delectables,1 margaret mitchell sq.,atlanta,cafeterias,161,NO
1,1,delectables,1 margaret mitchell sq.,atlanta,american,160,YES
2,2,boulevard,1 mission st.,san francisco,american,192,NO
3,2,boulevard,1 mission st.,san francisco,american (new),193,YES
4,3,cafe des artistes,1 w. 67th st.,new york city,french (classic),55,NO
...,...,...,...,...,...,...,...
133,67,chez michel,804 northpoint,san francisco,french,198,YES
134,68,l'orangerie,903 n. la cienega blvd.,los angeles,french,22,NO
135,68,l'orangerie,903 n. la cienega blvd.,w. hollywood,french (classic),23,YES
136,69,pisces,95 ave. a,new york city,seafood,109,NO


# Restaurants

In [78]:
path = "data/restaurant-nophone.csv"
rd = ReadData(path)
source_data = rd.read_data()
source_data.head(10)

Unnamed: 0,name,address,city,cuisine,ID
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,american,0
1,arnie morton's of chicago,435 s. la cienega blvd.,los angeles,steakhouses,1
2,art's delicatessen,12224 ventura blvd.,studio city,american,2
3,art's deli,12224 ventura blvd.,studio city,delis,3
4,hotel bel-air,701 stone canyon rd.,bel air,californian,4
5,bel-air hotel,701 stone canyon rd.,bel air,californian,5
6,cafe bizou,14016 ventura blvd.,sherman oaks,french,6
7,cafe bizou,14016 ventura blvd.,sherman oaks,french bistro,7
8,campanile,624 s. la brea ave.,los angeles,american,8
9,campanile,624 s. la brea ave.,los angeles,californian,9


In [79]:
preprocessor = DataPreprocessing(source_data)
columns=['all']
lowercase=True 
diacritics_removal=True
punctuation_removal=True
preprocessor.select_columns(columns=columns)
preprocessor.apply_preprocessing(lowercase=lowercase, diacritics_removal=diacritics_removal, punctuation_removal=punctuation_removal)
preprocessed_data = preprocessor.get_processed_data()
preprocessed_data.head(10)

Unnamed: 0,name,address,city,cuisine,ID
0,arnie mortons of chicago,435 s la cienega blv,los angeles,american,0
1,arnie mortons of chicago,435 s la cienega blvd,los angeles,steakhouses,1
2,arts delicatessen,12224 ventura blvd,studio city,american,2
3,arts deli,12224 ventura blvd,studio city,delis,3
4,hotel belair,701 stone canyon rd,bel air,californian,4
5,belair hotel,701 stone canyon rd,bel air,californian,5
6,cafe bizou,14016 ventura blvd,sherman oaks,french,6
7,cafe bizou,14016 ventura blvd,sherman oaks,french bistro,7
8,campanile,624 s la brea ave,los angeles,american,8
9,campanile,624 s la brea ave,los angeles,californian,9


In [80]:
# Standard blocking
method='standard_blocking'
columns=['address', 'name']
# block_builder = BlockBuilding(preprocessed_data, method=method)
# block_builder.build_blocks(columns=columns)

# SNM
# block_builder = BlockBuilding(preprocessed_data, method='sorted_neighborhood')
# block_builder.build_blocks(window_size=20, columns=columns, n_letters=4)

#DSNM
block_builder = BlockBuilding(preprocessed_data, method='dynamic_sorted_neighborhood')
block_builder.build_blocks(max_window_size=5, match_threshold=0.6, columns=columns, n_letters=4)

all_blocks = block_builder.get_blocks()
all_blocks.head(10)

Unnamed: 0,name,address,city,cuisine,ID,SKV,block_id
0,center stage plaza hotel,1 main st,las vegas,american,548,1 macente,1
1,delectables,1 margaret mitchell sq,atlanta,cafeterias,161,1 madelec,1
2,delectables,1 margaret mitchell sq,atlanta,american,160,1 madelec,1
3,one market,1 market st,san francisco,american,612,1 maone m,1
4,boulevard,1 mission st,san francisco,american,192,1 miboule,2
5,boulevard,1 mission st,san francisco,american new,193,1 miboule,2
6,rumpus,1 tillman pl,san francisco,american,621,1 tirumpu,3
7,ambassador grill,1 united nations plaza at 44th st,new york,american,291,1 unambas,4
8,cafe des artistes,1 w 67th st,new york city,french classic,55,1 w cafe,5
9,cafe des artistes,1 w 67th st,new york,continental,54,1 w cafe,5


In [81]:
comparison = Comparison(all_blocks)

# name	address	city	cuisine
column_algorithms = {
    "name": comparison.jaro_winkler_similarity,
    "address": comparison.qgram_similarity,
    "city": comparison.jaro_winkler_similarity,
    "cuisine": comparison.qgram_similarity
}

# Compare within blocks
comparison_results = comparison.compare_within_blocks(
    block_col="block_id",
    column_algorithms=column_algorithms
)
comparison_results.head(10)

Unnamed: 0,block_id,row1,row2,name_similarity,address_similarity,city_similarity,cuisine_similarity
0,1,548,161,0.561869,0.115385,0.55,0.066667
1,1,548,160,0.561869,0.115385,0.55,1.0
2,1,548,612,0.521032,0.285714,0.620635,1.0
3,1,161,160,1.0,1.0,1.0,0.066667
4,1,161,612,0.504545,0.192308,0.511905,0.066667
5,1,160,612,0.504545,0.192308,0.511905,1.0
6,2,192,193,1.0,1.0,1.0,0.636364
7,5,55,54,1.0,1.0,0.928571,0.0
8,5,55,112,0.561064,0.090909,0.570899,0.0
9,5,55,113,0.561064,0.1875,0.570899,0.0


In [117]:
classifier = Classifier(all_blocks, comparison_results)

# Weighted
# method = 'threshold_based'
# possible_match = False
# # thresholds = {'not_match': 0.6, 'match': 0.75}
thresholds = {'match': 0.7}
weights = {'name_similarity': 1, 'address_similarity': 0.5, 'city_similarity': 0.2, 'cuisine_similarity': 0.1}
classified_results = classifier.classify_matches(
    method='weighted',
    thresholds=thresholds,
    possible_match=possible_match,
    weights=weights
)

# Cost-based
# costs = {
#     'non_match_true_match': 10,  # Cost of classifying a true match as a non-match
#     'non_match_true_non_match': 1,  # Cost of classifying a true non-match as a non-match
#     'match_true_match': 1,  # Cost of classifying a true match as a match
#     'match_true_non_match': 4  # Cost of classifying a true non-match as a match
# }
# probabilities = {'M': 0.4, 'U': 0.6}
#
# classified_results = classifier.classify_matches(
#     method='cost_based',
#     costs=costs,
#     probabilities=probabilities
# )


classified_results


Unnamed: 0,block_id,row1,row2,name_similarity,address_similarity,city_similarity,cuisine_similarity,row1_name,row2_name,row1_address,row2_address,row1_city,row2_city,row1_cuisine,row2_cuisine,weighted_similarity,normalized_similarity,classification
0,1,548,161,0.561869,0.115385,0.550000,0.066667,center stage plaza hotel,delectables,1 main st,1 margaret mitchell sq,las vegas,atlanta,american,cafeterias,0.736228,0.306031,Non-Match
1,1,548,160,0.561869,0.115385,0.550000,1.000000,center stage plaza hotel,delectables,1 main st,1 margaret mitchell sq,las vegas,atlanta,american,american,0.829561,0.367747,Non-Match
2,1,548,612,0.521032,0.285714,0.620635,1.000000,center stage plaza hotel,one market,1 main st,1 market st,las vegas,san francisco,american,american,0.888016,0.406400,Non-Match
3,1,161,160,1.000000,1.000000,1.000000,0.066667,delectables,delectables,1 margaret mitchell sq,1 margaret mitchell sq,atlanta,atlanta,cafeterias,american,1.706667,0.947730,Match
4,1,161,612,0.504545,0.192308,0.511905,0.066667,delectables,one market,1 margaret mitchell sq,1 market st,atlanta,san francisco,cafeterias,american,0.709747,0.288520,Non-Match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,561,109,108,1.000000,0.411765,0.928571,1.000000,pisces,pisces,95 ave a,95 ave a at 6th st,new york city,new york,seafood,seafood,1.491597,0.805516,Match
401,561,108,469,0.555556,0.423077,1.000000,0.000000,pisces,tapika,95 ave a at 6th st,950 8th ave at 56th st,new york,new york,seafood,american,0.967094,0.458690,Non-Match
402,564,18,19,0.911111,1.000000,0.599206,0.368421,grill on the alley,grill the,9560 dayton way,9560 dayton way,los angeles,beverly hills,american,american traditional,1.567794,0.855901,Match
403,570,68,69,0.904348,1.000000,0.928571,0.636364,four seasons grill room,four seasons,99 e 52nd st,99 e 52nd st,new york,new york city,american,american new,1.653698,0.912705,Match


In [111]:
evaluation = Evaluation(source_data, classified_results)
matches_side_by_side = evaluation.show_matches_side_by_side()
matches_side_by_side

Unnamed: 0,dedup_id,name,address,city,cuisine,ID,dropped
161,1,delectables,1 margaret mitchell sq.,atlanta,cafeterias,161,NO
160,1,delectables,1 margaret mitchell sq.,atlanta,american,160,YES
192,2,boulevard,1 mission st.,san francisco,american,192,NO
193,2,boulevard,1 mission st.,san francisco,american (new),193,YES
55,3,cafe des artistes,1 w. 67th st.,new york city,french (classic),55,NO
...,...,...,...,...,...,...,...
108,94,pisces,95 ave. a at 6th st.,new york,seafood,108,YES
18,95,grill on the alley,9560 dayton way,los angeles,american,18,NO
19,95,grill the,9560 dayton way,beverly hills,american (traditional),19,YES
68,96,four seasons grill room,99 e. 52nd st.,new york,american,68,NO


In [118]:
evaluation.dataframes_to_jsonb()
evaluation.retrieve_dataframe_from_jsonb('evaluated_data')

Unnamed: 0,name,address,city,cuisine,ID
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,american,0
1,art's deli,12224 ventura blvd.,studio city,delis,3
2,hotel bel-air,701 stone canyon rd.,bel air,californian,4
3,bel-air hotel,701 stone canyon rd.,bel air,californian,5
4,cafe bizou,14016 ventura blvd.,sherman oaks,french,6
...,...,...,...,...,...
765,ti couz,3108 16th st.,san francisco,french,859
766,trio cafe,1870 fillmore st.,san francisco,american,860
767,tu lan,8 sixth st.,san francisco,vietnamese,861
768,vicolo pizzeria,201 ivy st.,san francisco,pizza,862


In [115]:
filtered_df = matches_side_by_side[matches_side_by_side['dropped'] == 'YES'].drop(columns=['dropped', 'dedup_id', 'ID'])
filtered_df.to_csv('data/detected_dups/found_dups.csv', index=False)

In [112]:
statistics = evaluation.get_statistics()
statistics

Unnamed: 0,Detected duplicates,Row count before deduplication,Row count after deduplication,Duplicate percentage
0,94,864,770,10.88


# CENSUS

In [400]:
path = "data/census.csv"
rd = ReadData(path)
# source_data = pd.read_csv('data/census.csv')
# source_data.to_csv('data/census.csv', index=False)
# source_data = source_data.drop(columns='id')
source_data = rd.read_data()
source_data

Unnamed: 0,first_name,last_name,middle_name,street_address,zip_code,ID
0,,ANDERSON,,BASSWOOD,4848.0,0
1,,ANDERSON,,BASSWOOD,4848.0,1
2,,ANDERSON,,BASSWOOD,4848.0,2
3,,ANDERSON,,BASSWOOD,4848.0,3
4,CLARA,AQUENDO,J,STARKEY,666.0,4
...,...,...,...,...,...,...
836,SHERRY,WILLIAM,V,WOODHAVEN,510.0,836
837,BRYAN,WRIGHT,,WOODHAVEN,307.0,837
838,MAXINE,WRIGHT,H,WOODHAVEN,307.0,838
839,CHANSE,YATES,E,WOODHAVEN,403.0,839


In [401]:
preprocessor = DataPreprocessing(source_data)
columns=['all']
lowercase=True
diacritics_removal=True
punctuation_removal=True
preprocessor.select_columns(columns=columns)
preprocessor.apply_preprocessing(lowercase=lowercase, diacritics_removal=diacritics_removal, punctuation_removal=punctuation_removal)
preprocessed_data = preprocessor.get_processed_data()
preprocessed_data.head(10)

Unnamed: 0,first_name,last_name,middle_name,street_address,zip_code,ID
0,,anderson,,basswood,4848.0,0
4,clara,aquendo,j,starkey,666.0,4
5,learonad,benitez,a,starkey,660.0,5
6,samuel,benitez,r,starkey,660.0,6
7,delores,benitez,r,starkey,660.0,7
8,jamel,bodner,c,bank,102.0,8
9,rosetta,bodner,l,bank,102.0,9
10,madge,bodner,r,bank,102.0,10
11,rosalind,bodner,n,bank,102.0,11
12,carolyn,bodner,l,bank,102.0,12


In [579]:
# Standard blocking
method='standard_blocking'
columns=['last_name', 'street_address']
# block_builder = BlockBuilding(preprocessed_data, method=method)
# block_builder.build_blocks(columns=columns)

# SNM
# block_builder = BlockBuilding(preprocessed_data, method='sorted_neighborhood')
# block_builder.build_blocks(window_size=20, columns=columns, n_letters=4)

# ## DSNM
block_builder = BlockBuilding(preprocessed_data, method='dynamic_sorted_neighborhood')
block_builder.build_blocks(max_window_size=5, match_threshold=0.9, columns=columns, n_letters=4)

all_blocks = block_builder.get_blocks()
all_blocks

Unnamed: 0,first_name,last_name,middle_name,street_address,zip_code,ID,SKV,block_id
0,,1rumor,a,hwy 54,3820.0,179,1rumohwy 5,1
1,zakiya,1rumor,a,hwy 54,3820.0,178,1rumohwy 5,1
2,terran,1rumor,l,hwy 54,3820.0,177,1rumohwy 5,1
3,bernadina,1rumor,,hwy 54,3820.0,176,1rumohwy 5,1
4,diwaldo,1rumor,a,hwy 54,3820.0,175,1rumohwy 5,1
...,...,...,...,...,...,...,...,...
815,bryan,wright,,woodhaven,307.0,837,wrighwoodh,306
816,alfredica,yates,s,woodhaven,403.0,840,yateswoodh,307
817,chanse,yates,e,woodhaven,403.0,839,yateswoodh,307
818,alfredrick,yetes,s,woodhaven,403.0,448,yeteswoodh,307


In [580]:
comparison = Comparison(all_blocks)

# name	address	city	cuisine
column_algorithms = {
    "first_name": comparison.jaro_winkler_similarity,
    "last_name": comparison.jaro_winkler_similarity,
    "middle_name": comparison.qgram_similarity,
    "street_address": comparison.qgram_similarity,
    "zip_code": comparison.qgram_similarity,
}

# Compare within blocks
comparison_results = comparison.compare_within_blocks(
    block_col="block_id",
    column_algorithms=column_algorithms
)
comparison_results.head(10)

Unnamed: 0,block_id,row1,row2,first_name_similarity,last_name_similarity,middle_name_similarity,street_address_similarity,zip_code_similarity
0,1,179,178,0.0,1.0,0,1.0,1.0
1,1,179,177,0.0,1.0,0,1.0,1.0
2,1,179,176,0.0,1.0,0,1.0,1.0
3,1,179,175,0.0,1.0,0,1.0,1.0
4,1,178,177,0.444444,1.0,0,1.0,1.0
5,1,178,176,0.611111,1.0,0,1.0,1.0
6,1,178,175,0.373016,1.0,0,1.0,1.0
7,1,177,176,0.62037,1.0,0,1.0,1.0
8,1,177,175,0.436508,1.0,0,1.0,1.0
9,1,176,175,0.502646,1.0,0,1.0,1.0


In [581]:
classifier = Classifier(all_blocks, comparison_results)

# Weighted
# method = 'threshold_based'
# possible_match = False
# thresholds = {'not_match': 0.6, 'match': 0.75}
thresholds = {'match': 0.85}
weights = {'first_name_similarity': 1,
           'last_name_similarity': 0.5,
           'middle_name_similarity': 0.8,
           'street_address_similarity': 0.3,
           'zip_code_similarity': 0.8}
classified_results = classifier.classify_matches(
    method='weighted',
    thresholds=thresholds,
    possible_match=possible_match,
    weights=weights
)

# Cost-based
# costs = {
#     'non_match_true_match': 4,  # Cost of classifying a true match as a non-match
#     'non_match_true_non_match': 1,  # Cost of classifying a true non-match as a non-match
#     'match_true_match': 1,  # Cost of classifying a true match as a match
#     'match_true_non_match': 5  # Cost of classifying a true non-match as a match
# }
# probabilities = {'M': 0.3, 'U': 0.7}
# #
# classified_results = classifier.classify_matches(
#     method='cost_based',
#     costs=costs,
#     probabilities=probabilities
# )


classified_results

Unnamed: 0,block_id,row1,row2,first_name_similarity,last_name_similarity,middle_name_similarity,street_address_similarity,zip_code_similarity,row1_first_name,row2_first_name,...,row2_last_name,row1_middle_name,row2_middle_name,row1_street_address,row2_street_address,row1_zip_code,row2_zip_code,weighted_similarity,normalized_similarity,classification
0,1,179,178,0.000000,1.00,0,1.0,1.0,,zakiya,...,1rumor,a,a,hwy 54,hwy 54,3820.0,3820.0,1.600000,0.480625,Non-Match
1,1,179,177,0.000000,1.00,0,1.0,1.0,,terran,...,1rumor,a,l,hwy 54,hwy 54,3820.0,3820.0,1.600000,0.480625,Non-Match
2,1,179,176,0.000000,1.00,0,1.0,1.0,,bernadina,...,1rumor,a,,hwy 54,hwy 54,3820.0,3820.0,1.600000,0.480625,Non-Match
3,1,179,175,0.000000,1.00,0,1.0,1.0,,diwaldo,...,1rumor,a,a,hwy 54,hwy 54,3820.0,3820.0,1.600000,0.480625,Non-Match
4,1,178,177,0.444444,1.00,0,1.0,1.0,zakiya,terran,...,1rumor,a,l,hwy 54,hwy 54,3820.0,3820.0,2.044444,0.720937,Non-Match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1010,307,839,448,0.511111,0.88,0,1.0,1.0,chanse,alfredrick,...,yetes,e,s,woodhaven,woodhaven,403.0,403.0,2.051111,0.724542,Non-Match
1011,307,839,447,0.933333,0.88,0,1.0,1.0,chanse,chance,...,yetes,e,e,woodhaven,woodhaven,403.0,403.0,2.473333,0.952839,Match
1012,307,448,447,0.488889,1.00,0,1.0,1.0,alfredrick,chance,...,yetes,s,e,woodhaven,woodhaven,403.0,403.0,2.088889,0.744968,Non-Match
1013,307,840,448,0.937778,0.88,0,1.0,1.0,alfredica,alfredrick,...,yetes,s,s,woodhaven,woodhaven,403.0,403.0,2.477778,0.955242,Match


In [582]:
evaluation = Evaluation(source_data, classified_results)
evaluation.show_matches_side_by_side().to_csv('data/detected_dups/census_found_dups.csv', index=False)
evaluation.show_matches_side_by_side()

Unnamed: 0,dedup_id,first_name,last_name,middle_name,street_address,zip_code,ID,dropped
449,1,MARITZA,AHREWS,A,71ST,345.0,449,NO
450,1,MARIANNA,AHREWS,B,71ST,345.0,450,YES
181,2,MICHEAL,ALSTIN,,OCONEE,612.0,181,NO
560,2,MICHAEL,ALSTON,,OCONEE,612.0,560,YES
180,3,LEO,ALSTIN,E,OCONEE,612.0,180,NO
...,...,...,...,...,...,...,...,...
835,121,CHARLEY,WILLIAM,A,WOODHAVEN,510.0,835,YES
839,122,CHANSE,YATES,E,WOODHAVEN,403.0,839,NO
447,122,CHANCE,YETES,E,WOODHAVEN,403.0,447,YES
840,123,ALFREDICA,YATES,S,WOODHAVEN,403.0,840,NO


In [583]:

evaluation.get_deduplicated_data()

Unnamed: 0,first_name,last_name,middle_name,street_address,zip_code,ID
0,,ANDERSON,,BASSWOOD,4848.0,0
1,,ANDERSON,,BASSWOOD,4848.0,1
2,,ANDERSON,,BASSWOOD,4848.0,2
3,,ANDERSON,,BASSWOOD,4848.0,3
4,CLARA,AQUENDO,J,STARKEY,666.0,4
...,...,...,...,...,...,...
720,HARRIET,WILKS,L,EL DORADO,103.0,834
721,BRYAN,WRIGHT,,WOODHAVEN,307.0,837
722,MAXINE,WRIGHT,H,WOODHAVEN,307.0,838
723,CHANSE,YATES,E,WOODHAVEN,403.0,839


In [584]:
evaluation.get_statistics()


Unnamed: 0,Detected duplicates,Row count before deduplication,Row count after deduplication,Duplicate percentage
0,116,841,725,13.79


# CDDB

In [601]:
path = "data/cddb.csv"
rd = ReadData(path)
# source_data = pd.read_csv('data/census.csv')
# source_data.to_csv('data/census.csv', index=False)
# source_data = source_data.drop(columns='id')
source_data = rd.read_data().drop(columns='id')
source_data

Unnamed: 0,artist,category,genre,title,tracks,year,ID
0,Backstreet Boys,blues,Pop,Millennium,Larger Than Life|I Want It That Way|Show Me Th...,,0
1,Various,data,,Frankfurt Trance Vol. 04 cd1,DJ Tom Stevens VS. Fridge - Outface 2000 (Radi...,,1
2,NO RETURN,data,Data,Self Mutilation,Do or Die|Truth and Reality|Lost|Soul Extracto...,,2
3,Ã¤Â¸Â­Ã¦?â€˜Ã©â€ºâ€¦Ã¤Â¿Å,data,Pop,Ã¦Æ’Â³Ã£?â€žÃ¥â€¡ÂºÃ£?Â®Ã£?â€¹Ã£?â€˜Ã£â€šâ€°,Ã§â€ºâ€ Ã¥Â¸Â°Ã£â€šÅ |Ã£?â€žÃ£?Â¤Ã£?â€¹Ã¨Â¡â€”...,1989,3
4,Emanuel,data,Data,Felicidade,Felicidade quando o telefone toca|Vem bailar o...,1998,4
...,...,...,...,...,...,...,...
9758,Various Artists - Notting Hill,blues,Soundtrack,Notting Hill,Another Level / From The Heart|Ronan Keating /...,1999,9758
9759,Lenine,blues,Samba,Na Pressao,Soul Brasileiro|Na Pressao|Pacienca|Meuamanha ...,,9759
9760,Ben Harper,blues,Blues,The will to live,Faded|Homeless Child|Number Three|Roses From M...,1997,9760
9761,Alvin Lee,blues,Blues,I Hear You Rockin',Keep On Rockin'|Long Legs|I Hear You Knockin'|...,1993,9761


In [602]:
preprocessor = DataPreprocessing(source_data)
columns=['all']
lowercase=True
diacritics_removal=True
punctuation_removal=True
preprocessor.select_columns(columns=columns)
preprocessor.apply_preprocessing(lowercase=lowercase, diacritics_removal=diacritics_removal, punctuation_removal=punctuation_removal)
preprocessed_data = preprocessor.get_processed_data()
preprocessed_data

Unnamed: 0,artist,category,genre,title,tracks,year,ID
0,backstreet boys,blues,pop,millennium,larger than lifei want it that wayshow me the ...,,0
1,various,data,,frankfurt trance vol 04 cd1,dj tom stevens vs fridge outface 2000 radio m...,,1
2,no return,data,data,self mutilation,do or dietruth and realitylostsoul extractorsa...,,2
3,a¤a a­a¦a€ a©a€oa€¦a¤a¿a,data,pop,a¦æ’a3a£a€za¥a€¡aoa£a®a£a€1a£a€ a£a€sa€°,a§a€oa€ a¥a a°a£a€sa a£a€za£a¤a£a€1a a¡a€”a£a§...,1989,3
4,emanuel,data,data,felicidade,felicidade quando o telefone tocavem bailar o ...,1998,4
...,...,...,...,...,...,...,...
9758,various artists notting hill,blues,soundtrack,notting hill,another level from the heartronan keating wh...,1999,9758
9759,lenine,blues,samba,na pressao,soul brasileirona pressaopaciencameuamanha int...,,9759
9760,ben harper,blues,blues,the will to live,fadedhomeless childnumber threeroses from my f...,1997,9760
9761,alvin lee,blues,blues,i hear you rockin,keep on rockinlong legsi hear you knockinaintt...,1993,9761


In [610]:
# Standard blocking
method='standard_blocking'
columns=['genre', 'year']
block_builder = BlockBuilding(preprocessed_data, method=method)
block_builder.build_blocks(columns=columns)

# SNM
# block_builder = BlockBuilding(preprocessed_data, method='sorted_neighborhood')
# block_builder.build_blocks(window_size=20, columns=columns, n_letters=4)

# ## DSNM
# block_builder = BlockBuilding(preprocessed_data, method='dynamic_sorted_neighborhood')
# block_builder.build_blocks(max_window_size=5, match_threshold=0.9, columns=columns, n_letters=4)

all_blocks = block_builder.get_blocks()
all_blocks

Unnamed: 0,artist,category,genre,title,tracks,year,ID,BKV,block_id
0,backstreet boys,blues,pop,millennium,larger than lifei want it that wayshow me the ...,,0,P100,452
1,various,data,,frankfurt trance vol 04 cd1,dj tom stevens vs fridge outface 2000 radio m...,,1,,1
2,no return,data,data,self mutilation,do or dietruth and realitylostsoul extractorsa...,,2,D300,169
3,a¤a a­a¦a€ a©a€oa€¦a¤a¿a,data,pop,a¦æ’a3a£a€za¥a€¡aoa£a®a£a€1a£a€ a£a€sa€°,a§a€oa€ a¥a a°a£a€sa a£a€za£a¤a£a€1a a¡a€”a£a§...,1989,3,P100 1000,453
4,emanuel,data,data,felicidade,felicidade quando o telefone tocavem bailar o ...,1998,4,D300 1000,170
...,...,...,...,...,...,...,...,...,...
9758,various artists notting hill,blues,soundtrack,notting hill,another level from the heartronan keating wh...,1999,9758,S536 1000,601
9759,lenine,blues,samba,na pressao,soul brasileirona pressaopaciencameuamanha int...,,9759,S510,582
9760,ben harper,blues,blues,the will to live,fadedhomeless childnumber threeroses from my f...,1997,9760,B420 1000,74
9761,alvin lee,blues,blues,i hear you rockin,keep on rockinlong legsi hear you knockinaintt...,1993,9761,B420 1000,74


In [618]:
comparison = Comparison(all_blocks)

# name	address	city	cuisine
column_algorithms = {
    "artist": comparison.jaro_winkler_similarity,
    "category": comparison.jaro_winkler_similarity,
    "genre": comparison.jaro_winkler_similarity,
    "title": comparison.jaro_winkler_similarity,
    "tracks": comparison.qgram_similarity,
    "year": comparison.qgram_similarity,
}

# Compare within blocks
comparison_results = comparison.compare_within_blocks(
    block_col="block_id",
    column_algorithms=column_algorithms
)
comparison_results.head(10)

Unnamed: 0,block_id,row1,row2,artist_similarity,category_similarity,genre_similarity,title_similarity,tracks_similarity,year_similarity
0,1,4083,8723,0.507937,0.407407,0.0,0.654764,0.0,0.0
1,1,4083,8753,0.518908,0.407407,0.0,0.578539,0.0,0.0
2,1,4083,8750,0.404762,0.407407,0.0,0.539479,0.0,0.0
3,1,4083,8747,0.475564,0.407407,0.0,0.6176,0.0,0.0
4,1,4083,8743,0.420635,0.407407,0.0,0.440171,0.0,0.0
5,1,4083,8738,0.893333,0.407407,0.0,0.591816,0.0,0.0
6,1,4083,8735,0.600529,0.407407,0.0,0.569696,0.0,0.0
7,1,4083,8733,0.893333,0.407407,0.0,0.594006,0.0,0.0
8,1,4083,8727,0.428571,0.407407,0.0,0.545649,0.0,0.0
9,1,4083,8724,0.562233,0.407407,0.0,0.551709,0.0,0.0


In [619]:
comparison_results

Unnamed: 0,block_id,row1,row2,artist_similarity,category_similarity,genre_similarity,title_similarity,tracks_similarity,year_similarity
0,1,4083,8723,0.507937,0.407407,0.0,0.654764,0.0,0.0
1,1,4083,8753,0.518908,0.407407,0.0,0.578539,0.0,0.0
2,1,4083,8750,0.404762,0.407407,0.0,0.539479,0.0,0.0
3,1,4083,8747,0.475564,0.407407,0.0,0.617600,0.0,0.0
4,1,4083,8743,0.420635,0.407407,0.0,0.440171,0.0,0.0
...,...,...,...,...,...,...,...,...,...
5076481,680,1409,1575,0.434343,1.000000,1.0,0.527778,0.0,0.0
5076482,681,141,8002,0.550595,0.000000,0.9,0.460544,0.0,0.0
5076483,681,141,7952,0.492063,0.000000,0.9,0.529762,0.0,0.0
5076484,681,7952,8002,0.451389,1.000000,1.0,0.441667,0.0,0.0


In [620]:
classifier = Classifier(all_blocks, comparison_results)

# Weighted
# method = 'threshold_based'
# possible_match = False
# thresholds = {'not_match': 0.6, 'match': 0.75}
thresholds = {'match': 0.85}
weights = {'artist_similarity': 0.4,
           'category_similarity': 0.5,
           'genre_similarity': 0.8,
           'title_similarity': 1,
           'tracks_similarity': 0.8,
           'year_similarity': 0.8
           }
classified_results = classifier.classify_matches(
    method='weighted',
    thresholds=thresholds,
    possible_match=possible_match,
    weights=weights
)

# Cost-based
# costs = {
#     'non_match_true_match': 4,  # Cost of classifying a true match as a non-match
#     'non_match_true_non_match': 1,  # Cost of classifying a true non-match as a non-match
#     'match_true_match': 1,  # Cost of classifying a true match as a match
#     'match_true_non_match': 5  # Cost of classifying a true non-match as a match
# }
# probabilities = {'M': 0.3, 'U': 0.7}
# #
# classified_results = classifier.classify_matches(
#     method='cost_based',
#     costs=costs,
#     probabilities=probabilities
# )


classified_results

Unnamed: 0,block_id,row1,row2,artist_similarity,category_similarity,genre_similarity,title_similarity,tracks_similarity,year_similarity,row1_artist,...,row2_genre,row1_title,row2_title,row1_tracks,row2_tracks,row1_year,row2_year,weighted_similarity,normalized_similarity,classification
0,1,4083,8723,0.507937,0.407407,0.0,0.654764,0.0,0.0,various,...,,platinum an introspective of house cd 2 john...,lanterns of fire love the mystic in renaissan...,highlight feel it original mixdouble 99 rip...,shepherdinlove recercada segunda diego ortiz c...,,,1.061642,0.306104,Non-Match
1,1,4083,8753,0.518908,0.407407,0.0,0.578539,0.0,0.0,various,...,,platinum an introspective of house cd 2 john...,la traviata cd 1,highlight feel it original mixdouble 99 rip...,preludiodellinvito trascorsa corolibiam nelie...,,,0.989806,0.285392,Non-Match
2,1,4083,8750,0.404762,0.407407,0.0,0.539479,0.0,0.0,various,...,,platinum an introspective of house cd 2 john...,abba greatest hits,highlight feel it original mixdouble 99 rip...,abbaturesosmama miaeaglei have a dreamdoes you...,,,0.905088,0.260965,Non-Match
3,1,4083,8747,0.475564,0.407407,0.0,0.617600,0.0,0.0,various,...,,platinum an introspective of house cd 2 john...,festival strings lucerne vivadi wa mozart,highlight feel it original mixdouble 99 rip...,concerto in f major p278 allegroconcerto in f...,,,1.011529,0.291655,Non-Match
4,1,4083,8743,0.420635,0.407407,0.0,0.440171,0.0,0.0,various,...,,platinum an introspective of house cd 2 john...,13 pra©ludes,highlight feel it original mixdouble 99 rip...,op 23 no 1 in f min largoop 23 no 2 in b maest...,,,0.812129,0.234162,Non-Match
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5076481,680,1409,1575,0.434343,1.000000,1.0,0.527778,0.0,0.0,bebeto,...,world,bebeto,volume 3 further in time,batuquepra nao chorarsalgueiro choraonosotros ...,northnorth 2when youre fallingcolossuslagansha...,,,2.001515,0.577099,Non-Match
5076482,681,141,8002,0.550595,0.000000,0.9,0.460544,0.0,0.0,esma redzepova,...,world,esma queen of the gypsies macedonian songs disk 1,katsu,zasto si me majko rodilazapej makedonijosvadba...,mondo sambasandspeace of mindindian winterbran...,1998,1999,1.400782,0.403889,Non-Match
5076483,681,141,7952,0.492063,0.000000,0.9,0.529762,0.0,0.0,esma redzepova,...,world,esma queen of the gypsies macedonian songs disk 1,son egal,zasto si me majko rodilazapej makedonijosvadba...,tsy kivyavelovoandalanazotrasonegalyrafrancois...,1998,1997,1.446587,0.417096,Non-Match
5076484,681,7952,8002,0.451389,1.000000,1.0,0.441667,0.0,0.0,tarika,...,world,son egal,katsu,tsy kivyavelovoandalanazotrasonegalyrafrancois...,mondo sambasandspeace of mindindian winterbran...,1997,1999,1.922222,0.554236,Non-Match


In [621]:
evaluation = Evaluation(source_data, classified_results)
evaluation.show_matches_side_by_side().to_csv('data/detected_dups/census_found_dups.csv', index=False)
evaluation.show_matches_side_by_side()

Unnamed: 0,dedup_id,artist,category,genre,title,tracks,year,ID,dropped
2297,1,Terry Pratchett,misc,Audio Book,Hogfather,Hogfather (Disk 02) - Track 014|Hogfather (Dis...,1996.0,2297,NO
3515,1,Terry Pratchett,misc,Audio Book,Hogfather,Hogfather (Disk 03) - Track 026|Hogfather (Dis...,1996.0,3515,YES
9520,2,Radiohead,rock,Alternative,The Bends,Planet Telex|The Bends|High And Dry|Fake Plast...,1995.0,9520,NO
9662,2,Radiohead,rock,Alternative Rock,The Bends,Planet Telex|The Bends|High And Dry|Fake Plast...,1995.0,9662,YES
9732,3,Mindy McCready,country,country,If I Don't Stay The Night,What If I Do|This is Me|If I Don't Stay The Ni...,1997.0,9732,NO
9736,3,Mindy McCready,country,Country,If I Don't Stay The Night,What If I Do|This Is Me|If I Don't Stay The Ni...,1997.0,9736,YES
9491,4,Cappella,misc,Dance,U Got 2 Know,U Got 2 Know|U Got 2 Let The Music|Don't Be Pr...,1994.0,9491,NO
9696,4,Cappella,rock,Dance,U Got 2 Know,U Got 2 Know|U Got 2 Let The Music|Don't Be Pr...,1994.0,9696,YES
9570,5,Add N To (X),rock,Electronic,Loud Like Nature,total all out water|electric village|sheez min...,2002.0,9570,NO
9645,5,Add N To (X),rock,Electronic,Loud Like Nature,Total All Out Water|Electric Village|Sheez Min...,2002.0,9645,YES


In [622]:
evaluation.get_deduplicated_data()


Unnamed: 0,artist,category,genre,title,tracks,year,ID
0,Backstreet Boys,blues,Pop,Millennium,Larger Than Life|I Want It That Way|Show Me Th...,,0
1,Various,data,,Frankfurt Trance Vol. 04 cd1,DJ Tom Stevens VS. Fridge - Outface 2000 (Radi...,,1
2,NO RETURN,data,Data,Self Mutilation,Do or Die|Truth and Reality|Lost|Soul Extracto...,,2
3,Ã¤Â¸Â­Ã¦?â€˜Ã©â€ºâ€¦Ã¤Â¿Å,data,Pop,Ã¦Æ’Â³Ã£?â€žÃ¥â€¡ÂºÃ£?Â®Ã£?â€¹Ã£?â€˜Ã£â€šâ€°,Ã§â€ºâ€ Ã¥Â¸Â°Ã£â€šÅ |Ã£?â€žÃ£?Â¤Ã£?â€¹Ã¨Â¡â€”...,1989,3
4,Emanuel,data,Data,Felicidade,Felicidade quando o telefone toca|Vem bailar o...,1998,4
...,...,...,...,...,...,...,...
9731,All-4-One,blues,,Christmas,Silent Night|This Christmas|The First Noel|The...,,9756
9732,Various Artists - Notting Hill,blues,Soundtrack,Notting Hill,Another Level / From The Heart|Ronan Keating /...,1999,9758
9733,Lenine,blues,Samba,Na Pressao,Soul Brasileiro|Na Pressao|Pacienca|Meuamanha ...,,9759
9734,Ben Harper,blues,Blues,The will to live,Faded|Homeless Child|Number Three|Roses From M...,1997,9760


In [623]:
evaluation.get_statistics()


Unnamed: 0,Detected duplicates,Row count before deduplication,Row count after deduplication,Duplicate percentage
0,27,9763,9736,0.28
