In [1]:
import pandas as pd
import numpy as np
import unicodedata
import string
import sqlalchemy as _sql
import sqlalchemy.ext.declarative as _declarative
import sqlalchemy.orm as _orm
import re
from rapidfuzz.fuzz import ratio, partial_ratio
from rapidfuzz.distance import JaroWinkler, Levenshtein
from rapidfuzz.process import extractOne
import itertools

In [2]:
# TODO: change to real database
# DATABASE_URL = "postgresql+psycopg2://db_user:password@db:5432/inzynierka_db"
# engine = _sql.create_engine(DATABASE_URL)
# SessionLocal = _orm.sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Base = _declarative.declarative_base()

# Data reading

In [3]:
class ReadData:
    def __init__(self, path):
        """
        Initialize with the path to the CSV file.
        """
        self.path = path  # Can be replaced by database connection later
        self.data = None

    def read_data(self):
        """
        Read data from the CSV file using pandas.
        Returns a pandas DataFrame.
        """
        self.data = pd.read_csv(self.path)
        return self.data

In [4]:
path = "data/restaurant-nophone.csv"
rd = ReadData(path)
data = rd.read_data()
data

Unnamed: 0,name,address,city,cuisine
0,arnie morton's of chicago,435 s. la cienega blv.,los angeles,american
1,arnie morton's of chicago,435 s. la cienega blvd.,los angeles,steakhouses
2,art's delicatessen,12224 ventura blvd.,studio city,american
3,art's deli,12224 ventura blvd.,studio city,delis
4,hotel bel-air,701 stone canyon rd.,bel air,californian
...,...,...,...,...
859,ti couz,3108 16th st.,san francisco,french
860,trio cafe,1870 fillmore st.,san francisco,american
861,tu lan,8 sixth st.,san francisco,vietnamese
862,vicolo pizzeria,201 ivy st.,san francisco,pizza


# Data Pre-processing

- Normalization (lowercasing, removing diacritics, punctuations)
- Tokenization (TBD?)
- Drop duplicates

In [5]:
class DataPreprocessing:
    def __init__(self, data):
        """
        Initialize the DataPreprocessor with the data.
        :param data: pandas DataFrame containing the data to be processed.
        """
        self.data = data
        self.columns = None
        self.processed_data = data.copy()  # A copy of the data to avoid modifying the original

    def select_columns(self, columns):
        """
        Select the columns to apply preprocessing on.
        If 'all' is passed, all columns will be selected.
        :param columns: List of columns to be normalized, or 'all' to select all columns.
        """
        if columns[0] == 'all':
            # Select all columns in the DataFrame
            self.columns = self.data.columns
        else:
            # Otherwise, use the provided list of columns
            self.columns = columns

    def _ensure_non_numeric_string_columns(self):
        """
        Internal method to ensure that only non-numeric string columns are selected for string operations.
        """
        # Filter out non-string columns (Int64, Float64, etc.)
        self.columns = [
            col for col in self.columns if self.processed_data[col].dtype == 'object'
        ]

    def lowercase(self):
        """
        Convert text to lowercase in the selected columns.
        """
        if self.columns is None:
            raise ValueError("No columns selected for preprocessing. Use select_columns method first.")

        try:
            for col in self.columns:
                self.processed_data[col] = self.processed_data[col].str.lower()
        except Exception as e:
            print(f"Error applying lowercase operation: {e}")

    def remove_diacritics(self):
        """
        Remove diacritics from text in the selected columns.
        """
        if self.columns is None:
            raise ValueError("No columns selected for preprocessing. Use select_columns method first.")

        def _remove_diacritics(text):
            if isinstance(text, str):
                return ''.join(
                    c for c in unicodedata.normalize('NFKD', text)
                    if unicodedata.category(c) != 'Mn'
                )
            return text

        try:
            for col in self.columns:
                self.processed_data[col] = self.processed_data[col].apply(_remove_diacritics)
        except Exception as e:
            print(f"Error removing diacritics: {e}")

    def remove_punctuation(self):
        """
        Remove punctuation from text in the selected columns.
        """
        if self.columns is None:
            raise ValueError("No columns selected for preprocessing. Use select_columns method first.")

        # Ensure only string columns are processed
        self._ensure_non_numeric_string_columns()

        punctuation_pattern = f"[{re.escape(string.punctuation)}]"

        try:
            for col in self.columns:
                self.processed_data[col] = self.processed_data[col].str.replace(
                    punctuation_pattern, '', regex=True
                )
        except Exception as e:
            print(f"Error removing punctuation: {e}")

    def drop_duplicates(self):
        """
        Drop exact duplicates across all columns in the DataFrame. This is a mandatory step.
        """
        try:
            self.processed_data = self.processed_data.drop_duplicates()
        except Exception as e:
            print(f"Error dropping duplicates: {e}")

    def apply_preprocessing(self, lowercase=False, diacritics_removal=False, punctuation_removal=False):
        """
        Apply preprocessing steps based on user selection.
        The order is: lowercase -> diacritics removal -> punctuation removal -> drop exact duplicates.
        :param lowercase: If True, apply lowercasing to the selected columns.
        :param diacritics_removal: If True, remove diacritics from the selected columns.
        :param punctuation_removal: If True, remove punctuation from the selected columns.
        :return: Preprocessed pandas DataFrame.
        """
        try:
            if punctuation_removal:
                self.remove_punctuation()

            # Ensure non-numeric string columns are processed
            self._ensure_non_numeric_string_columns()

            if lowercase:
                self.lowercase()

            if diacritics_removal:
                self.remove_diacritics()

            # Drop exact duplicates as the mandatory last step
            self.drop_duplicates()

        except Exception as e:
            print(f"Error during preprocessing: {e}")

        return self.processed_data

    def get_processed_data(self):
        """
        Return the preprocessed data.
        :return: Preprocessed pandas DataFrame.
        """
        return self.processed_data


In [6]:
preprocessor = DataPreprocessing(data)

preprocessor.select_columns(columns=['all'])
preprocessor.apply_preprocessing(lowercase=True, diacritics_removal=True, punctuation_removal=True)
data = preprocessor.get_processed_data()
data

Unnamed: 0,name,address,city,cuisine
0,arnie mortons of chicago,435 s la cienega blv,los angeles,american
1,arnie mortons of chicago,435 s la cienega blvd,los angeles,steakhouses
2,arts delicatessen,12224 ventura blvd,studio city,american
3,arts deli,12224 ventura blvd,studio city,delis
4,hotel belair,701 stone canyon rd,bel air,californian
...,...,...,...,...
859,ti couz,3108 16th st,san francisco,french
860,trio cafe,1870 fillmore st,san francisco,american
861,tu lan,8 sixth st,san francisco,vietnamese
862,vicolo pizzeria,201 ivy st,san francisco,pizza


# Block building

- Sorted Neighborhood Method (SNM)
- Standard Blocking Method
- SBM with dynamic sliding window


In [33]:
class BlockBuilding:
    def __init__(self, data, method):
        """
        Initialize the BlockBuilder with data and method.
        :param data: pandas DataFrame containing the entity data.
        :param method: The blocking method to use ('sorted_neighborhood', 'standard_blocking', or 'dynamic_sorted_neighborhood').
        """
        self.data = data
        self.method = method
        self.blocks = None
        self.num_blocks = 0

    def build_blocks(self, window_size=None, columns=None, block_index=1, max_window_size=None, match_threshold=None):
        """
        Main function to build blocks using the selected method.
        :param window_size: Window size for sorted neighborhood method.
        :param columns: List of columns to generate the sorting or blocking key.
        :param block_index: Index of the block to display (optional).
        :param max_window_size: Maximum window size for dynamic sorted neighborhood.
        :param match_threshold: Match threshold for dynamic sorted neighborhood.
        :return: A specific block based on block_index.
        """
        if columns is None:
            raise ValueError("You must specify the columns for generating the blocking key (BKV).")

        # Generate the BKV (Blocking Key Value) column
        self.data['BKV'] = self.data[columns].astype(str).agg(' '.join, axis=1)

        if self.method == 'sorted_neighborhood':
            if window_size is None:
                raise ValueError("Window size must be provided for the sorted neighborhood method.")
            self.sorted_neighborhood(window_size)
        elif self.method == 'standard_blocking':
            self.standard_blocking()
        elif self.method == 'dynamic_sorted_neighborhood':
            if max_window_size is None or match_threshold is None:
                raise ValueError("Both max_window_size and match_threshold must be provided for the dynamic sorted neighborhood method.")
            self.dynamic_sorted_neighborhood(max_window_size, match_threshold)
        else:
            raise ValueError("Invalid method. Use 'sorted_neighborhood', 'standard_blocking', or 'dynamic_sorted_neighborhood'.")

        return self.display_block(block_index)

    def sorted_neighborhood(self, window_size):
        """
        Build blocks using the Sorted Neighborhood Method (SNM).
        :param window_size: The size of the sliding window.
        """
        # Sort by the BKV
        self.blocks = self.data.sort_values(by='BKV').reset_index(drop=True)

        # Assign block IDs based on the sliding window
        self.blocks['block_id'] = (self.blocks.index // window_size) + 1

        # Update the number of blocks
        self.num_blocks = self.blocks['block_id'].nunique()

    def dynamic_sorted_neighborhood(self, max_window_size, match_threshold):
        """
        Build blocks using a Dynamic Sorted Neighborhood Method (DSNM).
        :param max_window_size: The maximum size of the sliding window.
        :param match_threshold: Minimum number of matches required to expand the window dynamically.
        """
        # Sort by the BKV
        self.blocks = self.data.sort_values(by='BKV').reset_index(drop=True)

        # Initialize variables
        block_ids = []
        current_block_id = 1
        window_start = 0

        # Iterate through the sorted data
        while window_start < len(self.blocks):
            # Start with a single row
            window_end = window_start + 1
            match_count = 0

            # Dynamically expand the window
            while window_end < len(self.blocks) and (window_end - window_start) < max_window_size:
                # Count matches in the current window
                match_count = self._count_matches_in_window(
                    self.blocks.iloc[window_start:window_end + 1],
                    'BKV'
                )
                if match_count >= match_threshold:
                    window_end += 1
                else:
                    break

            # Assign the same block_id to all rows in the current window
            for _ in range(window_start, window_end):
                block_ids.append(current_block_id)

            # Move to the next record
            window_start = window_end
            current_block_id += 1

        # Assign block IDs back to the dataframe
        self.blocks['block_id'] = block_ids

        # Update the number of blocks
        self.num_blocks = current_block_id - 1

    def standard_blocking(self):
        """
        Perform standard blocking on the data using the BKV column.
        """
        # Group by BKV and assign block IDs
        distinct_groups = self.data[['BKV']].drop_duplicates().reset_index(drop=True)
        distinct_groups['block_id'] = range(1, len(distinct_groups) + 1)

        # Merge the block IDs back to the original data
        self.blocks = pd.merge(self.data, distinct_groups, on='BKV', how='left').sort_values(by='block_id')

    def _count_matches_in_window(self, window, key_column):
        """
        Count matches within a given window based on the BKV.
        :param window: DataFrame slice representing the current window.
        :param key_column: The column containing the BKV.
        :return: Number of matches in the window.
        """
        unique_keys = window[key_column].nunique()
        return len(window) - unique_keys

    def display_block(self, block_index=1):
        """
        Display a specific block by block_id.
        :param block_index: The index of the block to display.
        :return: DataFrame containing the specified block.
        """
        if self.blocks is None:
            raise ValueError("No blocks have been generated. Run block building first.")

        return self.blocks[self.blocks['block_id'] == block_index]

    def get_blocks(self):
        """
        Return all generated blocks.
        :return: DataFrame containing all blocks.
        """
        if self.blocks is None:
            raise ValueError("No blocks have been generated. Run block building first.")

        return self.blocks


In [34]:
block_builder = BlockBuilding(data, method='dynamic_sorted_neighborhood')
blocks = block_builder.build_blocks(
    max_window_size=50,
    match_threshold=2,
    columns=['city', 'cuisine'],
    
)
block_builder.get_blocks()

Unnamed: 0,name,address,city,cuisine,sorting_key,block_id
0,anthonys,3109 piedmont rd just south of peachtree rd,atlanta,american,atlanta american,1
1,atlanta fish market,265 pharr rd,atlanta,american,atlanta american,2
2,johnny rockets at,2970 cobb pkwy,atlanta,american,atlanta american,3
3,georgia grille,2290 peachtree rd peachtree square shopping ...,atlanta,american,atlanta american,4
4,original pancake house at,4330 peachtree rd,atlanta,american,atlanta american,5
...,...,...,...,...,...,...
853,asahi ramen,2027 sawtelle blvd,west la,noodle shops,west la noodle shops,854
854,local nochol,30869 thousand oaks blvd,westlake village,health food,westlake village health food,855
855,baja fresh,3345 kimber dr,westlake village,mexican,westlake village mexican,856
856,don antonios,1136 westwood blvd,westwood,italian,westwood italian,857


In [51]:
all_blocks['block_id'].value_counts()

block_id
13    250
18    147
17    119
14     88
1      72
16     63
26     15
5      13
9       8
7       6
12      6
15      6
43      5
2       5
21      4
25      3
8       3
4       3
11      2
6       2
10      2
3       2
28      2
44      2
45      2
22      2
47      2
48      2
36      2
46      1
37      1
42      1
41      1
40      1
39      1
38      1
33      1
35      1
34      1
32      1
31      1
30      1
29      1
27      1
24      1
23      1
20      1
19      1
49      1
Name: count, dtype: int64

# Field and Record Comparison:

* Q-gram comparison
* Jaro-Winkler
* Soundex

In [52]:

class Comparison:
    def __init__(self, data):
        """
        Initialize the Comparison class with the data.
        :param data: pandas DataFrame containing the data to be compared.
        """
        self.data = data

    @staticmethod
    def levenshtein_similarity(str1, str2):
        """
        Calculate the Levenshtein similarity between two strings.
        """
        from rapidfuzz.distance import Levenshtein
        return Levenshtein.normalized_similarity(str1, str2)

    @staticmethod
    def jaro_winkler_similarity(str1, str2):
        """
        Calculate the Jaro-Winkler similarity between two strings.
        """
        from rapidfuzz.distance import JaroWinkler
        return JaroWinkler.similarity(str1, str2)

    @staticmethod
    def qgram_similarity(str1, str2, q=2):
        """
        Calculate Q-gram similarity.
        """
        def generate_qgrams(s, q):
            return [s[i:i + q] for i in range(len(s) - q + 1)]

        qgrams1 = generate_qgrams(str1, q)
        qgrams2 = generate_qgrams(str2, q)
        matches = sum(1 for q in qgrams1 if q in qgrams2)
        total_qgrams = len(set(qgrams1 + qgrams2))
        return (matches / total_qgrams)

    def compare_within_blocks(self, block_col, column_algorithms):
            """
            Compare all possible pairs within each block for specified columns.
            :param block_col: The column name containing block IDs.
            :param column_algorithms: Dictionary where keys are column names and values are comparison functions.
            :return: DataFrame with comparison results for all pairs in each block.
            """
            if block_col not in self.data.columns:
                raise ValueError(f"Block column '{block_col}' not found in data.")
    
            for col in column_algorithms:
                if col not in self.data.columns:
                    raise ValueError(f"Comparison column '{col}' not found in data.")
    
            # Store results in a list
            results = []
    
            # Group by block_id
            grouped = self.data.groupby(block_col)
    
            for block_id, group in grouped:
                # Get all possible pairs within the block
                pairs = list(itertools.combinations(group.iterrows(), 2))
    
                for (idx1, row1), (idx2, row2) in pairs:
                    result = {
                        "block_id": block_id,
                        "row1": idx1,
                        "row2": idx2,
                    }
    
                    # Apply the specified algorithm to each column
                    for col, comparison_func in column_algorithms.items():
                        result[f"{col}_similarity"] = comparison_func(row1[col], row2[col])
    
                    results.append(result)
    
            result_df = pd.DataFrame(results)
            return result_df.sort_values(by=block_col).reset_index(drop=True)


In [53]:
comparison = Comparison(all_blocks)

# name	address	city	cuisine
column_algorithms = {
    "name": comparison.levenshtein_similarity,
    "address": comparison.jaro_winkler_similarity,
    "city": comparison.jaro_winkler_similarity,
    "cuisine": comparison.qgram_similarity
}

# Compare within blocks
result = comparison.compare_within_blocks(
    block_col="block_id",
    column_algorithms=column_algorithms
)

result



Unnamed: 0,block_id,row1,row2,name_similarity,address_similarity,city_similarity,cuisine_similarity
0,1,0,245,0.192308,0.578587,1.0,0.071429
1,1,263,270,0.125000,0.513558,1.0,0.071429
2,1,263,271,0.000000,0.573529,1.0,1.000000
3,1,263,272,0.083333,0.925000,1.0,0.200000
4,1,263,273,0.090909,0.573529,1.0,0.200000
...,...,...,...,...,...,...,...
57511,43,721,717,0.307692,0.748718,1.0,0.000000
57512,44,791,790,0.461538,0.762800,1.0,1.000000
57513,45,798,822,0.160000,0.539160,1.0,0.047619
57514,47,802,807,0.086957,0.840000,1.0,0.066667
