# PolyFuzz Classes

## Imports

In [1]:
from abc import ABC, abstractmethod
import os

import pandas as pd
from rapidfuzz import fuzz

## Classes

In [2]:
class BaseMatcher(ABC):
    """Abstract interface that concrete string matchers are modelled after.

    Instances carry two attributes, both assigned in ``__init__``:
    ``model_id`` (the identifier supplied by the caller) and ``type``
    (always the label ``"Base Model"``).
    """

    def __init__(self, model_id: str = "Model 0"):
        """Store the model identifier and the fixed type label."""
        self.model_id = model_id
        self.type = "Base Model"

    @abstractmethod
    def match(
        self,
        from_list: list[str],
        to_list: list[str] = None,
        **kwargs
    ):
        """Match the strings of ``from_list`` against those of ``to_list``.

        Subclasses must override this method; the class cannot be
        instantiated until they do.

        Raises:
            NotImplementedError: Whenever this base implementation is called.
        """
        raise NotImplementedError

In [3]:
class StringSimilarity(BaseMatcher):
    """Matcher that scores string pairs with ``rapidfuzz``'s ``fuzz.ratio``."""

    def match(self,
              from_list: list[str],
              to_list: list[str] = None,
              n_most: int = 1,
              threshold: float = 0):
        """Find the best matches in ``to_list`` for every string in ``from_list``.

        Args:
            from_list: Strings to find matches for.
            to_list: Candidate strings to match against. When ``None``
                (the default used by ``BaseMatcher.match``), the strings of
                ``from_list`` are matched against ``from_list`` itself, so
                every string is also a candidate for itself.
            n_most: Maximum number of matches kept per string; the highest
                scoring candidates are kept. Must be a non-negative integer.
            threshold: A match is kept only if its similarity is strictly
                greater than this value. Similarities are ``fuzz.ratio``
                scores divided by 100.

        Returns:
            A ``(mappings, outliers)`` tuple. ``mappings`` is a DataFrame with
            the columns ``From``, ``To`` and ``Similarity``, one row per kept
            match. ``outliers`` is a Series holding the strings of
            ``from_list`` for which no match was kept.

        Raises:
            ValueError: If ``n_most`` is negative.
        """
        if n_most < 0:
            raise ValueError(f"n_most must be non-negative, got {n_most}")
        if to_list is None:
            to_list = from_list

        mappings = {'From': [], 'To': [], 'Similarity': []}
        outliers = []

        for from_string in from_list:
            # Score every candidate against the current string
            scored = [(to_string, fuzz.ratio(from_string, to_string) / 100)
                      for to_string in to_list]

            # Keep the n_most best candidates; the sort is stable, so equally
            # scored candidates stay in their to_list order
            scored.sort(key=lambda pair: pair[1], reverse=True)

            # Get rid of matches that do not exceed the threshold
            kept = [pair for pair in scored[:n_most] if pair[1] > threshold]

            # A string without any remaining match is reported as an outlier
            if not kept:
                outliers.append(from_string)
                continue

            for to_string, similarity in kept:
                mappings['From'].append(from_string)
                mappings['To'].append(to_string)
                mappings['Similarity'].append(similarity)

        # Prepare result
        mappings = pd.DataFrame(mappings)
        outliers = pd.Series(outliers)
        return mappings, outliers

In [4]:
class PolyFuzzMatch:
    """Thin wrapper that runs a matcher and remembers its latest results.

    ``method`` is the matcher whose ``match`` method gets called, while
    ``matches`` and ``outliers`` hold the two values that call returned last
    (or whatever was handed to the constructor).
    """

    def __init__(self, method: BaseMatcher = None, matches=None, outliers=None):
        """Store the matcher and, optionally, precomputed results."""
        self.method, self.matches, self.outliers = method, matches, outliers

    def match_strings(self, from_list: list[str], to_list: list[str] = None, **kwargs):
        """Run the configured matcher and keep its results on this instance.

        Args:
            from_list: Forwarded to ``method.match`` as first argument.
            to_list: Forwarded to ``method.match`` as second argument.
            **kwargs: Forwarded unchanged to ``method.match``.

        Returns:
            This instance, so that calls can be chained.
        """
        # The matcher is expected to return exactly two values
        result = self.method.match(from_list, to_list, **kwargs)
        self.matches, self.outliers = result
        return self

# Runtime

## Data

In [5]:
# Drop the last four components of the current working directory, then
# descend into the EDGAR data folder.
data_dir_edgar = os.path.join(
    os.sep.join(os.getcwd().split(os.sep)[:-4]),
    'dat/net/sec_edgar',
)

In [6]:
# Load the four EDGAR sample files (edgar100a.csv ... edgar100d.csv), in order.
data_a, data_b, data_c, data_d = (
    pd.read_csv(f'{data_dir_edgar}/edgar100{suffix}.csv')
    for suffix in 'abcd'
)

In [7]:
# Pull the `name` column of each frame out as a plain Python list:
# `to_list` comes from edgar100c, `from_list` from edgar100b.
to_list = data_c['name'].tolist()
from_list = data_b['name'].tolist()

## Evaluation

In [8]:
# Wrap a StringSimilarity matcher; no results are stored until match_strings runs.
model = PolyFuzzMatch(StringSimilarity())

In [9]:
# Keep up to three candidates per string, each with a similarity above 0.5.
# NOTE(review): the variable named `to_list` is passed as the *from* side and
# `from_list` as the *to* side - confirm that this swap is intended.
model.match_strings(
    from_list=to_list,
    to_list=from_list,
    n_most=3,
    threshold=0.5,
)
model.matches

Unnamed: 0,From,To,Similarity
0,1-800-PHARMACY INC,1-800-JACKPOT INC,0.685714
1,1-800-PHARMACY INC,1-800 ATTORNEY INC,0.666667
2,1-800-PHARMACY INC,1-800-PACK-RAT LLC,0.666667
3,1-800-RADIATOR FRANCHISE INC.,1 800 RADIATOR FRANCHISE INC,0.912281
4,1-800-RADIATOR FRANCHISE INC.,"1-800-DOCTORS, INC.",0.666667
...,...,...,...
230,"1010DATA, INC.","1-800-DOCTORS, INC.",0.666667
231,"1010DATA, INC.","1 DB FINANCIAL, INC.",0.588235
232,"1010DATA, INC.",1 800 AUTOTOW INC,0.580645
233,1011 BRIAR HILLS ONE INVESTORS INC,1 800 RADIATOR FRANCHISE INC,0.516129


In [10]:
# Show the strings for which the last match_strings call kept no match.
model.outliers

0                                      10 GROUP PLC/ADR
1                              10-120 S. RIVERSIDE REIT
2                             10-20 CHANNEL CENTER REIT
3        10/120 SOUTH RIVERSIDE ILLINOIS BUSINESS TRUST
4                     100 WEST 93RD STREET INVESTORS LP
5                              1000 1ST AVENUE SOUTH LP
6                  1000 HOWARD BOULEVARD PARTNERS, L.P.
7     1000CHANNEL REAL ESTATE OPPORTUNITIES FEEDER F...
8                      1001 G STREET REHABILITAITON, LP
9                        1008 UPPER GULPH ASSOCIATES LP
10                 101 MAIN STREET LIMITED LIABILITY CO
11                              10101 SW 14 MEMBER, LLC
12                              1011 - 1041 TALEGA, LLC
dtype: object