In [1]:
import os
# Project root = parent directory of the notebook's working directory.
# os.path.dirname keeps the result absolute even when the notebook sits directly
# under the filesystem root, where splitting and re-joining on os.sep yields ''.
work_dir = os.path.dirname(os.getcwd())

In [2]:
# Print the resolved project root so a wrong working directory is noticed early.
print(work_dir)

/Users/matthewlee/Documents/Code/b24/b5/bot2405a/dbc24/web/net/n5


In [3]:
import sys

# Directory added to sys.path below (presumably where the local `polyfuzz1`
# package imported in the next cell lives — confirm).
work_tool = os.path.join(work_dir, 'action')

# Input data locations. Path components are passed one by one so os.path.join
# chooses the platform separator instead of relying on hard-coded '/'.
data_dir_netflix = os.path.join(work_dir, 'setup', 'd_data')

# The EDGAR data sits under the directory three levels above the project root.
data_dir_edgar = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(work_dir))),
    'dat', 'net', 'sec_edgar',
)

# Make modules under `work_tool` importable. The membership check keeps this
# cell idempotent: re-running it does not stack duplicate sys.path entries.
if work_tool not in sys.path:
    sys.path.insert(0, work_tool)

In [4]:
from polyfuzz1 import polyfuzz2
from polyfuzz1 import metrics
from polyfuzz1.models import tfidf, BaseMatcher, EditDistance

from rapidfuzz import fuzz

import json
import numpy as np
import pandas as pd

## Data

In [5]:
# Load the four EDGAR CSV samples (edgar100a.csv ... edgar100d.csv) as DataFrames.
# Alternative input kept for reference (Netflix movie titles stored as JSON):
# data_a = json.load(open(f'{data_dir_netflix}/movie_titles_netflix_a.json', 'r'))
# data_b = json.load(open(f'{data_dir_netflix}/movie_titles_netflix_b.json', 'r'))

data_a, data_b, data_c, data_d = map(pd.read_csv, (
    f'{data_dir_edgar}/edgar100a.csv',
    f'{data_dir_edgar}/edgar100b.csv',
    f'{data_dir_edgar}/edgar100c.csv',
    f'{data_dir_edgar}/edgar100d.csv',
))

In [6]:
# Pull the 'name' column of samples C and B out as plain Python lists.
to_list = data_c['name'].tolist()
from_list = data_b['name'].tolist()

In [7]:
# Number of names in `to_list`.
len(to_list)

100

In [8]:
# Smoke test: score the first entry of `from_list` against every entry of
# `to_list` with the raw fuzz.ratio value (not divided by 100 here).
from_string = from_list[0]

test_matches = [fuzz.ratio(from_string, candidate) for candidate in to_list]

## Custom Model/Grouping

In [9]:
class MyModel(BaseMatcher):
    """Custom matcher that pairs each string with its single best rapidfuzz match."""

    def match(self, from_list, to_list, **kwargs):
        """Return the best `to_list` candidate for every entry of `from_list`.

        Args:
            from_list: Strings to find a match for.
            to_list: Candidate strings to match against.
            **kwargs: Accepted but not used by this implementation.

        Returns:
            pandas.DataFrame with one row per `from_list` entry and the columns
            'From', 'To' (highest-scoring candidate, the first one on ties) and
            'Similarity' (`fuzz.ratio` divided by 100).
        """
        # Row i holds the scores of from_list[i] against every candidate.
        score_matrix = np.array([
            [fuzz.ratio(source, candidate) / 100 for candidate in to_list]
            for source in from_list
        ])

        # Column index and value of the best score in each row.
        best_columns = score_matrix.argmax(axis=1)
        best_scores = score_matrix.max(axis=1)

        return pd.DataFrame({
            'From': from_list,
            'To': [to_list[column] for column in best_columns],
            'Similarity': best_scores,
        })

In [14]:
class CustomModel(BaseMatcher):
    """Custom matcher that keeps several rapidfuzz candidates per input string."""

    def match(self, from_list, to_list, n_most, threshold):
        """Collect the top-scoring `to_list` candidates for each `from_list` entry.

        Args:
            from_list: Strings to find matches for.
            to_list: Candidate strings to match against.
            n_most: Maximum number (int) of candidates kept per input string.
                Zero or a negative value keeps none.
            threshold: Exclusive lower bound on the similarity
                (`fuzz.ratio` divided by 100) a candidate needs to be kept.

        Returns:
            dict with 'From' (the `from_list` that was passed in) and 'To', a
            list parallel to it whose items are lists of
            `[candidate, similarity]` pairs ordered from best to worst.
        """
        # Clamp so a negative limit means "keep nothing"; truncating by popping
        # in a loop would raise IndexError once the list ran empty.
        limit = max(n_most, 0)
        mappings = []

        for from_string in from_list:
            scored = [[to_string, fuzz.ratio(from_string, to_string) / 100]
                      for to_string in to_list]

            # Stable sort: candidates with equal scores keep their to_list order.
            scored.sort(reverse=True, key=lambda pair: pair[1])

            # Keep the best `limit` candidates, then drop those at or below the threshold.
            mappings.append([pair for pair in scored[:limit] if pair[1] > threshold])

        # Plain dict rather than a DataFrame: 'To' holds a variable-length list per input.
        return {'From': from_list, 'To': mappings}

In [11]:
# Toy example kept for reference (disabled):
# from_list = ["apple", "apples", "appl", "recal", "house", "similarity"]
# to_list = ["apple", "apples", "mouse"]

# model = PolyFuzz("TF-IDF").match(from_list, to_list)

# Custom grouper: the EditDistance model imported from polyfuzz1.models, passed
# later as the grouping `method`. n_jobs=1 presumably means single-process — confirm.
base_edit_grouper = EditDistance(n_jobs=1)
# model.group(base_edit_grouper)

In [15]:
# TEST: call CustomModel directly, keeping at most 3 candidates per name, each
# with a similarity above 0.5.
# NOTE(review): `to_list` is passed as the `from_list` argument and vice versa,
# the same order as the PolyFuzzMatch call further down — confirm this is intended.
# NOTE(review): this rebinds `test_matches`, which earlier held a list of raw scores.
m = CustomModel()
test_matches = m.match(to_list, from_list, n_most=3, threshold=0.5)

In [None]:
# Inspect the CustomModel result: {'From': [...], 'To': [[[candidate, score], ...], ...]}.
test_matches

## Edit Distance

In [16]:
# Instantiate the single-best-match model (MyModel) for use as the matching method.
mymodel = MyModel()

In [17]:
# Run the custom matcher through the PolyFuzzMatch wrapper; results are read from `model.matches`.
# NOTE(review): `to_list` is supplied first here, so it is presumably treated as the "from" side — confirm.
model = polyfuzz2.PolyFuzzMatch(method=mymodel).match(to_list, from_list)

In [12]:
# Matched pairs with their similarity (columns: From, To, Similarity).
model.matches

Unnamed: 0,From,To,Similarity
0,1-800-PHARMACY INC,1-800-JACKPOT INC,0.685714
1,1-800-RADIATOR FRANCHISE INC.,1 800 RADIATOR FRANCHISE INC,0.912281
2,"1-900 JACKPOT, INC.",1-800-JACKPOT INC,0.833333
3,1-PAGE LTD,1-10/KATY LTD,0.608696
4,1/2 MAC LTD,1-10/KATY LTD,0.583333
...,...,...,...
95,"10101 SW 14 MEMBER, LLC","1 COMMUNITY BUILDERS, LLC",0.500000
96,"1010DATA, INC.","1-800-DOCTORS, INC.",0.666667
97,"1011 - 1041 TALEGA, LLC","1-800-PACK-RAT, LLC",0.476190
98,1011 BRIAR HILLS ONE INVESTORS INC,1 800 RADIATOR FRANCHISE INC,0.516129


In [16]:
# Set up grouping over the match table, using the edit-distance grouper as the method.
grouper = polyfuzz2.PolyFuzzGroup(method=base_edit_grouper, matches=model.matches)

In [17]:
# Run the grouping; it appears to populate `grouper.matches`, `.clusters` and
# `.cluster_mappings`, which the following cells inspect.
grouper.group()

In [18]:
# Match table after grouping (columns: From, To, Similarity, Group).
grouper.matches

Unnamed: 0,From,To,Similarity,Group
0,1-800-PHARMACY INC,1-800-JACKPOT INC,0.685714,1-800-JACKPOT INC
1,1-800-RADIATOR FRANCHISE INC.,1 800 RADIATOR FRANCHISE INC,0.912281,1 800 RADIATOR FRANCHISE INC
2,"1-900 JACKPOT, INC.",1-800-JACKPOT INC,0.833333,1-800-JACKPOT INC
3,1-PAGE LTD,1-10/KATY LTD,0.608696,1-10/KATY LTD
4,1/2 MAC LTD,1-10/KATY LTD,0.583333,1-10/KATY LTD
...,...,...,...,...
95,"10101 SW 14 MEMBER, LLC","1 COMMUNITY BUILDERS, LLC",0.500000,"1 COMMUNITY BUILDERS, LLC"
96,"1010DATA, INC.","1-800-DOCTORS, INC.",0.666667,"1-800-DOCTORS, INC."
97,"1011 - 1041 TALEGA, LLC","1-800-PACK-RAT, LLC",0.476190,"1-800-PACK-RAT, LLC"
98,1011 BRIAR HILLS ONE INVESTORS INC,1 800 RADIATOR FRANCHISE INC,0.516129,1 800 RADIATOR FRANCHISE INC


In [19]:
# Clusters found by the grouper, as {cluster id: [member names]}.
grouper.clusters

{0: ['1 800 RADIATOR FRANCHISE INC', '1-800-RADIATOR FRANCHISE INC.'],
 1: ['1-800-JACKPOT INC', '1-900 JACKPOT, INC.'],
 2: ['1 SOUTH WACKER HOLDINGS, LLC', '1001 SOUTH STATE STREET HOLDINGS, LLC']}

In [20]:
# Reverse lookup of the clusters: {name: cluster id}.
grouper.cluster_mappings

{'1 800 RADIATOR FRANCHISE INC': 0,
 '1-800-RADIATOR FRANCHISE INC.': 0,
 '1-800-JACKPOT INC': 1,
 '1-900 JACKPOT, INC.': 1,
 '1 SOUTH WACKER HOLDINGS, LLC': 2,
 '1001 SOUTH STATE STREET HOLDINGS, LLC': 2}