In [3]:
!pip install cleanco
!pip install name_matching

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cleanco
  Downloading cleanco-2.2-py3-none-any.whl (11 kB)
Installing collected packages: cleanco
Successfully installed cleanco-2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting name_matching
  Downloading name_matching-0.8.4-py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 5.2 MB/s 
Installing collected packages: name-matching
Successfully installed name-matching-0.8.4


In [6]:
import re
import unicodedata

import pandas as pd
from cleanco import basename
from name_matching.name_matcher import NameMatcher
from sklearn.feature_extraction.text import TfidfVectorizer

***Definition for Company Name Matching function***

In [7]:
def _normalise_company_name(company_name):
  """Return a lower-case, ASCII-only, punctuation-free name without its legal suffix."""
  # Lower-case and fold accented characters to their closest ASCII form.
  cleaned = company_name.lower()
  cleaned = unicodedata.normalize('NFKD', cleaned).encode('ASCII', 'ignore').decode()

  # Drop every character that is neither a word character nor whitespace,
  # then collapse runs of whitespace into single spaces.
  cleaned = ' '.join(re.sub(r'[^\w\s]', '', cleaned).split())

  # cleanco's basename() returns the name with the legal suffix already
  # stripped (it does NOT return the suffix), so its result is used directly.
  without_suffix = ' '.join(basename(cleaned).split())

  # If stripping leaves nothing (e.g. the input was only a legal suffix),
  # keep the cleaned name so there is still something to match on.
  return without_suffix or cleaned


def find_company_name(company_name, company_name_dataset):
  """Fuzzy-match a single company name against a master list of company names.

  Uses the name_matching package:
  https://github.com/DeNederlandscheBank/name_matching

  Args:
    company_name: The raw company name to look up (a string).
    company_name_dataset: The master data handed unchanged to
      NameMatcher.load_and_process_master_data. Presumably a pandas
      DataFrame whose 'name' column holds the reference names -- TODO confirm
      against the caller.

  Returns:
    Whatever NameMatcher.match_names returns for the single query name
    (up to number_of_matches=3 candidates are requested).
  """
  # 1. Pre-process the query so that name matching is less computationally
  #    expensive.
  query_name = _normalise_company_name(company_name)

  # 2. No stand-alone TF-IDF step is run here: a separate vectoriser's output
  #    would be unused by the matcher below. NOTE(review): per the
  #    name_matching documentation the matcher performs its own n-gram /
  #    cosine-similarity pre-selection (top_n=50) -- confirm for the installed
  #    version.

  # 3. Fuzzy string matching.

  # Initialise the name matcher.
  matcher = NameMatcher(column='name',
                        number_of_matches=3,
                        legal_suffixes=True,
                        common_words=False,
                        top_n=50,
                        verbose=True)

  # Adjust the distance metrics to use.
  matcher.set_distance_metrics(discounted_levenshtein=False,
                               bag=True,
                               typo=True,
                               refined_soundex=True)

  # Load the data to which the names should be matched.
  matcher.load_and_process_master_data(company_name_dataset, transform=True)

  # match_names works on tabular input, so wrap the single query string in a
  # one-row DataFrame whose column name equals `column_matching`.
  to_be_matched = pd.DataFrame({'name': [query_name]})
  matches = matcher.match_names(to_be_matched=to_be_matched, column_matching='name')

  return matches

***Definition for Company Classifier***

In [None]:
def classify_company():
  """Placeholder for the company classifier; no logic is implemented yet.

  Returns:
    None.
  """
  # TODO: candidate approaches --
  #   * reverse engineer something from the Kaggle notebook at
  #     https://www.kaggle.com/code/thecobbler/classifying-company-names-as-per-their-industries/notebook
  #   * or use LDA
  return None