In [59]:
import re

import pandas as pd
df_company_names = pd.read_csv("company_names.csv")
# Preview a few random rows of the frame that was just loaded. The previous
# `df.sample(5)` referred to a name this notebook never defines.
df_company_names.sample(5)

Unnamed: 0,Line Number,Company Name,Company CIK Key
553935,553936,"SOLUM, INC.",1482838
366583,366584,MACDONALD DETTWILER & ASSOCIATES LTD,1121142
245924,245925,GOLDMAN SACHS CAPITAL PARTNERS V 1A LP,1360190
609597,609598,UMD INC,1137455
69706,69707,BIRAN DAN,1457986


In [55]:
def pre_process_text(name, n=3):
    """
    Split a company name into overlapping character n-grams.

    Commas, hyphens, periods, slashes and all whitespace are removed first, so
    the n-grams run across word boundaries. Letter case is left unchanged.

    Parameters:
        name (str): raw company name
        n    (int): n-gram length (2 = bi-gram, 3 = tri-gram, ...). By default, it's tri-gram

    Return:
        list of str: every window of n consecutive characters of the cleaned
        name, in order. Empty when the cleaned name has fewer than n
        characters or when n < 1.
    """
    # A single character class covers everything that is stripped. The hyphen
    # is escaped so it is read literally instead of as a range operator
    # (the unescaped form `,-.` only worked because that range happens to be
    # exactly `,`, `-`, `.`).
    cleaned = re.sub(r'[,\-./\s]', '', name)
    if n < 1:
        return []
    return [cleaned[start:start + n] for start in range(len(cleaned) - n + 1)]

In [56]:
# Sanity check: spaces are stripped before slicing, so tri-grams such as
# 'heb' and 'igc' span the original word boundaries.
pre_process_text("The big company")

['The', 'heb', 'ebi', 'big', 'igc', 'gco', 'com', 'omp', 'mpa', 'pan', 'any']

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Character n-grams from pre_process_text act as the "terms" of each name.
# min_df=5 is passed through to TfidfVectorizer as its document-frequency cut-off.
tfidf = TfidfVectorizer(
    analyzer=pre_process_text,
    min_df=5,
)
# Learn the vocabulary and vectorise every company name in one pass.
v_company_name = tfidf.fit_transform(df_company_names["Company Name"])

In [70]:
import numpy as np
# Vectorise a single query name with the already-fitted vocabulary.
v_input = tfidf.transform(["MACDONALD DETTWILER"])
# One sparse row of similarity scores: the query against every company name.
result = v_input.dot(v_company_name.T)
row_count = result.getnnz()
# np.argpartition raises ValueError when asked for more entries than are
# stored, so cap the request at the number of non-zero similarities.
n_top = min(5, row_count)
arg_idx = (
    np.argpartition(result.data, -n_top)[-n_top:]
    if n_top
    else np.array([], dtype=np.intp)
)
# Only the last expression of a cell is displayed; the earlier bare
# `v_input` / `row_count` lines were no-ops and have been dropped.
arg_idx

array([41525, 41521, 60076, 41520, 41519], dtype=int64)

In [72]:
# Interactive lookup of np.argpartition's contract (used to pick the top
# similarities); it only prints documentation and feeds nothing downstream.
help(np.argpartition)

Help on function argpartition in module numpy:

argpartition(a, kth, axis=-1, kind='introselect', order=None)
    Perform an indirect partition along the given axis using the
    algorithm specified by the `kind` keyword. It returns an array of
    indices of the same shape as `a` that index data along the given
    axis in partitioned order.
    
    .. versionadded:: 1.8.0
    
    Parameters
    ----------
    a : array_like
        Array to sort.
    kth : int or sequence of ints
        Element index to partition by. The k-th element will be in its
        final sorted position and all smaller elements will be moved
        before it and all larger elements behind it. The order all
        elements in the partitions is undefined. If provided with a
        sequence of k-th it will partition all of them into their sorted
        position at once.
    axis : int or None, optional
        Axis along which to sort. The default is -1 (the last axis). If
        None, the flattened arra

In [78]:
# arg_idx holds positions inside result.data; result.indices translates them
# into column numbers of `result`, i.e. row positions in v_company_name
# (assumes `result` is a CSR sparse matrix -- TODO confirm).
result.indices[arg_idx]


array([366577, 366581, 366583, 366582, 366584], dtype=int32)

In [100]:
def get_top_n_match(row, n_top=5):
    """
    Pick the highest-scoring stored entries of one sparse similarity row.

    :param row: sparse row exposing ``getnnz()``, ``indices`` and ``data``
                (presumably a 1 x N CSR slice of a similarity matrix -- confirm with caller)
    :param n_top: number of results to be determined
    :return: list of (index, score) tuples ordered by descending score, or
             None when the row stores no entries
    """
    stored = row.getnnz()
    if stored == 0:
        return None

    if stored > n_top:
        # argpartition moves the n_top largest scores to the tail of the
        # index array without paying for a full sort.
        keep = np.argpartition(row.data, -n_top)[-n_top:]
        candidates = zip(row.indices[keep], row.data[keep])
    else:
        # Few enough entries that all of them are returned.
        candidates = zip(row.indices, row.data)

    def descending(pair):
        return -pair[1]

    return sorted(candidates, key=descending)


def match_company_name(input_name, vectorizer, comp_name_vectors, comp_name_df, n=5):
    """
    Find the company names most similar to ``input_name``.

    :param input_name: input company name whose matches need to be found
                       (upper-cased before vectorising)
    :param vectorizer: TFIDF vectorizer which was initialized earlier
    :param comp_name_vectors: the company names' vectors of the whole data set
    :param comp_name_df: the company names dataframe; its rows must be in the
                         same order as the rows of ``comp_name_vectors``
    :param n: maximum number of matches to return (default 5)
    :return: a dataframe with top N matching names with match score; it has
             zero rows when the input shares nothing with any company name
    """
    query_vector = vectorizer.transform([input_name.upper()])
    result_vector = query_vector.dot(comp_name_vectors.T)

    # get_top_n_match returns None for a row without any stored similarity;
    # treat that as "no matches" instead of failing while flattening.
    matches = [
        pair
        for row in result_vector
        for pair in (get_top_n_match(row, n) or ())
    ]

    # zip(*[]) cannot be unpacked into two names, so handle the empty case.
    if matches:
        lkp_idx, lkp_sim = zip(*matches)
    else:
        lkp_idx, lkp_sim = (), ()

    # The indices are row positions in comp_name_vectors, so look the names up
    # positionally; label-based `series[idx]` is only correct for a default
    # 0..N-1 index.
    positions = np.asarray(lkp_idx, dtype=np.intp)
    matched_names = comp_name_df['Company Name'].iloc[positions].to_numpy(dtype=object)
    sim = np.asarray(lkp_sim, dtype=float)

    return pd.DataFrame({"Matching company name": matched_names,
                         "Match Score (%)": sim * 100})


In [104]:
# Look up the ten closest company names for a truncated query string.
result_df = match_company_name(
    input_name="WAYPOINT FINAN",
    vectorizer=tfidf,
    comp_name_vectors=v_company_name,
    comp_name_df=df_company_names,
    n=10,
)
print(result_df)

                 Matching company name  Match Score (%)
0              WAYPOINT FINANCIAL CORP        83.732844
1                       WAYPOINT I LLC        66.430100
2  WAYPOINT FIRST STREET INVESTORS, LP        58.123844
3                 WAYPOINT PARTNERS LP        57.755806
4                WAYPOINT MEDICAL, LLC        54.910641
5             WAYPOINT SECURITIES, LLC        54.169780
6              ONE POINT FINANCIAL LLC        53.824748
7               WAYPOINT ADVISORS, LLC        53.429341
8             DANA POINT FINANCIAL INC        52.641196
9             OLD POINT FINANCIAL CORP        52.245995
