# Fuzzy Matching

In [1]:
# ! py -m pip install thefuzz

Defaulting to user installation because normal site-packages is not writeable
Collecting thefuzz
  Downloading thefuzz-0.22.1-py3-none-any.whl (8.2 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0
  Downloading rapidfuzz-3.6.1-cp39-cp39-win_amd64.whl (1.6 MB)
     ---------------------------------------- 1.6/1.6 MB 6.1 MB/s eta 0:00:00
Installing collected packages: rapidfuzz, thefuzz
Successfully installed rapidfuzz-3.6.1 thefuzz-0.22.1


You should consider upgrading via the 'C:\Program Files (x86)\Microsoft Visual Studio\Shared\Python39_64\python.exe -m pip install --upgrade pip' command.


In [1]:
import pandas as pd
import thefuzz
from thefuzz import fuzz
from thefuzz import process

In [29]:
'''
Input File is expected to have columns
TABLENAME
COLNAME
PK_ordinal_position
If multiple systems are present, set system flag to True and Input file must have column
SYSTEM
'''
inputFilename = 'Asset Suite Master Glossary.xlsx'
system_flag = False
ratio_cut = 100

In [30]:
fields = pd.read_excel(inputFilename, usecols=['TABLENAME', 'COLNAME', 'PK_ordinal_position']) # TODO accommodate SYSTEM column

In [31]:
fields

Unnamed: 0,TABLENAME,COLNAME,PK_ordinal_position
0,TIDACATT,AR_TYPE,1
1,TIDACATT,CODE_CATEGORY,2
2,TIDACATT,TEMPLATE_NAME,3
3,TIDACATT,GEN_ARG,0
4,TIDACATT,TIME_STAMP,0
...,...,...,...
2939,TIDXREFR,RELATED_ACTION,0
2940,TIDXREFR,LAST_UPDATED_BY,0
2941,TIDXREFR,LAST_UPDATED_DATE,0
2942,TIDXREFR,GEN_ARG,0


In [32]:
fields["match results"] = fields['COLNAME'].apply(lambda x: process.extract(x, fields['COLNAME'], scorer=fuzz.ratio))

In [33]:
fields

Unnamed: 0,TABLENAME,COLNAME,PK_ordinal_position,match results
0,TIDACATT,AR_TYPE,1,"[(AR_TYPE, 100, 0), (AR_TYPE, 100, 42), (AR_TY..."
1,TIDACATT,CODE_CATEGORY,2,"[(CODE_CATEGORY, 100, 1), (CODE_CATEGORY, 100,..."
2,TIDACATT,TEMPLATE_NAME,3,"[(TEMPLATE_NAME, 100, 2), (TEMPLATE_NAME , 10..."
3,TIDACATT,GEN_ARG,0,"[(GEN_ARG, 100, 3), (GEN_ARG, 100, 9), (GEN_AR..."
4,TIDACATT,TIME_STAMP,0,"[(TIME_STAMP, 100, 4), (TIME_STAMP, 100, 10), ..."
...,...,...,...,...
2939,TIDXREFR,RELATED_ACTION,0,"[(RELATED_ACTION , 100, 251), (RELATED_ACTION..."
2940,TIDXREFR,LAST_UPDATED_BY,0,"[(LAST_UPDATED_BY, 100, 8), (LAST_UPDATED_BY, ..."
2941,TIDXREFR,LAST_UPDATED_DATE,0,"[(LAST_UPDATED_DATE, 100, 7), (LAST_UPDATED_DA..."
2942,TIDXREFR,GEN_ARG,0,"[(GEN_ARG, 100, 3), (GEN_ARG, 100, 9), (GEN_AR..."


In [34]:
def search_term(df, term, regex=False):
    results_df = df[(df['Match Name'].str.contains(term,case=False,regex=regex))|(df['Column Name'].str.contains(term,case=False, regex=regex))]
    return results_df

In [35]:
def id_matches(results_list):
    new_list = []
    for result in results_list:
        new_tup = (fields.iloc[result[2]]['TABLENAME'], result[0],fields.iloc[result[2]]['PK_ordinal_position']) #Match table name, Match column name, match pk
        if result[1] >= ratio_cut:
            new_list.append(new_tup)

    return new_list

In [36]:
fields['table,col,pk'] = fields['match results'].apply(id_matches)

In [37]:
fields = fields.drop(columns=['match results'])

In [39]:
fields.to_excel('Asset Suite Column Matches.xlsx', index=False)

In [32]:
def expand_matches(df):
    new_rows = []
    for index, row in df.iterrows():
        for result in row['match results']:
            new_row = {'System':row['System'],
                    'Table Name':row['TABLENAME'],
                    'Column Name':row['COLNAME'],
                    'Match Name':result[0],
                    'Match Ratio':result[1],
                    'Match System':fields.iloc[result[2]]['System'],
                    'Match Table':fields.iloc[result[2]]['TABLENAME']}
            new_rows.append(new_row)
    new_df = pd.DataFrame(new_rows, columns=['System','TABLENAME','COLNAME','Match Name','Match Ratio','Match System','Match Table'])
    new_df = new_df[(new_df['System'] != new_df['Match System'])]
    return new_df
        
def search_term(df, term, regex=False):
    results_df = df[(df['Match Name'].str.contains(term,case=False,regex=regex))|(df['COLNAME'].str.contains(term,case=False, regex=regex))]
    return results_df

In [30]:
fields = expand_matches(fields)

In [33]:
id_num = search_term(fields, 'num|id', regex=True)

In [34]:
id_num

Unnamed: 0,System,Table Name,Column Name,Match Name,Match Ratio,Match System,Match Table
896,Empower,Userr,AdminUser,RDBMSUSERID,60,P6,GLOBALSECURITY
897,Empower,Userr,AdminUser,UDFNUMBER,56,P6,UDFVALUE
926,Empower,Userr,AiRoleID,ROLEID,86,P6,RESOURCEASSIGNMENT
934,Cobra,CALCDETL,ALIAS,CalID,60,Empower,Calendar
1071,Cobra,CAWP,APPLINK,ApprID,62,Empower,ai
...,...,...,...,...,...,...,...
21099,Empower,EuwtLink,WeightID,WEIGHT,86,P6,PROJECTCODETYPE
21102,Empower,Weight,WeightID,WEIGHT,86,P6,ACTIVITYSTEP
21103,Empower,Weight,WeightID,WEIGHT,86,P6,PROJECTCODE
21104,Empower,Weight,WeightID,WEIGHT,86,P6,PROJECTCODETYPE
