# Fuzzy-Lookup

In [2]:
# We import the libraries that we are going to use.

import numpy as np
import pandas as pd
from polyfuzz.models import TFIDF

## Using the Account Name parameter

### Cleansing the SF accounts

In [3]:
# Load the SF accounts export and discard every row that has no 'Account ID'.

accounts_path = 'input/sf_accounts.csv'

df2 = (
    pd.read_csv(accounts_path, encoding='Latin-1')
    .dropna(subset=['Account ID'])
)

In [5]:
# Pattern of everything that is removed from a lower-cased name in a single pass:
#   * any character that is not a-z, 0-9 or whitespace (punctuation, symbols, ...);
#   * a legal-form / filler word (inc, plc, llc, ...) that is preceded by a space.
# The trailing \b restricts the second case to whole words, so names such as
# "incorporated", "ukraine" or "groups" are no longer cut in the middle.
# A raw string is used so that \s and \b reach the regex engine unchanged.

a = r'[^a-z0-9\s]| (?:inc|plc|llc|ltd|llp|limited|group|corporation|uk)\b'

In [6]:
# Derive 'Account Name Cleaned' from 'Account Name':
# lower-case the text, strip everything matched by pattern `a`, trim the ends,
# and turn names that end up empty into NaN.

df2['Account Name Cleaned'] = (
    df2['Account Name']
    .str.lower()
    .str.replace(a, '', regex=True)
    .str.strip()
    .replace('', np.nan)
)

In [7]:
# One row per cleaned account name:
#   1. rows without a cleaned name are discarded;
#   2. rows are ordered by cleaned name (A-Z) and, within a name, by 'Contact Count' (highest first);
#   3. only the first row of each cleaned name is kept, i.e. the one with the highest 'Contact Count'.

df2_account = (
    df2
    .dropna(subset=['Account Name Cleaned'])
    .sort_values(by=['Account Name Cleaned', 'Contact Count'], ascending=[True, False])
    .drop_duplicates(subset='Account Name Cleaned', keep='first')
)

In [8]:
# Plain Python list of the (unique) cleaned account names.

df2_account_list = df2_account['Account Name Cleaned'].to_list()

### Cleansing the external companies

In [9]:
# Load the file with the external companies.

ext_data_path = 'input/ext_data.csv'

df3 = pd.read_csv(
    ext_data_path,
    encoding="Latin1",
)


In [10]:
# Derive 'Company Name Cleaned' from 'Company Name':
# lower-case the text, strip everything matched by pattern `a` and trim the ends.
# Missing names (NaN) are replaced by a single space, so the column holds no NaN.

df3['Company Name Cleaned'] = (
    df3['Company Name']
    .str.lower()
    .str.replace(a, '', regex=True)
    .str.strip()
    .fillna(' ')
)

In [12]:
# Plain Python list of the distinct cleaned company names (first occurrence order).

df3_company_list = df3['Company Name Cleaned'].drop_duplicates(keep='first').tolist()

In [182]:
# Matcher: the 'TF-IDF' model of the PolyFuzz library.
# The parameters below were changed so the model can be used on large data.
# NOTE(review): top_n=1 presumably keeps only the best candidate per name - confirm in the PolyFuzz docs.
model = TFIDF(
    n_gram_range=(2, 4),
    clean_string=True,
    cosine_method='sparse',
    top_n=1,
)

In [183]:
# Main part of the fuzzy search: the cleaned external company names are matched
# against the cleaned SF account names and the outcome is stored in `result`.
# The cells that follow read the 'From', 'To' and 'Similarity' columns of this table.
result = model.match(df3_company_list, df2_account_list)

In [186]:
# Keep only the matches with a similarity above 0 and give the 'From' / 'To'
# columns the names used in the company and account tables.

old_account = (
    result.loc[result['Similarity'] > 0]
    .reset_index(drop=True)
    .rename(columns={'From': 'Company Name Cleaned', 'To': 'Account Name Cleaned'})
)

In [187]:
# Flag a match as True when its similarity is at least 0.98, otherwise False.
old_account['T/F'] = old_account['Similarity'].ge(0.98)


---

In [188]:
# 'string_based': True when both cleaned names are exactly the same string.
# 'len': number of characters of the cleaned company name.
old_account['string_based'] = old_account['Account Name Cleaned'].eq(old_account['Company Name Cleaned'])
old_account['len'] = old_account['Company Name Cleaned'].map(len)

In [189]:
# Order the table by the 'T/F' flag (False rows first) and renumber the rows.
old_account = (
    old_account
    .sort_values(by='T/F', ascending=True)
    .reset_index(drop=True)
)

In [190]:
# Write the match table to disk (without the index column).
old_account.to_csv(path_or_buf='output/old_account_check.csv', index=False)

---

In [None]:
# Checkpoint: execution stops here (presumably so that
# output/old_account_check.csv can be checked before going on - confirm).
# exit() would shut the kernel down and discard df2_account and df3, which the
# cells below still need. Raising SystemExit also halts "Run All", but the
# kernel and its variables stay alive, so the remaining cells can be run afterwards.
raise SystemExit("Stopped on purpose: check output/old_account_check.csv, then run the cells below.")

---

In [191]:
# Read the match table back from the file written earlier in the notebook.
old_account = pd.read_csv(filepath_or_buffer='output/old_account_check.csv')

In [193]:
# Split the match table by its 'T/F' flag: accepted matches (True) and rejected ones (False).
# `drop` must be the boolean True; the string 'True' only worked because any
# non-empty string is truthy.
old_account_T = old_account[old_account['T/F'] == True].reset_index(drop=True)
old_account_F = old_account[old_account['T/F'] == False].reset_index(drop=True)

In [194]:
# Inner join of the SF accounts with the accepted matches on the cleaned account
# name: only accounts that have an accepted match remain.
inner_account_T = df2_account.merge(
    old_account_T,
    how='inner',
    on='Account Name Cleaned',
)

In [196]:
# Keep only the columns needed for the next join and the export.
inner_account_T = inner_account_T.loc[
    :, ['Account ID', 'Account Name', 'Company Name Cleaned', 'Website']
]

In [197]:
# Left join of the matched accounts with the external data on the cleaned company name.
# Columns present in both tables get the suffix '_SF' (account side) or '_ext' (external side).
inner_company_T = inner_account_T.merge(
    df3,
    how='left',
    on='Company Name Cleaned',
    suffixes=('_SF', '_ext'),
)

In [199]:
# Export the "old" accounts, i.e. the accounts that we already have in our system.

inner_company_T.to_csv(path_or_buf='output/old_accounts.csv', index=False)

---

# Concatenation function

In [200]:
# Print the unique Account IDs as comma-separated batches of at most `batch_size` IDs.
# After each batch the length of the joined string (in characters) is printed.
batch_size = 60
# The unique IDs do not change inside the loop, so they are computed only once.
unique_account_ids = inner_company_T['Account ID'].drop_duplicates().tolist()
for i in range(0, len(unique_account_ids), batch_size):
    concat = ','.join(unique_account_ids[i:i + batch_size])
    print(concat, f" size :{len(concat)}")

0018c00002E2XpDAAV,0018c00002E2Xp6AAF,0018c00002E2XpjAAF,0018c00002E2XpCAAV,0018c00002E2XpgAAF,0018c00002E2XpfAAF,0018c00002E2Xo4AAF,0018c00002E2XodAAF,0018c00002E2XpnAAF,0018c00002E2XqHAAV,0018c00002E2XoyAAF,0018c00002E2Xq0AAF,0018c00002E2Xq9AAF,0018c00002E2XoQAAV,0018c00002E2Xp2AAF,0018c00002E2XonAAF,0018c00002E2XowAAF,0018c00002E2XorAAF,0018c00002E2XoOAAV,0018c00002E2XofAAF,0018c00002E2XoJAAV,0018c00002E2Xq8AAF,0018c00002E2XpxAAF,0018c00002E2Xp1AAF,0018c00002E2Xp5AAF,0018c00002E2XojAAF,0018c00002E2XoBAAV,0018c00002E2XplAAF,0018c00002E2XppAAF,0018c00002E2XpQAAV,0018c00002E2XosAAF,0018c00002E2XoTAAV,0018c00002E2XouAAF,0018c00002E2XpXAAV,0018c00002E2XoNAAV,0018c00002E2Xq5AAF,0018c00002E2XpZAAV,0018c00002E2XoSAAV,0018c00002E2XpyAAF,0018c00002E2XpoAAF,0018c00002E2XpmAAF,0018c00002E2Xo8AAF,0018c00002E2XpzAAF,0018c00002E2XoPAAV,0018c00002E2Xo9AAF,0018c00002E2XpOAAV,0018c00002E2XpbAAF,0018c00002E2Xp3AAF,0018c00002E2XohAAF,0018c00002E2XobAAF,0018c00002E2Xo6AAF,0018c00002E2XqEAAV,0018c00002E2

---

In [201]:
# Original company names of all matched records, as a plain list.
inner_company_T_list = inner_company_T['Company Name'].to_list()

In [202]:
# External companies whose name is not among the matched ones form the new accounts.
is_matched = df3['Company Name'].isin(inner_company_T_list)
new_accounts = df3.loc[~is_matched].reset_index(drop=True)

In [203]:
# Export the new accounts as UTF-8 encoded CSV (without the index column).
new_accounts.to_csv(
    path_or_buf='output/new_accounts.csv',
    index=False,
    encoding='utf-8',
)