# Fuzzy-Lookup

In [53]:
# We import the libraries that we are going to use.

import numpy as np
import pandas as pd
from polyfuzz.models import TFIDF

## Using the Account Name parameter

### Cleansing the SF accounts

In [54]:
# Load the SF accounts file and discard every row that has no Account ID.

accounts_path = 'input/sf_accounts.csv'

df2 = pd.read_csv(accounts_path, encoding='utf-8')
df2 = df2.dropna(subset=['Account ID'])

In [55]:
# Regular expression describing what gets stripped from the names:
# every character that is not a lowercase ASCII letter, a digit or whitespace.
# It is applied after lowercasing, so uppercase letters are never removed.

# Disabled alternative that additionally strips common company suffixes:
#a = r'[^a-z0-9\s]| inc\b| plc\b| llc\b| ltd\b| llp\b| limited\b| group\b| corporation\b| uk\b'
a = r'[^a-z0-9\s]'

In [56]:
# Derive 'Account Name Cleaned' from 'Account Name':
#   1. lowercase the name,
#   2. delete every character matched by the pattern `a`,
#   3. trim leading/trailing whitespace,
#   4. turn names that end up empty into NaN.

df2['Account Name Cleaned'] = (
    df2['Account Name']
    .str.lower()
    .str.replace(a, '', regex=True)
    .str.strip()
    .replace('', np.nan)
)

In [57]:
# Keep one row per cleaned account name:
#   - rows without a cleaned name are dropped,
#   - rows are ordered by cleaned name (ascending) and Contact Count (descending),
#   - for each cleaned name only the first row in that order is kept, i.e. the
#     one with the highest Contact Count.

df2_account = (
    df2
    .dropna(subset=['Account Name Cleaned'])
    .sort_values(by=['Account Name Cleaned', 'Contact Count'], ascending=[True, False])
    .drop_duplicates(subset='Account Name Cleaned', keep='first')
)

In [58]:
# Plain Python list of the (already de-duplicated) cleaned account names.

df2_account_list = df2_account.loc[:, 'Account Name Cleaned'].tolist()

### Cleansing the external companies

In [59]:
# Load the file with the external companies.

ext_data_path = 'input/ext_data.csv'

df3 = pd.read_csv(filepath_or_buffer=ext_data_path, encoding='utf-8')


In [60]:
# Derive 'Company Name Cleaned' from 'Company Name':
#   1. lowercase the name,
#   2. delete every character matched by the pattern `a`,
#   3. trim leading/trailing whitespace,
#   4. fill missing values with a single space, so the column holds no NaN.
# Note: unlike the SF account cleaning, empty strings are NOT converted to NaN here.

df3['Company Name Cleaned'] = (
    df3['Company Name']
    .str.lower()
    .str.replace(a, '', regex=True)
    .str.strip()
    .fillna(' ')
)

In [61]:
# Plain Python list of the distinct cleaned company names, in order of first appearance.

df3_company_list = (
    df3['Company Name Cleaned']
    .drop_duplicates(keep='first')
    .tolist()
)

In [62]:
# TF-IDF matcher from the PolyFuzz library.
# The parameters below were chosen for use on large data.
model = TFIDF(
    n_gram_range=(1, 4),
    clean_string=True,
    cosine_method='sparse',
    top_n=1,
)

In [63]:
# Main step of the fuzzy lookup: run the model with the external company names
# as the first list and the SF account names as the second list, and keep the
# returned match table in `result`.
result = model.match(df3_company_list, df2_account_list)
# Reverse direction (SF account names as the first list), left disabled:
#result = model.match(df2_account_list, df3_company_list)

In [64]:
# Keep only the matches whose similarity is strictly positive, renumber the rows,
# and rename the match columns after the name lists they came from.

old_account = (
    result.loc[result['Similarity'] > 0]
    .reset_index(drop=True)
    .rename(columns={'From': 'Company Name Cleaned', 'To': 'Account Name Cleaned'})
)

In [65]:
# Flag each match: True when its similarity is at least 0.98, False otherwise.
# The comparison already yields booleans, so no np.where(..., True, False) wrapper is needed.
old_account['T/F'] = old_account['Similarity'] >= 0.98

---

In [66]:
# 'string_based': True when the cleaned account name and the cleaned company name
# are exactly the same string.
old_account['string_based'] = old_account['Account Name Cleaned'] == old_account['Company Name Cleaned']
# 'len': number of characters in the cleaned company name (vectorised string
# accessor instead of a per-row Python lambda).
old_account['len'] = old_account['Company Name Cleaned'].str.len()

In [67]:
# Order the matches by the 'T/F' flag (False rows first) and renumber the rows.
old_account = (
    old_account
    .sort_values('T/F')
    .reset_index(drop=True)
)

In [68]:
# Export the match table (without the index) as UTF-8 with BOM.
old_account.to_csv(
    'output/old_account_check.csv',
    encoding='utf-8-sig',
    index=False,
)

---

In [None]:
# Checkpoint: stop the run here, presumably so that
# output/old_account_check.csv can be reviewed before it is read back below.
# SystemExit is raised directly instead of calling exit(): inside a notebook
# exit() shuts the kernel down, which would discard df2_account and df3 that
# the cells further down still use. Raising SystemExit halts execution while
# leaving the kernel (and its variables) alive; in a plain script it ends the
# program exactly as exit() did.
raise SystemExit

---

In [69]:
# Read the match table back from output/old_account_check.csv.
# The file is written with encoding 'utf-8-sig' (UTF-8 with BOM), so the same
# encoding is used here to make sure the BOM never ends up in the first column name.
old_account = pd.read_csv('output/old_account_check.csv', encoding='utf-8-sig')

In [70]:
# Split the matches by their 'T/F' flag and renumber each part.
# `drop` must be the boolean True: the previous string 'True' only worked because
# any non-empty string is truthy (the string 'False' would have dropped the index too).
old_account_T = old_account[old_account['T/F'] == True].reset_index(drop=True)
old_account_F = old_account[old_account['T/F'] == False].reset_index(drop=True)

In [71]:
# Attach the flagged-True matches to the de-duplicated SF accounts; only
# accounts whose cleaned name appears in a True match are kept (inner join).
inner_account_T = df2_account.merge(
    old_account_T,
    how='inner',
    on='Account Name Cleaned',
)

In [72]:
# Keep only the account identifiers plus the matched cleaned company name.
inner_account_T = inner_account_T.loc[
    :, ['Account ID', 'Account Name', 'Company Name Cleaned', 'Website']
]

In [73]:
# Bring in the external company rows for every matched account (left join on
# the cleaned company name). Columns present on both sides get the suffixes
# '_SF' (account side) and '_ext' (external side).
inner_company_T = inner_account_T.merge(
    df3,
    how='left',
    on='Company Name Cleaned',
    suffixes=('_SF', '_ext'),
)

In [74]:
# Export the accounts that already exist in our system ("old" accounts),
# without the index, as UTF-8 with BOM.

inner_company_T.to_csv(
    'output/old_accounts.csv',
    encoding='utf-8-sig',
    index=False,
)

---

# Concatenation function

In [75]:
# Print the distinct Account IDs as comma-separated batches of at most 60 IDs,
# each batch followed by the character length of the joined string.
# The de-duplicated ID list is built once up front instead of being recomputed
# for the range bound and again on every iteration.
account_ids = inner_company_T['Account ID'].drop_duplicates().tolist()
for i in range(0, len(account_ids), 60):
    concat = ','.join(account_ids[i:i+60])
    print(concat, f" size :{len(concat)}")

0014O0000292zu2QAA,0014O00002EthLXQAZ,0014O0000293G8qQAE,0014O00002LBqAoQAL,0014O00002LCLNzQAP,0014O00002LEf4hQAD,001a000001Tfu8bAAB,0014O00002LEfTFQA1,0014O00002927p6QAA,0014O0000293HozQAE,0014O000029381RQAQ,0014O0000293DhgQAE,0011300001wDBw8AAG,0014O0000292jp2QAA,0014O0000293DSiQAM,0014O0000291HLTQA2,0014O00002938ibQAA,0014O00002IeRJRQA3,0014O000029381EQAQ,0014O0000293HPIQA2,0014O0000292zABQAY,0014O0000292xhMQAQ,0014O00002917oGQAQ,0014O0000292xQxQAI,0014O00002927r8QAA,0014O000029358iQAA,0014O00002932vuQAA,0014O0000292zTOQAY,0014O0000291HKhQAM,0014O0000293HvYQAU,0014O00002LEf5bQAD,0014O00002UD4zVQAT,0014O00002LEhSbQAL,0014O0000293L50QAE,0014O00002OuTODQA3,0014O0000293DTlQAM  size :683


---

In [76]:
# Company names (as they appear in the merged table) that were matched to an existing account.
inner_company_T_list = inner_company_T.loc[:, 'Company Name'].tolist()

In [77]:
# External companies whose name is not among the matched ones are the new accounts.
already_matched = df3['Company Name'].isin(inner_company_T_list)
new_accounts = df3.loc[~already_matched].reset_index(drop=True)

In [78]:
# Export the new accounts (without the index) as UTF-8 with BOM.
new_accounts.to_csv(
    'output/new_accounts.csv',
    encoding='utf-8-sig',
    index=False,
)