# Fuzzy-Lookup

In [27]:
# We import the libraries that we are going to use.

import numpy as np
import pandas as pd
from polyfuzz.models import TFIDF

## Using the Account Name parameter

### Cleansing the SF accounts

In [28]:
# Load the Salesforce (SF) accounts export and discard rows without an Account ID.

accounts_path = 'input/sf_accounts.csv'

df2 = (
    pd.read_csv(accounts_path, encoding='utf-8')
    .dropna(subset=['Account ID'])
)

In [29]:
# Pattern of everything that is stripped from a lower-cased name:
#   * any character that is not a-z, 0-9 or whitespace, and
#   * the listed suffix words, each matched with a leading space and a
#     trailing word boundary.

a = (
    r'[^a-z0-9\s]'
    r'| inc\b| plc\b| llc\b| ltd\b| llp\b'
    r'| limited\b| group\b| corporation\b| uk\b'
)

In [30]:
# Build 'Account Name Cleaned' from 'Account Name': lower-case the name, remove
# everything matched by the pattern `a`, and trim surrounding whitespace.
# Names that end up as an empty string are replaced with NaN.

df2['Account Name Cleaned'] = (
    df2['Account Name']
    .str.lower()
    .str.replace(a, '', regex=True)
    .str.strip()
    .replace('', np.nan)
)

In [31]:
# Keep a single row per cleaned account name:
#   1. drop rows whose cleaned name is NaN,
#   2. order by cleaned name, then by Contact Count (highest first),
#   3. keep the first row of every cleaned name.

df2_account = (
    df2
    .dropna(subset=['Account Name Cleaned'])
    .sort_values(by=['Account Name Cleaned', 'Contact Count'], ascending=[True, False])
    .drop_duplicates(subset='Account Name Cleaned', keep='first')
)

In [32]:
# Cleaned SF account names as a plain Python list.

df2_account_list = df2_account.loc[:, 'Account Name Cleaned'].to_list()

### Cleansing the external companies

In [33]:
# Load the external company data.

ext_data_path = 'input/ext_data.csv'

df3 = pd.read_csv(filepath_or_buffer=ext_data_path, encoding='utf-8')


In [34]:
# Build 'Company Name Cleaned' from 'Company Name': lower-case the name, remove
# everything matched by the pattern `a`, and trim surrounding whitespace.
# Missing (NaN) names are filled with a single space; unlike the SF accounts,
# empty strings are NOT converted to NaN here.

df3['Company Name Cleaned'] = (
    df3['Company Name']
    .str.lower()
    .str.replace(a, '', regex=True)
    .str.strip()
    .fillna(' ')
)

In [35]:
# Distinct cleaned company names (first occurrence kept) as a plain Python list.

df3_company_list = (
    df3['Company Name Cleaned']
    .drop_duplicates(keep='first')
    .tolist()
)

In [36]:
# Matcher: the TF-IDF model from the PolyFuzz library.
# Parameters were changed from the defaults so it can be used on large data
# (per the original author's note).
model = TFIDF(
    n_gram_range=(2, 4),
    clean_string=True,
    cosine_method='sparse',
    top_n=1,
)

In [37]:
# Main part of the fuzzy search: match the cleaned external company names
# (df3_company_list) against the cleaned SF account names (df2_account_list)
# and keep the model's output in `result`.
result = model.match(df3_company_list, df2_account_list)

In [38]:
# Keep only the matches with a similarity above 0 and rename the matcher's
# 'From' / 'To' columns to the cleaned-name columns used by the other frames.

old_account = (
    result[result['Similarity'] > 0]
    .reset_index(drop=True)
    .rename(columns={'From': 'Company Name Cleaned', 'To': 'Account Name Cleaned'})
)

In [39]:
# Flag every match whose similarity is at least 0.98.
old_account['T/F'] = old_account['Similarity'].ge(0.98)


---

In [40]:
# string_based: True where the cleaned account name equals the cleaned company name.
# len: number of characters in the cleaned company name.
old_account['string_based'] = old_account['Account Name Cleaned'].eq(old_account['Company Name Cleaned'])
old_account['len'] = old_account['Company Name Cleaned'].map(len)

In [41]:
# Order the matches by their T/F flag (False rows first) and renumber the index.
old_account = (
    old_account
    .sort_values(by='T/F')
    .reset_index(drop=True)
)

In [42]:
# Write the flagged matches to disk (UTF-8 with BOM, no index column).
old_account.to_csv(
    'output/old_account_check.csv',
    encoding='utf-8-sig',
    index=False,
)

---

In [None]:
# NOTE(review): this ends the session at this point — presumably a deliberate
# stop so output/old_account_check.csv can be reviewed by hand before it is
# read back in by the next cell; confirm. A top-to-bottom run will not
# continue past this cell.
exit()

---

In [43]:
# Reload the match file written earlier in the notebook.
old_account_check_path = 'output/old_account_check.csv'
old_account = pd.read_csv(old_account_check_path)

In [44]:
# Split the matches by their T/F flag into flagged-True and flagged-False frames,
# each with a fresh 0..n-1 index.
# `drop` expects a boolean: the previous string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have dropped the index too).
old_account_T = old_account[old_account['T/F'] == True].reset_index(drop=True)
old_account_F = old_account[old_account['T/F'] == False].reset_index(drop=True)

In [45]:
# Attach the SF account columns to every flagged-True match
# (inner join on the cleaned account name).
inner_account_T = df2_account.merge(
    old_account_T,
    how='inner',
    on='Account Name Cleaned',
)

In [46]:
# Keep only the columns needed for the join with the external data.
inner_account_T = inner_account_T.loc[
    :, ['Account ID', 'Account Name', 'Company Name Cleaned', 'Website']
]

In [47]:
# Bring in the external company rows for each matched account (left join on the
# cleaned company name). Columns present on both sides get the suffixes
# '_SF' (account side) and '_ext' (external side).
inner_company_T = inner_account_T.merge(
    df3,
    how='left',
    on='Company Name Cleaned',
    suffixes=('_SF', '_ext'),
)

In [48]:
# Old accounts: companies that already exist as accounts in our system.
# Written as UTF-8 with BOM, without the index column.

inner_company_T.to_csv(
    'output/old_accounts.csv',
    encoding='utf-8-sig',
    index=False,
)

---

# Concatenation function

In [49]:
# Print the distinct matched Account IDs as comma-separated batches of up to
# 60 IDs, each followed by the character length of the joined string.
# The de-duplicated ID list is built once up front instead of being recomputed
# (twice) on every loop iteration; the printed output is unchanged.
unique_account_ids = inner_company_T['Account ID'].drop_duplicates().tolist()
for i in range(0, len(unique_account_ids), 60):
    concat = ','.join(unique_account_ids[i:i+60])
    print(concat, f" size :{len(concat)}")

0014O000025jmpPQAQ,0014O000025jmpVQAQ,0014O00002LCJ3TQAX,0014O0000293DmbQAE,0014O00002LCIGgQAP,0014O000025jnpuQAA,0014O00002LCJOvQAP,0011300001wB52uAAC,0011300001wDBRXAA4,0011B00002C04RqQAJ,001a000001RZDLaAAP,0014O00002LCIydQAH,0014O000025jmPtQAI,0011B000029kI0pQAE,0011300001irvqpAAA,001a000001TfuT1AAJ,0014O00002Ev0FEQAZ,0014O000025jqrRQAQ,0014O000025jo21QAA,001a000001RZDVtAAP,0014O00002LBq0kQAD,0014O000025jo6dQAA,0014O00002LEp0KQAT,0014O00002IeSM8QAN,0014O0000293ERYQA2,0014O000025jmv5QAA,0014O00002LEcwcQAD,0014O00002LCJNTQA5,001a000001RZDauAAH,0011300001dJ06jAAC,001a000001UAynbAAD,0014O000025jnptQAA,001a000001RZDcHAAX,0014O000025jq5sQAA,001a000001RZDcbAAH,0011300001qH5fyAAC,0011300001c0qYqAAI,0014O00002FfDQaQAN,0014O000025jmy2QAA,0011300001wDBuoAAG,0014O00002LEdlIQAT,0014O00002LCxRIQA1,0011B00002BWaquQAD,0014O00002LCzK6QAL,0014O000025jmzKQAQ,0014O00002LEwfrQAD,0014O00002LCHIaQAP,0014O00002917r4QAA,0014O000025jqpsQAA,0014O0000293JX5QAM,0011300001wDC3gAAG,0014O00002LEdTBQA1,001a000001Tf

---

In [50]:
# Original (uncleaned) company names of every matched external row.
inner_company_T_list = inner_company_T.loc[:, 'Company Name'].to_list()

In [51]:
# New accounts: external rows whose company name is not among the matched ones.
already_matched = df3['Company Name'].isin(inner_company_T_list)
new_accounts = df3.loc[~already_matched].reset_index(drop=True)

In [52]:
# Write the new accounts to disk (UTF-8 with BOM, no index column).
new_accounts.to_csv(
    'output/new_accounts.csv',
    encoding='utf-8-sig',
    index=False,
)