# Fuzzy-Lookup

In [69]:
# We import the libraries that we are goin to use. 

import numpy as np
import pandas as pd
from polyfuzz.models import TFIDF

## Using the Account Name parameter

### Cleansing the SF accounts

In [70]:
# We read the file that contains the SF Accounts.

accounts_path = 'input/sf_accounts.csv'

df2 = pd.read_csv(accounts_path).dropna(subset=['Account ID'])

In [71]:
df2.head()

Unnamed: 0,Account Name,Website,Contact Count,Account ID
0,BH Live,www.bhlive.org.uk,2.0,0011300001wDBbT
1,Future Tech Consultants,ftcny.com,0.0,0014O00002Euzlw
2,Gardiner & Theobald,gardiner.com,1.0,0014O00002LDdDI
3,When I Work,www.wheniwork.com,16.0,0014O0000293FrM
4,United Incentives,www.unitedincentives.com,1.0,0014O0000293HCU


In [72]:
# We stablish the strange symbols and patters that we are going to remove. 

a = '[^a-z0-9\s]| inc| plc| llc| ltd| llp| limited| group| corporation| uk'

In [73]:
# We create a new Account Name column to clean it.
# We remove those symbols and patters from the Account Name Cleaned column.
# We replace the empty strings with NaN values in the Account Name Cleaned column.

df2['Account Name Cleaned'] = df2['Account Name']
df2['Account Name Cleaned'] = df2['Account Name Cleaned'].str.lower().str.replace(a, '', regex=True).str.strip()
df2['Account Name Cleaned'] = df2['Account Name Cleaned'].replace('', np.nan)

In [74]:
# We create a DataFrame without NaN values in the Account Name Cleaned column. 
# We sort by Account Name Cleaned and Contact Count.
# We drop duplicates in the Account Name Cleaned column and we keep the first one. 

df2_account = df2.dropna(subset=['Account Name Cleaned'])
df2_account = df2_account.sort_values(by=['Account Name Cleaned', 'Contact Count'], ascending=[True, False])
df2_account = df2_account.drop_duplicates(subset='Account Name Cleaned', keep='first')

In [75]:
# We create a list with the Account Name Cleaned values. 

df2_account_list = df2_account['Account Name Cleaned'].tolist()

### Cleansing the external companies

In [76]:
# We read the file that contains the external accounts. 

ext_data_path = 'input/ext_data.csv'

df3 = pd.read_csv(ext_data_path, encoding="latin-1")


In [77]:
# We create a new Account Name column to clean it.
# We remove those symbols and patters from the Account Name Cleaned column.
# We replace the empty strings with NaN values in the Account Name Cleaned column.

df3['Company Name Cleaned'] = df3['Company Name']
df3['Company Name Cleaned'] = df3['Company Name Cleaned'].str.lower().str.replace(a, '', regex=True).str.strip()
df3['Company Name Cleaned'] = df3['Company Name Cleaned'].fillna(' ')

In [78]:
df3.head()

Unnamed: 0,Prority,Status,Account Manager,Company Name,Vertical,Turnover (000),Employee,Customer,Key Prospects,Company Name Cleaned
0,,New Bus,Daniel Willacy,Arrow XL,Logistics,481505,1136,,,arrow xl
1,,New Bus,Daniel Willacy,Aryzta Bakeries UK,FMCG,834000,2394,Yes,,aryzta bakeries
2,Priority 1,New Bus,Daniel Willacy,Burtons Foods Group,FMCG,232791,1771,Yes,Yes,burtons foods
3,Priority 1,New Bus,Daniel Willacy,Certegy,Finance,No Data,No Data,Yes,Yes,certegy
4,,New Bus,Daniel Willacy,Delice de France,Retail/ Food,72487849,338,,,delice de france


In [79]:
# We create a list with the Account Name Cleaned values and we drop the duplicates. 

df3_company_list = df3.drop_duplicates(subset='Company Name Cleaned', keep='first')['Company Name Cleaned'].tolist()

In [80]:
df3_company_list[:5]

['arrow xl', 'aryzta bakeries', 'burtons foods', 'certegy', 'delice de france']

In [81]:
# We use the model 'TF-IDF' from the PolyFuzz library.
# Changing Parameters to be used on large Data
model = TFIDF(n_gram_range=(2,2), clean_string=True, min_similarity=0.95, cosine_method='sparse', top_n=1)

In [82]:
#Saving results of model
# Main part of the Fuzzy search
result = model.match(df3_company_list, df2_account_list)

---

In [83]:
result.head()

Unnamed: 0,From,To,Similarity
0,arrow xl,arrow xl,1.0
1,aryzta bakeries,,0.0
2,burtons foods,,0.0
3,certegy,,0.0
4,delice de france,,0.0


In [84]:
# We create a DataFrame that contains similarity > .95

old_account = result[result['Similarity'] > .95].reset_index(drop=True)
old_account = old_account.rename({'From': 'Account Name Cleaned', 'To': 'Company Name Cleaned'}, axis=1)

In [85]:
# old_account['T/F'] = old_account['Account Name Cleaned'] == old_account['Company Name Cleaned']

old_account['T/F'] = np.where(old_account['Similarity'] >= 0.98, True, False)


---

In [86]:
old_account['string_based'] = old_account['Account Name Cleaned'] == old_account['Company Name Cleaned']
old_account['len'] = old_account['Company Name Cleaned'].apply(lambda x: len(x))

---

In [87]:
old_account = old_account.sort_values(by=['T/F'], ascending=True).reset_index(drop=True)

In [88]:
old_account.to_csv('output/old_account_check.csv', index=False)

exit()

In [89]:
old_account = pd.read_csv('output/old_account_check.csv')

---

In [90]:
old_account.loc[(old_account['T/F'] != old_account['string_based']) & (old_account['len'] <= 20)].sort_values(by='len')

Unnamed: 0,Account Name Cleaned,Company Name Cleaned,Similarity,T/F,string_based,len
6,bellway p l c,bellway,1.0,True,False,7


In [91]:
old_account_T = old_account[old_account['T/F'] == True].reset_index(drop='True')
old_account_F = old_account[old_account['T/F'] == False].reset_index(drop='True')

In [92]:
old_account_T.head()

Unnamed: 0,Account Name Cleaned,Company Name Cleaned,Similarity,T/F,string_based,len
0,arrow xl,arrow xl,1.0,True,True,8
1,brother international europe,brother international europe,1.0,True,True,28
2,bellway p l c,bellway,1.0,True,False,7
3,auto trader,auto trader,1.0,True,True,11
4,adidas uk,adidas uk,1.0,True,True,9


In [93]:
inner_account_T = pd.merge(df2_account,
                      old_account_T,
                      on = 'Account Name Cleaned',
                      how = 'inner',)
                      #indicator = True).drop(columns = '_merge')

In [94]:
inner_account_T = inner_account_T[['Account ID', 'Account Name', 'Company Name Cleaned', 'Website']]

In [95]:
inner_company_T = pd.merge(inner_account_T,
                      df3,
                      on = 'Company Name Cleaned',
                      how = 'inner',
                      indicator = True).drop(columns = '_merge')

In [96]:
# Accounts that we have in our system

inner_company_T.to_csv('output/old_accounts.csv') #Old accounts

---

# Concatenation function

In [97]:
# Getting the list of ID's
for i in range(0, len(inner_company_T['Account ID']), 60):
   concat = ','.join(inner_company_T['Account ID'].tolist()[i:i+60])
   print(concat, f" size :{len(concat)}")

0011300001wDBNf,001a000001RZDIE,0011300001wDBPR,0011300001wDBQH,0014O00002FeVRf,001a000001RZDO3,001a000001RZDPL,0011300001uV2cg,0014O000025jmPg,0014O00002LBXVV,001a000001XXLVB,001a000001RZDXJ,001a000001RZDZE,001a000001U9xv9,0014O0000293CUA,001a000001RZDbo,0014O00002LD8pc,0014O00002LD7mX,0014O00002FfDFw,0014O00002LCNN0,0014O000025jo8E,001a000001RZDhL,0011300001wDBz8,0014O000029358B,001a000001TfszS,001a000001RZDpM,0011300001wDC21,0011300001wDC2h,0011300001wDC3x,0014O00002K70mK,0014O000025jpl0,001a000001RZDmB,0011300001wDC6z,0014O00002LDLrL,0011300001wDC7z,0011300001wDC9E,001a000001RZDoG,0014O000025jnI1,0014O000025kB45,001a000001RZDqU,0014O0000293CfL,001a000001U9y36,001a000001RZDuH,001a000001XXLXb,0011300001wDCQB,0014O00002920yl,001a000001TfuEA,001a000001RZDwA,0014O00002LDdDA,0014O000025jn9Y,001a000001RZDxm,001a000001U9y6K,0014O00002LBpxa,0011300001wDDc9,001a000001RZDzR,001a000001RZDzT,0014O000025jpat,001a000001U9y7V,001a000001Tfuam,0014O00002FeJID  size :959
001a000001U9y7W,001a000001RZE

---

In [98]:
inner_company_T_list = inner_company_T['Company Name'].tolist()

In [99]:
new_accounts = df3[~df3['Company Name'].isin(inner_company_T_list)].reset_index(drop=True)

In [100]:
new_accounts.to_csv('/mnt/d/rafa_code/fuzzy-search/output/new_accounts.csv')

## Using the Website parameter



### Cleansing the old-new DataFrame