# Fuzzy-Lookup

In [1]:
# Import the libraries used throughout the notebook.

import numpy as np
import pandas as pd
from polyfuzz.models import TFIDF

  return torch._C._cuda_getDeviceCount() > 0


## Using the Account Name parameter

### Cleansing the SF accounts

In [34]:
# Load the Salesforce (SF) account export.
# Rows without an Account ID are dropped right after loading.

accounts_path = 'input/sf_accounts.csv'

sf_accounts_raw = pd.read_csv(accounts_path)
df2 = sf_accounts_raw.dropna(subset=['Account ID'])

In [35]:
# Preview the first rows of the loaded SF accounts.
df2.head()

Unnamed: 0,Account Name,Website,Contact Count,Account ID
0,BH Live,www.bhlive.org.uk,2.0,0011300001wDBbT
1,Five Star Bank,www.five-starbank.com,3.0,0014O00002Euy8z
2,Nubank,www.nubank.com.br,1.0,0014O00002LDg27
3,KTH Parts Industries,www.kth.net,2.0,0014O0000293Dej
4,Waverley School,,2.0,0014O00002EtXjJ


In [36]:
# Regex of everything stripped from a (lower-cased) name before matching:
#   * any character that is not a-z, 0-9 or whitespace, and
#   * common legal/organisational suffixes preceded by a space.
# It is a raw string so that `\s` and `\b` reach the regex engine unchanged.
# The trailing `\b` makes the suffixes match whole words only; without it
# " inc" would also eat the start of "incorporated" and " uk" the start of
# "ukraine".

a = r'[^a-z0-9\s]| (?:inc|plc|llc|ltd|llp|limited|group|corporation|uk)\b'

In [37]:
# Build a normalised copy of Account Name for matching: lower-case it,
# remove everything matched by the pattern `a`, then trim surrounding
# whitespace. Names that end up as an empty string are stored as NaN.

normalised_account_names = (
    df2['Account Name']
    .str.lower()
    .str.replace(a, '', regex=True)
    .str.strip()
)
df2['Account Name Cleaned'] = normalised_account_names.replace('', np.nan)

In [38]:
# Keep one row per cleaned account name:
#   1. discard rows whose cleaned name is NaN,
#   2. order by cleaned name, then by Contact Count descending,
#   3. keep the first row of each cleaned name, i.e. the one with the
#      highest Contact Count.

df2_account = (
    df2
    .dropna(subset=['Account Name Cleaned'])
    .sort_values(by=['Account Name Cleaned', 'Contact Count'], ascending=[True, False])
    .drop_duplicates(subset='Account Name Cleaned', keep='first')
)

In [39]:
# Plain Python list of the de-duplicated cleaned account names; this is the
# "from" side handed to the matcher.

df2_account_list = df2_account['Account Name Cleaned'].to_list()

### Cleansing the external companies

In [40]:
# Load the external company/contact data. The file is decoded as latin-1.

ext_data_path = 'input/ext_data.csv'

df3 = pd.read_csv(
    ext_data_path,
    encoding="latin-1",
)


In [41]:
# Build a normalised copy of Company Name for matching: lower-case it,
# remove everything matched by the pattern `a`, then trim surrounding
# whitespace.
# Unlike the account side, missing names are filled with a single space
# (' ') instead of NaN, and names that end up empty stay as ''.

normalised_company_names = (
    df3['Company Name']
    .str.lower()
    .str.replace(a, '', regex=True)
    .str.strip()
)
df3['Company Name Cleaned'] = normalised_company_names.fillna(' ')

In [42]:
# Preview the first rows of the external data, including the cleaned name.
df3.head()

Unnamed: 0,Company Name,First Name,Last Name,Job Title,Email,Swichboard,Mobile,LinkedIn URL,Location,Tag,...,Direct Line,Anual Revenue,Ownership Type,Company City,Company State,Company Zip Code,Company Country,company_products_services,company_description,Company Name Cleaned
0,Oatly,Anne-Fleur,Jansen,Sales Manager Benelux,Anne-Fleur.Jansen@oatly.com,+44 (0)800 22881234,,www.linkedin.com/in/anne-fleur-jansen-760b8123/,"Amsterdam, North Holland, Netherlands",Global Client Climate ai,...,,,,,,,,,,oatly
1,Natures Way Foods,Sam,Mitchell,Marketing Executive,Sam.Mitchell@natureswayfoods.com,+44 1243 603111,+44 7710 636453,www.linkedin.com/in/sam-mitchell-9842b2126/,"Chichester, England, United Kingdom",Global Client Climate ai,...,,,,,,,,,,natures way foods
2,Anb Investments,Viresh,Ramburan,Group Chief Executive and Director,viresh@anbinvestments.co.za,+27 (15) 345 1650,,www.linkedin.com/in/viresh-ramburan-88516512/,"City of Cape Town, Western Cape, South Africa",Global Client Climate ai,...,,,,,,,,,,anb investments
3,Anb Investments,Jan-Willem,Boonzaaier,Operations Executive,jan-willem@anbinvestments.co.za,+27 (15) 345 1650,+27 82 558 8444,www.linkedin.com/in/janwillemboonzaaier/,"City of Cape Town, Western Cape, South Africa",Global Client Climate ai,...,,,,,,,,,,anb investments
4,BOHMER,Bob,Lu,GENERAL MANAGER,Bob.Lu@boehmer.com,+49 (0) 2324 7001-0,,www.linkedin.com/in/bob-lu-07324642/,"Shanghai, China",Global Client Climate ai,...,,,,,,,,,,bohmer


In [11]:
# Unique cleaned company names, in order of first appearance; this is the
# "to" side handed to the matcher.

df3_company_list = (
    df3['Company Name Cleaned']
    .drop_duplicates(keep='first')
    .tolist()
)

In [12]:
# Show the first five cleaned company names as a sanity check.
df3_company_list[:5]

['oatly',
 'natures way foods',
 'anb investments',
 'bohmer',
 'molson coors brewing company']

In [13]:
# Configure PolyFuzz's TF-IDF matcher with settings chosen for large inputs:
# a sparse cosine-similarity method, only the single best candidate per name
# (top_n=1), and a minimum similarity of 0.95.
model = TFIDF(
    n_gram_range=(2, 2),
    clean_string=True,
    min_similarity=0.95,
    cosine_method='sparse',
    top_n=1,
)

In [62]:
# Core of the fuzzy lookup: match every cleaned SF account name against the
# cleaned external company names and keep the matcher's output.
# `result` has the columns From, To and Similarity (see the preview below).
result = model.match(df2_account_list, df3_company_list)

---

In [63]:
# Preview the raw match output (From / To / Similarity).
result.head()

Unnamed: 0,From,To,Similarity
0,012 golden lines,,0.0
1,01synergy,,0.0
2,01t,,0.0
3,02 telefonica wifi,,0.0
4,07 media,,0.0


In [64]:
# Keep only matches with similarity strictly above 0.95 and rename the
# matcher's From/To columns to the cleaned-name columns used in df2 and df3.

is_strong_match = result['Similarity'] > .95
old_account = (
    result[is_strong_match]
    .reset_index(drop=True)
    .rename(columns={'From': 'Account Name Cleaned', 'To': 'Company Name Cleaned'})
)

In [65]:
# Flag a match as accepted (True) when its similarity is at least 0.98.
# The flag is threshold-based; it is not an exact comparison of the two
# cleaned names.

old_account['T/F'] = old_account['Similarity'] >= 0.98


---

In [66]:
# string_based: True when the two cleaned names are exactly equal.
# len: number of characters in the cleaned company name.
names_are_identical = old_account['Account Name Cleaned'] == old_account['Company Name Cleaned']
old_account['string_based'] = names_are_identical
old_account['len'] = old_account['Company Name Cleaned'].map(len)

---

In [67]:
# Order the matches by the T/F flag (False rows first) and renumber the index.
old_account = (
    old_account
    .sort_values(by='T/F', ascending=True)
    .reset_index(drop=True)
)

In [68]:
# Write the candidate matches to disk so they can be reviewed outside the
# notebook; the file is read back in a later cell.
# Deliberately no exit() here: it shuts down the kernel, which discards
# df2_account and df3 even though later cells still use them, and it makes
# "Restart & Run All" impossible.
old_account.to_csv('output/old_account_check.csv', index=False)

In [91]:
# Reload the match file written by the earlier to_csv cell (same path), so
# any edits made to that file on disk are picked up from here on.
old_account = pd.read_csv('output/old_account_check.csv')

---

In [92]:
# Show matches where the similarity flag and the exact string comparison
# disagree, limited to company names of at most 20 characters, shortest first.
flags_disagree = old_account['T/F'] != old_account['string_based']
is_short_name = old_account['len'] <= 20
old_account.loc[flags_disagree & is_short_name].sort_values(by='len')

Unnamed: 0,Account Name Cleaned,Company Name Cleaned,Similarity,T/F,string_based,len


In [93]:
# Split the matches by the T/F flag: accepted matches (True) and rejected
# ones (False), each with a fresh 0..n-1 index.
# `drop` takes a boolean; the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have dropped the index too).
old_account_T = old_account[old_account['T/F'] == True].reset_index(drop=True)
old_account_F = old_account[old_account['T/F'] == False].reset_index(drop=True)

In [155]:
# Attach the SF account details to every accepted match by joining on the
# cleaned account name; only names present on both sides are kept.
inner_account_T = df2_account.merge(
    old_account_T,
    on='Account Name Cleaned',
    how='inner',
)

In [156]:
# Display the accepted matches joined with their SF account details.
inner_account_T

Unnamed: 0,Account Name,Website,Contact Count,Account ID,Account Name Cleaned,Company Name Cleaned,Similarity,T/F,string_based,len
0,Baker McKenzie,bakermckenzie.com,1.0,0014O00002LDmYb,baker mckenzie,baker mckenzie,1.0,True,True,15
1,BNP Paribas Cardif,www.bnpparibascardif.com,54.0,0014O000025jo6d,bnp paribas cardif,bnp paribas cardif,1.0,True,True,19


In [149]:
# Keep only the account columns needed for the join with the external data.
account_columns = ['Account ID', 'Account Name', 'Company Name Cleaned', 'Website']
inner_account_T = inner_account_T[account_columns]

In [176]:
# Bring in the external rows for each matched account by joining on the
# cleaned company name. A left join keeps every matched account row.
inner_company_T = inner_account_T.merge(
    df3,
    on='Company Name Cleaned',
    how='left',
)

In [177]:
# Preview the matched accounts together with their external data.
inner_company_T.head()

Unnamed: 0,Account Name,Website_x,Contact Count,Account ID,Account Name Cleaned,Company Name Cleaned,Similarity,T/F,string_based,len,...,Website_y,Direct Line,Anual Revenue,Ownership Type,Company City,Company State,Company Zip Code,Company Country,company_products_services,company_description
0,Baker McKenzie,bakermckenzie.com,1.0,0014O00002LDmYb,baker mckenzie,baker mckenzie,1.0,True,True,15,...,,,,,,,,,,
1,BNP Paribas Cardif,www.bnpparibascardif.com,54.0,0014O000025jo6d,bnp paribas cardif,bnp paribas cardif,1.0,True,True,19,...,,,,,,,,,,


In [178]:
# Export the external rows that matched an account we already have in our
# system ("old" accounts).
# index=False keeps the meaningless row index out of the file, consistent
# with how output/old_account_check.csv is written.

inner_company_T.to_csv('output/old_accounts.csv', index=False)

---

# Concatenating the Account IDs

In [179]:
# Print the matched Account IDs as comma-separated strings, 60 IDs per line,
# each followed by the character length of that string.
account_ids = inner_company_T['Account ID'].tolist()
for i in range(0, len(account_ids), 60):
    concat = ','.join(account_ids[i:i+60])
    print(concat, f" size :{len(concat)}")

0014O00002LDmYb,0014O000025jo6d  size :31


---

In [180]:
# Company names (as written in the external data) that matched an SF account.
inner_company_T_list = inner_company_T['Company Name'].to_list()

In [186]:
# New accounts are the external rows whose company did NOT match an existing
# SF account. The mask is negated because isin() alone selects the companies
# that did match, which would just repeat what is exported to
# output/old_accounts.csv.
is_existing_company = df3['Company Name'].isin(inner_company_T_list)
new_accounts = df3[~is_existing_company].reset_index(drop=True)

In [187]:
# Export the new-account rows. index=False keeps the row index (already
# reset to 0..n-1) out of the file, consistent with how
# output/old_account_check.csv is written.
new_accounts.to_csv('output/new_accounts.csv', index=False)

## Using the Website parameter



### Cleansing the old-new DataFrame