# Fuzzy-Lookup

In [1]:
# We import the libraries that we are goin to use. 

import numpy as np
import pandas as pd
from polyfuzz.models import TFIDF

  return torch._C._cuda_getDeviceCount() > 0


## Using the Account Name parameter

### Cleansing the SF accounts

In [3]:
# We read the file that contains the SF Accounts.

accounts_path = 'input/sf_accounts.csv'

df2 = pd.read_csv(accounts_path, encoding = 'Latin1').dropna(subset=['Account ID'])

In [4]:
df2.head()

Unnamed: 0,Account Name,Website,Contact Count,Account ID
0,BH Live,www.bhlive.org.uk,2.0,0011300001wDBbT
1,Telesure group services,,1.0,0014O00002IfXnR
2,VICTORIA INVESTMENTS LIMITED,vicinvest.com,1.0,0014O00002LCSle
3,L Squared Digital Signage Solutions,lsquared.com,1.0,0014O00002LE2KV
4,Concord Integrated Services,,1.0,0014O00002LCyjA


In [5]:
# We stablish the strange symbols and patters that we are going to remove. 

a = '[^a-z0-9\s]| inc| plc| llc| ltd| llp| limited| group| corporation| uk'

In [6]:
# We create a new Account Name column to clean it.
# We remove those symbols and patters from the Account Name Cleaned column.
# We replace the empty strings with NaN values in the Account Name Cleaned column.

df2['Account Name Cleaned'] = df2['Account Name']
df2['Account Name Cleaned'] = df2['Account Name Cleaned'].str.lower().str.replace(a, '', regex=True).str.strip()
df2['Account Name Cleaned'] = df2['Account Name Cleaned'].replace('', np.nan)

In [7]:
# We create a DataFrame without NaN values in the Account Name Cleaned column. 
# We sort by Account Name Cleaned and Contact Count.
# We drop duplicates in the Account Name Cleaned column and we keep the first one. 

df2_account = df2.dropna(subset=['Account Name Cleaned'])
df2_account = df2_account.sort_values(by=['Account Name Cleaned', 'Contact Count'], ascending=[True, False])
df2_account = df2_account.drop_duplicates(subset='Account Name Cleaned', keep='first')

In [8]:
# We create a list with the Account Name Cleaned values. 

df2_account_list = df2_account['Account Name Cleaned'].tolist()

### Cleansing the external companies

In [1]:
# We read the file that contains the external accounts. 

ext_data_path = 'input/ext_data.csv'

df3 = pd.read_csv(ext_data_path, encoding="Latin1")


NameError: name 'pd' is not defined

In [10]:
# We create a new Account Name column to clean it.
# We remove those symbols and patters from the Account Name Cleaned column.
# We replace the empty strings with NaN values in the Account Name Cleaned column.

df3['Company Name Cleaned'] = df3['Company Name']
df3['Company Name Cleaned'] = df3['Company Name Cleaned'].str.lower().str.replace(a, '', regex=True).str.strip()
df3['Company Name Cleaned'] = df3['Company Name Cleaned'].fillna(' ')

In [11]:
df3.head()

Unnamed: 0,Company Name,Segment,Unnamed: 2,Unnamed: 3,Unnamed: 4,Company Name Cleaned
0,Advanta Seeds,Seed Company,,,,advanta seeds
1,Agrico,Processor / Packer,,,,agrico
2,Bakker Brothers,Seed Company,,,,bakker brothers
3,BASF,Seed Company,,,,basf
4,Bayer,Seed Company,,,,bayer


In [12]:
# We create a list with the Account Name Cleaned values and we drop the duplicates. 

df3_company_list = df3.drop_duplicates(subset='Company Name Cleaned', keep='first')['Company Name Cleaned'].tolist()

In [13]:
df3_company_list[:5]

['advanta seeds', 'agrico', 'bakker brothers', 'basf', 'bayer']

In [162]:
# We use the model 'TF-IDF' from the PolyFuzz library.
# Changing Parameters to be used on large Data
model = TFIDF(n_gram_range=(2,2), clean_string=True, min_similarity=0.95, cosine_method='sparse', top_n=1)

In [163]:
#Saving results of model
# Main part of the Fuzzy search
result = model.match(df2_account_list,df3_company_list)

---

In [164]:
result.head()

Unnamed: 0,From,To,Similarity
0,012 golden lines,,0.0
1,01synergy,,0.0
2,01t,,0.0
3,02 telefonica wifi,,0.0
4,07 media,,0.0


In [165]:
# We create a DataFrame that contains similarity > .95

old_account = result[result['Similarity'] > .95].reset_index(drop=True)
old_account = old_account.rename({'From': 'Account Name Cleaned', 'To': 'Company Name Cleaned'}, axis=1)

In [166]:
# old_account['T/F'] = old_account['Account Name Cleaned'] == old_account['Company Name Cleaned']

old_account['T/F'] = np.where(old_account['Similarity'] >= 0.98, True, False)


---

In [167]:
old_account['string_based'] = old_account['Account Name Cleaned'] == old_account['Company Name Cleaned']
old_account['len'] = old_account['Company Name Cleaned'].apply(lambda x: len(x))

---

In [168]:
old_account = old_account.sort_values(by=['T/F'], ascending=True).reset_index(drop=True)

In [169]:
old_account.to_csv('output/old_account_check.csv', index=False)

In [None]:
exit()

---

In [179]:
old_account = pd.read_csv('output/old_account_check.csv')

In [180]:
old_account.loc[(old_account['T/F'] != old_account['string_based']) & (old_account['len'] <= 20)].sort_values(by='len')

Unnamed: 0,Account Name Cleaned,Company Name Cleaned,Similarity,T/F,string_based,len
0,the wonderful company,wonderful company,0.951,True,False,17


In [181]:
old_account_T = old_account[old_account['T/F'] == True].reset_index(drop='True')
old_account_F = old_account[old_account['T/F'] == False].reset_index(drop='True')

In [182]:
inner_account_T = pd.merge(df2_account,
                      old_account_T,
                      on = 'Account Name Cleaned',
                      how = 'inner',)
                      #indicator = True).drop(columns = '_merge')

In [183]:
inner_account_T

Unnamed: 0,Account Name,Website,Contact Count,Account ID,Account Name Cleaned,Company Name Cleaned,Similarity,T/F,string_based,len
0,Advanta Seeds,,3.0,0014O00002LCllG,advanta seeds,advanta seeds,1.0,True,True,13
1,Agrico,,,0014O00002LCllE,agrico,agrico,1.0,True,True,6
2,Bakker Brothers,,,0014O00002LClkx,bakker brothers,bakker brothers,1.0,True,True,15
3,BASF,www.basf.com/en,317.0,0014O000025jmPg,basf,basf,1.0,True,True,4
4,Bayer,www.bayer.com,1599.0,0014O000025jmuS,bayer,bayer,1.0,True,True,5
5,Bejo Zaden,www.bejo.com,11.0,0014O000025jpUz,bejo zaden,bejo zaden,1.0,True,True,10
6,Driscoll's,www.driscolls.com,9.0,0014O0000292Dxw,driscolls,driscolls,1.0,True,True,9
7,Enza Zaden,,1.0,0014O00002LClkX,enza zaden,enza zaden,1.0,True,True,10
8,Greenfield,,,0014O00002LCllK,greenfield,greenfield,1.0,True,True,10
9,Hazera,,,0014O00002LCllF,hazera,hazera,1.0,True,True,6


In [184]:
inner_account_T = inner_account_T[['Account ID', 'Account Name', 'Company Name Cleaned', 'Website']]

In [185]:
inner_company_T = pd.merge(inner_account_T,
                      df3,
                      on = 'Company Name Cleaned',
                      how = 'left',)
                      #indicator = True).drop(columns = '_merge')

In [186]:
inner_company_T.head()

Unnamed: 0,Account ID,Account Name,Company Name Cleaned,Website,Company Name,Segment,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0014O00002LCllG,Advanta Seeds,advanta seeds,,Advanta Seeds,Seed Company,,,
1,0014O00002LCllE,Agrico,agrico,,Agrico,Processor / Packer,,,
2,0014O00002LCllE,Agrico,agrico,,Agrico,,,,
3,0014O00002LClkx,Bakker Brothers,bakker brothers,,Bakker Brothers,Seed Company,,,
4,0014O000025jmPg,BASF,basf,www.basf.com/en,BASF,Seed Company,,,


In [187]:
# Accounts that we have in our system

inner_company_T.to_csv('output/old_accounts.csv') #Old accounts

---

# Concatenation function

In [188]:
# Getting the list of ID's
for i in range(0, len(inner_company_T['Account ID']), 60):
   concat = ','.join(inner_company_T['Account ID'].tolist()[i:i+60])
   print(concat, f" size :{len(concat)}")

0014O00002LCllG,0014O00002LCllE,0014O00002LCllE,0014O00002LClkx,0014O000025jmPg,0014O000025jmuS,0014O000025jpUz,0014O0000292Dxw,0014O00002LClkX,0014O00002LCllK,0014O00002LCllF,0014O00002LCllJ,0014O00002LCllI,0014O00002LClkE,0014O00002LCllC,0014O0000293DYa,0014O00002LClkk,0014O00002LCllD,0014O0000292xdY,0014O00002LDjno,0014O00002LCllH,0014O00002917x8,0014O0000293FD0,0014O00002LCll1,0014O00002LDjnG,0014O00002LClkw,0014O00002K6Jk1,001a000001Tfgot,0014O0000293DVV,0014O00002EuySW  size :479


---

In [189]:
inner_company_T_list = inner_company_T['Company Name'].tolist()

In [190]:
new_accounts = df3[~df3['Company Name'].isin(inner_company_T_list)].reset_index(drop=True)

In [191]:
new_accounts.to_csv('output/new_accounts.csv')