# Fuzzy-Lookup

In [1]:
# We import the libraries that we are going to use.

import numpy as np
import pandas as pd
from polyfuzz.models import TFIDF

  return torch._C._cuda_getDeviceCount() > 0


## Using the Account Name parameter

### Cleansing the SF accounts

In [2]:
# Load the SF accounts file and discard every row that has no 'Account ID'.

accounts_path = 'input/sf_accounts.csv'

df2 = (
    pd.read_csv(accounts_path)
    .dropna(subset=['Account ID'])
)

In [3]:
# Preview the first rows of the SF accounts frame.
df2.head()

Unnamed: 0,Account Name,Website,Contact Count,Account ID
0,BH Live,www.bhlive.org.uk,2.0,0011300001wDBbT
1,Five Star Bank,www.five-starbank.com,3.0,0014O00002Euy8z
2,Nubank,www.nubank.com.br,1.0,0014O00002LDg27
3,KTH Parts Industries,www.kth.net,2.0,0014O0000293Dej
4,Waverley School,,2.0,0014O00002EtXjJ


In [4]:
# Regex of everything that is stripped from a lower-cased company name:
#   * any character that is not a-z, 0-9 or whitespace, and
#   * the listed legal-form / filler words (inc, plc, llc, ...) when they
#     follow a space.
# It is a raw string so that "\s" and "\b" reach the regex engine untouched
# ("\s" in a normal string literal is an invalid escape sequence).
# The trailing \b limits the word removal to whole words, so names such as
# "acme incorporated" or "bank of ukraine" are no longer mangled into
# "acmeorporated" / "bank ofraine".

a = r'[^a-z0-9\s]| (?:inc|plc|llc|ltd|llp|limited|group|corporation|uk)\b'

In [5]:
# Derive 'Account Name Cleaned' from 'Account Name': lower-case the name,
# delete everything matched by the pattern `a`, trim surrounding whitespace
# and turn names that end up as empty strings into NaN.

df2['Account Name Cleaned'] = (
    df2['Account Name']
    .str.lower()
    .str.replace(a, '', regex=True)
    .str.strip()
    .replace('', np.nan)
)

In [6]:
# Build the de-duplicated accounts frame:
#   1. drop rows whose 'Account Name Cleaned' is NaN,
#   2. sort by cleaned name (ascending) and 'Contact Count' (descending),
#   3. keep only the first row of every cleaned name, i.e. the one that sorted
#      first on 'Contact Count'.

df2_account = (
    df2
    .dropna(subset=['Account Name Cleaned'])
    .sort_values(by=['Account Name Cleaned', 'Contact Count'], ascending=[True, False])
    .drop_duplicates(subset='Account Name Cleaned', keep='first')
)

In [7]:
# Collect the cleaned account names into a plain Python list.

df2_account_list = df2_account['Account Name Cleaned'].to_list()

### Cleansing the external companies

In [8]:
# Load the external accounts file; it is decoded as latin-1.

ext_data_path = 'input/ext_data.csv'

df3 = pd.read_csv(
    ext_data_path,
    encoding='latin-1',
)


In [9]:
# Derive 'Company Name Cleaned' from 'Company Name': lower-case the name,
# delete everything matched by the pattern `a` and trim surrounding whitespace.
# Missing values are then filled with a single space, so no NaN is left in
# the column.

df3['Company Name Cleaned'] = (
    df3['Company Name']
    .str.lower()
    .str.replace(a, '', regex=True)
    .str.strip()
    .fillna(' ')
)

In [10]:
# Preview the first rows of the external companies frame, including the new
# 'Company Name Cleaned' column.
df3.head()

Unnamed: 0,Company Name,ID,description,comment,url,city,city id,country,country id,linkedin,...,tag 2,tag 3,logo,acquired?,acquisition price,investor 1 (required),investor 2,CB score,ST score,Company Name Cleaned
0,Delivery Hero,,Delivery Hero is a network of online food orde...,,deliveryhero.com,Berlin,11,Germany,1,https://www.linkedin.com/company/delivery-hero/,...,Food and Beverage,,Delivery Hero,Made Acquisitions,,Rocket Internet,General Atlantic,197,,delivery hero
1,Wirecard,,Wirecard is fastest-growing digital platforms ...,,wirecard.com/,Munich,12,Germany,1,https://www.linkedin.com/company/wirecard/,...,Professional Services,,wirecard,"Made Acquisitions, Was Acquired",,SoftBank,,342,,wirecard
2,CoachHub,,CoachHub is the digital coaching provider that...,,coachhub.io/,Berlin,11,Germany,1,https://www.linkedin.com/company/coachhub-io/,...,,,coachhub,,,Partech,Speedinvest,1572,,coachhub
3,SoundCloud,,SoundCloud is a social sound platform where an...,,soundcloud.com/,Berlin,11,Germany,1,https://www.linkedin.com/company/soundcloud/,...,Media and Entertainment,,soundcloud,Made Acquisitions,,Index Ventures,Kleiner Perkins,121,,soundcloud
4,Mambu,,Mambu is an SaaS banking engine powering innov...,,mambu.com,Berlin,11,Germany,1,http://www.linkedin.com/company/mambu,...,Lending and Investments,,mambu,,,Acton Capital,Bessemer Venture Partners,504,,mambu


In [11]:
# Build the list of candidate names to match against: the distinct values of
# 'Company Name Cleaned', in order of first appearance.
# Blank entries are skipped: missing company names are stored as ' ' and names
# made up only of removed symbols/words become '', and neither is a real
# candidate for the fuzzy match. The isinstance check also keeps any
# non-string value (e.g. NaN) out of the list.

df3_company_list = [
    name
    for name in df3['Company Name Cleaned'].drop_duplicates(keep='first')
    if isinstance(name, str) and name.strip()
]

In [13]:
# Show the first five candidate company names.
df3_company_list[:5]

['delivery hero', 'wirecard', 'coachhub', 'soundcloud', 'mambu']

In [14]:
# Configure the TF-IDF matcher from the PolyFuzz library.
# The parameter values were changed so that it can be used on large data.
model = TFIDF(
    n_gram_range=(2, 2),
    clean_string=True,
    min_similarity=0.95,
    cosine_method='sparse',
    top_n=1,
)

In [15]:
# Main part of the fuzzy search: match the cleaned SF account names (first
# argument) against the cleaned external company names (second argument) and
# keep the matcher's output in `result`.
result = model.match(df2_account_list, df3_company_list)

---

In [16]:
# Preview the first rows of the match result (columns From, To, Similarity).
result.head()

Unnamed: 0,From,To,Similarity
0,012 golden lines,,0.0
1,01synergy,,0.0
2,01t,,0.0
3,02 telefonica wifi,,0.0
4,07 media,,0.0


In [17]:
# Keep only the matches whose similarity is strictly above 0.95, renumber the
# rows and rename the 'From' / 'To' columns to the cleaned-name columns of the
# two source frames.

old_account = (
    result[result['Similarity'].gt(0.95)]
    .reset_index(drop=True)
    .rename(columns={'From': 'Account Name Cleaned', 'To': 'Company Name Cleaned'})
)

In [18]:
# 'T/F' is True for matches whose similarity is at least 0.98, False otherwise.

old_account['T/F'] = old_account['Similarity'].ge(0.98)


---

In [19]:
# 'string_based': True when both cleaned names are exactly the same string.
old_account['string_based'] = old_account['Account Name Cleaned'].eq(old_account['Company Name Cleaned'])
# 'len': number of characters in the matched company name.
old_account['len'] = old_account['Company Name Cleaned'].map(len)

---

In [20]:
# Order the matches by the 'T/F' flag (False rows first) and renumber the rows.
old_account = old_account.sort_values('T/F').reset_index(drop=True)

In [21]:
# Write the candidate matches to disk, without the index column.
old_account.to_csv('output/old_account_check.csv', index=False)

# NOTE(review): exit() ends the session right after the export, and the next
# cell reloads this same CSV. Presumably the file is reviewed/edited by hand
# in between - confirm. The cells below therefore do not run in the same
# uninterrupted pass as the cells above.
exit()

---

In [12]:
# Reload the match table from 'output/old_account_check.csv'.
old_account = pd.read_csv('output/old_account_check.csv')

In [13]:
# Show the rows with a company name of at most 20 characters where the 'T/F'
# flag differs from the exact string comparison, shortest names first.
(
    old_account[
        old_account['T/F'].ne(old_account['string_based'])
        & old_account['len'].le(20)
    ]
    .sort_values('len')
)

Unnamed: 0,Account Name Cleaned,Company Name Cleaned,Similarity,T/F,string_based,len
63,home24 se,home24,0.969,True,False,6
176,bilfinger se,bilfinger,0.957,True,False,9
218,homesmart,home smart,0.962,True,False,11
236,signal iduna ag,signal iduna,0.955,True,False,12
237,flaschenpost se,flaschenpost,0.972,True,False,12
254,bucher suter ch,bucher suter,0.971,True,False,13
270,gft technologies se,gft technologies,0.976,True,False,16
275,evonik industries ag,evonik industries,0.963,True,False,17


In [14]:
# Split the matches by the 'T/F' flag into rows flagged True and rows flagged
# False, renumbering each part from 0.
# `drop` is a boolean parameter: the string 'True' that was passed before only
# worked because any non-empty string is truthy (as 'False' would be too).
# The explicit `== True` / `== False` comparisons are kept on purpose; the
# column is read back from a CSV - presumably it may not always be parsed as a
# bool dtype (TODO confirm).
old_account_T = old_account[old_account['T/F'] == True].reset_index(drop=True)
old_account_F = old_account[old_account['T/F'] == False].reset_index(drop=True)

In [15]:
# Attach the True-flagged matches to the de-duplicated SF accounts; only
# accounts whose cleaned name appears in both frames are kept (inner join).
inner_account_T = df2_account.merge(
    old_account_T,
    how='inner',
    on='Account Name Cleaned',
)

In [16]:
# Display the merged accounts / matches frame.
inner_account_T

Unnamed: 0,Account Name,Website,Contact Count,Account ID,Account Name Cleaned,Company Name Cleaned,Similarity,T/F,string_based,len
0,2iQ Research,www.2iqresearch.com,3.0,0014O000025jov3,2iq research,2iq research,1.0,True,True,12
1,360T,www.360t.com,28.0,0014O000025jnUT,360t,360t,1.0,True,True,4
2,ABB Limited,http://www.abb.co.uk,886.0,001a000001RZDGP,abb,abb,1.0,True,True,3
3,ABOX42,www.abox42.com,1.0,0014O000025jmaO,abox42,abox42,1.0,True,True,6
4,ABP Induction,www.abpinduction.com,0.0,0014O00002EuyJD,abp induction,abp induction,1.0,True,True,13
...,...,...,...,...,...,...,...,...,...,...
267,Westwing,westwing.com,2.0,0014O00002LCzFQ,westwing,westwing,1.0,True,True,8
268,Wilo UK,www.wilo.com,3.0,0011300001wDDV4,wilo,wilo,1.0,True,True,4
269,Wirecard,www.wirecard.de,149.0,0014O000025jnvV,wirecard,wirecard,1.0,True,True,8
270,Zalando,www.zalando.de,22.0,0014O000025jpQf,zalando,zalando,1.0,True,True,7


In [17]:
# Keep only the account identifiers and the join key to the external data.
inner_account_T = inner_account_T.loc[
    :,
    ['Account ID', 'Account Name', 'Company Name Cleaned', 'Website'],
]

In [18]:
# Add the external company columns to every matched account, joining on the
# cleaned company name; all rows of inner_account_T are kept (left join).
inner_company_T = inner_account_T.merge(
    df3,
    how='left',
    on='Company Name Cleaned',
)

In [19]:
# Preview the first rows of the matched accounts joined with the external data.
inner_company_T.head()

Unnamed: 0,Account ID,Account Name,Company Name Cleaned,Website,Company Name,ID,description,comment,url,city,...,tag 1 (required),tag 2,tag 3,logo,acquired?,acquisition price,investor 1 (required),investor 2,CB score,ST score
0,0014O000025jov3,2iQ Research,2iq research,www.2iqresearch.com,2iQ Research,,2iQ is the leading Insider Transaction data pr...,,2iqresearch.com/en,Frankfurt,...,Design,Financial Services,,2iqresearch,,,,,141818,
1,0014O000025jnUT,360T,360t,www.360t.com,360T,,360T is the leading global provider of web-bas...,,360t.com,Frankfurt,...,Lending and Investments,,,360t,Was Acquired,,Summit Partners,Brockhaus Private Equity,107403,
2,001a000001RZDGP,ABB Limited,abb,http://www.abb.co.uk,ABB,,ABB provides power and automation technologies...,,abb.com,Zürich,...,Hardware,Manufacturing,,abb,Made Acquisitions,,,,8459,
3,0014O000025jmaO,ABOX42,abox42,www.abox42.com,ABOX42,,"ABOX42 is a provider of IPTV, OTT and hybrid D...",,abox42.com,Karlsruhe,...,,,,abox42,,,,,169523,
4,0014O00002EuyJD,ABP Induction,abp induction,www.abpinduction.com,ABP Induction,,ABP Induction is a leading supplier of inducti...,,abpinduction.com,Dortmund,...,,,,abpinduction,,,,,335374,


In [20]:
# Accounts that we already have in our system ("old" accounts), together with
# the external company columns. `index=False` is not passed here, so the
# DataFrame index is written to the file as well.

inner_company_T.to_csv('output/old_accounts.csv') # old accounts

---

# Concatenation function

In [44]:
# Print the matched Account IDs as comma-separated strings, at most
# `batch_size` IDs per line, each followed by the character length of the
# string.
# The ID list is built once up front instead of being rebuilt from the column
# on every iteration.
account_ids = inner_company_T['Account ID'].tolist()
batch_size = 60
for start in range(0, len(account_ids), batch_size):
    concat = ','.join(account_ids[start:start + batch_size])
    print(concat, f" size :{len(concat)}")

0014O000025jov3,0014O000025jnUT,001a000001RZDGP,0014O000025jmaO,0014O00002EuyJD,0014O000025jnWB,0014O0000293E8g,0014O000025jpOZ,0014O000025jnWa,0014O000025jq0N,0011B000029NXaM,0014O0000292zN5,0014O000025jql1,0014O0000292jmk,0014O00002LDqqB,0014O000025jmVv,0014O000025jmfz,0011300001wDBTG,0014O00002LD40z,0014O000025jpcl,0014O000025jqvh,0014O00002LCKce,0014O000025jmeh,0014O000025jnvx,0014O0000293Hwe,0014O000025jphI,0014O000025jmbd,0014O00002935zR,0014O000025jmss,0014O00002LCIIX,0014O0000293G4n,0014O000025jomj,0014O0000293B08,001a000001RZDUG,001a000001TfuYP,001a000001RZDUn,0014O000025jpW9,0011B00002D9HGs,0014O0000292gKJ,0014O000025jpEk,0014O000025jmk0,0014O00002LDtHG,0014O000025jo6O,0014O000025jomg,0014O000025jsiP,0014O000025jpF2,0014O000025jqwr,0014O000025jotB,0014O000025jpS1,0014O000025jpSM,0014O000025jpEz,0014O000025jpDy,0014O000025jmfN,0014O00002933nD,0014O0000293KYa,0014O0000293I6w,0014O000025jmfO,0014O00002LDwI7,0014O000025jnXl,0014O000025joz2  size :959
0014O000025jpRu,0014O000025jv

---

In [21]:
# Original (uncleaned) names of the external companies that were matched.
inner_company_T_list = inner_company_T['Company Name'].to_list()

In [22]:
# External companies whose 'Company Name' is not among the matched ones,
# renumbered from 0.
new_accounts = (
    df3
    .loc[~df3['Company Name'].isin(inner_company_T_list)]
    .reset_index(drop=True)
)

In [23]:
# Write the unmatched external companies to disk; the DataFrame index is
# written too because `index=False` is not passed.
new_accounts.to_csv('output/new_accounts.csv')

## Using the Website parameter



### Cleansing the old-new DataFrame