## Entity Resolution - Deeptech Engineer Challenge


### 1. Imports

In [2]:
import pandas as pd
import re
from rapidfuzz import fuzz

### 2. Visualize dataset and preprocessing data

In [11]:
# import the dataset and show the first 5 entries
df = pd.read_parquet('./veridion_entity_resolution_challenge.snappy.parquet')
df.head()

Unnamed: 0,company_name,company_legal_names,company_commercial_names,main_country_code,main_country,main_region,main_city_district,main_city,main_postcode,main_street,...,generated_description,generated_business_tags,status,domains,all_domains,revenue,revenue_type,employee_count,employee_count_type,inbound_links_count
0,Owens Liquors,,Owens Liquors,US,United States,South Carolina,,Pawleys Island,29585,Ocean Highway,...,Owens Liquors is a retail establishment locate...,Retail Trade | Liquor Stores | Wine & Liquor,Active,,,,,,,
1,Club Tarneit,,Club Tarneit,AU,Australia,Victoria,Tarneit,City Of Wyndham,3029,,...,,,Active,,,,,9.0,extracted,
2,AAA Auto Otrokovice Zlín,,AAA Auto Otrokovice Zlín,CZ,Czechia,Zlín,Kvítkovice U Otrokovic,Otrokovice,765 02,Zlínská,...,AAA Auto Otrokovice (Zlín) is a car dealership...,In-store Shopping | Investment Management Serv...,Active,,,,,,,
3,Gisinger GmbH,Gisinger GmbH,,DE,Germany,Baden-Württemberg,,Ühlingen-Birkendorf,79777,Berauer Straße,...,,,Active,,,,,,,
4,Kasana Life,,Kasana Life,US,United States,Connecticut,,Litchfield,06759,,...,,,Active,,,,,,,


In [12]:
# normalize the name of the company names
def normalize_name(name):
    if pd.isnull(name):
        return ""
    return re.sub(r'[^a-z0-9]', '', name.lower())

df['normalized_name'] = df['company_name'].apply(normalize_name)
df.head()

Unnamed: 0,company_name,company_legal_names,company_commercial_names,main_country_code,main_country,main_region,main_city_district,main_city,main_postcode,main_street,...,generated_business_tags,status,domains,all_domains,revenue,revenue_type,employee_count,employee_count_type,inbound_links_count,normalized_name
0,Owens Liquors,,Owens Liquors,US,United States,South Carolina,,Pawleys Island,29585,Ocean Highway,...,Retail Trade | Liquor Stores | Wine & Liquor,Active,,,,,,,,owensliquors
1,Club Tarneit,,Club Tarneit,AU,Australia,Victoria,Tarneit,City Of Wyndham,3029,,...,,Active,,,,,9.0,extracted,,clubtarneit
2,AAA Auto Otrokovice Zlín,,AAA Auto Otrokovice Zlín,CZ,Czechia,Zlín,Kvítkovice U Otrokovic,Otrokovice,765 02,Zlínská,...,In-store Shopping | Investment Management Serv...,Active,,,,,,,,aaaautootrokovicezln
3,Gisinger GmbH,Gisinger GmbH,,DE,Germany,Baden-Württemberg,,Ühlingen-Birkendorf,79777,Berauer Straße,...,,Active,,,,,,,,gisingergmbh
4,Kasana Life,,Kasana Life,US,United States,Connecticut,,Litchfield,06759,,...,,Active,,,,,,,,kasanalife


### 3. Adding columns and groupping unique and duplicate company names

In [13]:
# group the duplicate ones
groups = {}
group_id = 0
name_to_group = {}

unique_names = df['normalized_name'].unique()
for name in unique_names:
    assigned = False
    for rep in groups.keys():
        if fuzz.ratio(name, rep) > 90:
            name_to_group[name] = groups[rep]
            assigned = True
            break
    if not assigned:
        groups[name] = group_id
        name_to_group[name] = group_id
        group_id += 1

df['company_group'] = df['normalized_name'].map(name_to_group)
df.head()

Unnamed: 0,company_name,company_legal_names,company_commercial_names,main_country_code,main_country,main_region,main_city_district,main_city,main_postcode,main_street,...,status,domains,all_domains,revenue,revenue_type,employee_count,employee_count_type,inbound_links_count,normalized_name,company_group
0,Owens Liquors,,Owens Liquors,US,United States,South Carolina,,Pawleys Island,29585,Ocean Highway,...,Active,,,,,,,,owensliquors,0
1,Club Tarneit,,Club Tarneit,AU,Australia,Victoria,Tarneit,City Of Wyndham,3029,,...,Active,,,,,9.0,extracted,,clubtarneit,1
2,AAA Auto Otrokovice Zlín,,AAA Auto Otrokovice Zlín,CZ,Czechia,Zlín,Kvítkovice U Otrokovic,Otrokovice,765 02,Zlínská,...,Active,,,,,,,,aaaautootrokovicezln,2
3,Gisinger GmbH,Gisinger GmbH,,DE,Germany,Baden-Württemberg,,Ühlingen-Birkendorf,79777,Berauer Straße,...,Active,,,,,,,,gisingergmbh,3
4,Kasana Life,,Kasana Life,US,United States,Connecticut,,Litchfield,06759,,...,Active,,,,,,,,kasanalife,4


In [None]:
# show the group ones
grouped_df = df.groupby('company_group').agg({
    'company_name': lambda x: list(x)
})
grouped_df.sample(15)

Unnamed: 0_level_0,company_name
company_group,Unnamed: 1_level_1
9691,[UnipolRental S.p.A. - Verona]
6578,[DW Don's Automotive Group]
6352,"[Juanitos, Juanitos, Juanitos, Juanitos]"
12080,"[Wei Yeong, Wei Yeong]"
9448,"[First Class Block Paving Cleaning, First Clas..."
14364,[Heritage Coffee Roasting Co - Glacier Cafe]
305,"[Hoeve de Groenzoom, Hoeve de Groenzoom, Hoeve..."
3648,"[Redmond High School, Redmond High School]"
6245,"[Koch & Partner, Koch & Partner, Koch & Partne..."
8706,[Ibiza-Moda Apdo]


### 4. Save the new dataset

In [None]:
# result_all_columns
df.to_parquet('result_all_columns.parquet') 

In [21]:
# result_just_groupped
grouped_df.to_parquet('result_just_groupped.parquet') 