In [4]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re  # For regular expressions

# Your DataFrame
data = {
    'original_name': [
        'asml nv', 'unilever bv', 'shell bv', 'ing bank nv', 'koninklijke filips',
        'adyen nv', 'relx plc', 'prosus group', 'dsm', 'ahold-delheize', 'heineken breweries',
        "apple inc", "Microsoft", "Capgemini India pvt limited", "SAMSUNG ÃŠLECTRONICS Holding, LTD",
    ],
    'source_name': [
        'asml holding nv', 'unilever nv', 'royal dutch shell plc', 'ing group nv', 'koninklijke philips nv',
        'adyen nv', 'relx plc', 'prosus nv', 'koninklijke dsm nv', 'koninklijke ahold delhaize nv', 'heineken nv',
        "apple", "Microsoft inc", "Capgemini", "samsung electronics",
    ]
}

# Create the DataFrame
df = pd.DataFrame(data)

# List of stopwords to remove
stopwords = ['inc', 'pvt', 'limited', 'ltd', 'plc', 'group',  'llc', 'corporation','holding']

# preprocess: lowercase, remove stopwords, and remove punctuation
def preprocess_name(name):
    # Convert to lowercase
    name = name.lower()
    
   
    for stopword in stopwords:
        name = name.replace(stopword, '')  # Remove stopwords
    
    # Remove punctuation and special characters (anything that's not a letter, number, or space)
    name = re.sub(r'[^a-z0-9\s]', '', name)
    
    return name.strip()  # Remove any leading/trailing spaces after removal

# Apply preprocessing to both columns
df['processed_original_name'] = df['original_name'].apply(preprocess_name)
df['processed_source_name'] = df['source_name'].apply(preprocess_name)

# Function to perform fuzzy matching
def match_names(df, column1, column2):
    matches = []
    for orig_name, source_name in zip(df[column1], df[column2]):
        match = process.extractOne(orig_name, df[column2], scorer=fuzz.token_sort_ratio)
        matches.append((orig_name, source_name, match[1]))  # original name, source name, match score
    return matches

# Perform the matching
matches = match_names(df, 'processed_original_name', 'processed_source_name')

# Convert to DataFrame for better readability
match_results = pd.DataFrame(matches, columns=['original_name', 'source_name', 'match_score'])

# Show the results
print(match_results)


         original_name                    source_name  match_score
0              asml nv                       asml  nv          100
1          unilever bv                    unilever nv           91
2             shell bv              royal dutch shell           48
3          ing bank nv                        ing  nv           71
4   koninklijke filips         koninklijke philips nv           72
5             adyen nv                       adyen nv          100
6                 relx                           relx          100
7               prosus                      prosus nv           80
8                  dsm             koninklijke dsm nv           40
9        aholddelheize  koninklijke ahold delhaize nv           57
10  heineken breweries                    heineken nv           55
11               apple                          apple          100
12           microsoft                      microsoft          100
13     capgemini india                      capgemini         

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
import torch
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity


# Your DataFrame
data = {
    'original_name': [
        'asml nv', 'unilever bv', 'shell bv', 'ing bank nv', 'koninklijke filips',
        'adyen nv', 'relx plc', 'prosus group', 'dsm', 'ahold-delheize', 'heineken breweries',
        "apple inc", "Microsoft", "Capgemini India pvt limited", "SAMSUNG ÃŠLECTRONICS Holding, LTD",
    ],
    'source_name': [
        'asml holding nv', 'unilever nv', 'royal dutch shell plc', 'ing group nv', 'koninklijke philips nv',
        'adyen nv', 'relx plc', 'prosus nv', 'koninklijke dsm nv', 'koninklijke ahold delhaize nv', 'heineken nv',
        "apple", "Microsoft inc", "Capgemini", "samsung electronics",
    ]
}

# Create the DataFrame
df = pd.DataFrame(data)

# preprocessing : lowercase, remove stopwords, and remove punctuation
def preprocess_name(name):
    stopwords = ['inc', 'pvt', 'limited', 'ltd', 'plc', 'group',  'llc', 'corporation', 'holding']
    name = name.lower()  # Convert to lowercase
    for stopword in stopwords:
        name = name.replace(stopword, '')  # Remove stopwords
    name = re.sub(r'[^a-z0-9\s]', '', name)  # Remove punctuation and special characters
    return name.strip()  # Remove any leading/trailing spaces after removal


df['processed_original_name'] = df['original_name'].apply(preprocess_name)
df['processed_source_name'] = df['source_name'].apply(preprocess_name)

# Load BERT 
model_name = 'bert-base-uncased'  
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Function to get embeddings from BERT
def get_embeddings(model, tokenizer, text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the [CLS] token's embeddings
    embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return embeddings

# Generate embeddings for both original and source names
df['original_name_embeddings'] = df['processed_original_name'].apply(lambda x: get_embeddings(model, tokenizer, x))
df['source_name_embeddings'] = df['processed_source_name'].apply(lambda x: get_embeddings(model, tokenizer, x))

# compute cosine similarity
def compute_similarity(embedding1, embedding2):
    return cosine_similarity([embedding1], [embedding2])[0][0]

# calculate similarity scores
similarity_scores = []
for orig_emb, source_emb in zip(df['original_name_embeddings'], df['source_name_embeddings']):
    score = compute_similarity(orig_emb, source_emb)
    similarity_scores.append(score)

# Add similarity scores to the DataFrame
df['similarity_score'] = similarity_scores

# Display the results
print(df[['original_name', 'source_name', 'similarity_score']])


                        original_name                    source_name  \
0                             asml nv                asml holding nv   
1                         unilever bv                    unilever nv   
2                            shell bv          royal dutch shell plc   
3                         ing bank nv                   ing group nv   
4                  koninklijke filips         koninklijke philips nv   
5                            adyen nv                       adyen nv   
6                            relx plc                       relx plc   
7                        prosus group                      prosus nv   
8                                 dsm             koninklijke dsm nv   
9                      ahold-delheize  koninklijke ahold delhaize nv   
10                 heineken breweries                    heineken nv   
11                          apple inc                          apple   
12                          Microsoft                  Microsoft

In [2]:
df.to_excel(r'C:\Users\parnghos\OneDrive - Capgemini\Desktop\Github\entity_matching_results.xlsx', index=False)
