In [1]:
from bidict import bidict
from sentence_transformers import SentenceTransformer, util
from sqlalchemy import select
import sys

sys.path.append("..")

from src.auth.aurora import get_sales_client, get_portal_client
from src.db.database import session
from src.db.models import Client

In [2]:
stmt = select(Client.sales_id).distinct()
matched_ids = session.scalars(stmt).all()

In [3]:
len(matched_ids) / 530

0.9509433962264151

In [7]:
sales_client = get_sales_client()
placeholders = ', '.join(['%s'] * len(matched_ids))
sales_client_df = sales_client.query_to_pandas(
    f"SELECT * FROM t_client WHERE client_id IN ({placeholders})",
    params=matched_ids
)
sales_client_map = bidict(sales_client_df.set_index("client_id")["name"].to_dict())

In [9]:
portal_client = get_portal_client()
portal_client_df = portal_client.query_to_pandas("SELECT * FROM api_clients WHERE parent_id IN (1, 23)")
portal_client_df = portal_client_df[portal_client_df.groupby("name").transform("size") == 1]
portal_client_map = bidict(portal_client_df.set_index("id")["name"].to_dict())

In [10]:
clients = session.scalars(select(Client)).all()
for client in clients:
    sales_id = client.sales_id
    sales_name = sales_client_map[sales_id]
    portal_id = client.portal_id
    portal_name = portal_client_map[portal_id]
    print(f"{sales_name} <-----> {portal_name}")

Reading School <-----> Reading School
Elite Electrical Systems Ltd <-----> Elite Electrical Systems Ltd
Astron Fire & Security Ltd <-----> Astron Fire & Security Ltd
BOSS Security Solutions <-----> BOSS Security Solutions
A & E Fire Equipment Ltd <-----> A & E Fire Equipment Ltd
Alarmtec Ltd <-----> Alarmtec Ltd
Marlowe Fire & Security <-----> Marlowe Fire & Security
MAT Fire Systems Ltd <-----> MAT Fire Systems Ltd (MFS) 
MECE Fire and Security <-----> MECE Fire and Security
Micro Alarms Fire & Security Limited <-----> Micro Alarms
Mirza Sharif Ahmad Foundation <-----> Mirza Sharif Ahmad Foundation
Morris Line Engineering Limited <-----> Morris Line Engineering Limited
National Gallery <-----> National Gallery
Newmech Limited <-----> Newmech Limited 
SRS Alert Fire Systems Ltd <-----> SRS Alert Fire Systems Ltd
NW Solutions Group Ltd <-----> NW Solutions Group Ltd
Oakray Ltd <-----> Oakray Ltd
Onset Fire Limited <-----> Onset Fire Limited
Pacific Fire & Security Limited <-----> Pacifi

KeyError: 7327

In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding_dict(client_map):
    client_names = list(client_map.values())
    embeddings = model.encode(client_names, convert_to_tensor=True)
    sales_embeddings = {
        client_id: embedding
        for client_id, embedding
        in zip(client_map.keys(), embeddings)
    }
    return sales_embeddings

In [8]:
sales_embeddings = get_embedding_dict(sales_client_map)
portal_embeddings = get_embedding_dict(portal_client_map)

In [9]:
for sales_id, sales_embedding in sales_embeddings.items():
    sales_name = sales_client_map[sales_id]
    similarity_dict = {}
    print(sales_name)
    for portal_id, portal_embedding in portal_embeddings.items():
        similarity = util.pytorch_cos_sim(sales_embedding, portal_embedding)
        similarity_dict[portal_id] = similarity.item()
    names_list = []
    for id, score in sorted(
        similarity_dict.items(), key=lambda item: item[1], reverse=True
    )[:5]:
        print(f"------> {portal_client_map[id]} = {score}")
        names_list.append(portal_client_map[id])
    sys.stdout.flush()
    user_input = input("What is the match: ")
    while (user_input not in portal_client_map.values()) and (user_input != "skip"):
        user_input = input("Invalid input. Please enter a valid name: ")
    if user_input == "skip":
        print("Skipping this match.")
    else:
        portal_id = portal_client_map.inverse[user_input]
        print("Linking " + user_input + " to " + sales_name)
        new_client = Client(
            sales_id=sales_id,
            portal_id=portal_id,
        )
        session.add(new_client)
        session.commit()
    sys.stdout.flush()

Royal Household
------> Historic England = 0.43260112404823303
------> My Organisation = 0.3554822504520416
------> CBES Ltd = 0.35392817854881287
------> Princess Alexandra Hospital = 0.35226789116859436
------> Royal Surrey County Hospital NHS = 0.3450230658054352


Skipping this match.
YMCA Bath Group
------> YMCA Derbyshire = 0.6228280067443848
------> The Kirby Group = 0.47630810737609863
------> PPHE Hotel Group  = 0.46878960728645325
------> Ventro Group  = 0.4180353879928589
------> JPR Group = 0.40297242999076843
Skipping this match.
Armadillo Shield Ltd
------> Shield Fire Protection Ltd = 0.6842349767684937
------> Secureshield Ltd = 0.579161524772644
------> _Shield Co. (NT) = 0.5741448998451233
------> Oakray Ltd = 0.5305051207542419
------> Allied Fire Protection Ltd = 0.5124648809432983


KeyboardInterrupt: Interrupted by user