#### Imports

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import json

from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

from generate_dataset import generate_dataset

#### Check GPU Availability

In [2]:
# Report whether PyTorch can see a CUDA device in this environment.
gpu_available = torch.cuda.is_available()
print("Is using gpu:", gpu_available)

Is using gpu: True


### Load the ICP dataset, or generate it if it does not exist

In [3]:
# Load the cached company dataset; fall back to generating it only when the
# CSV file is actually absent. Catching FileNotFoundError (rather than using a
# bare `except:`) lets genuine problems -- a malformed CSV, a permissions
# error, a KeyboardInterrupt -- surface instead of being mistaken for a
# missing file and silently replaced by freshly generated data.
try:
    df = pd.read_csv("companies.csv")
    print(f"Loaded {len(df)} companies from CSV")
except FileNotFoundError:
    print("Dataset does not exists, generating one.")
    df = generate_dataset()

df.head(5)

Dataset does not exists, generating one.


Unnamed: 0,company_name,company_description,industry,size,region
0,Boogle_000,Cloud-native analytics for hospitals. Focus on...,healthcare,enterprise,LATAM
1,Grab_001,Real-time transaction monitoring using ML. Fra...,fintech,enterprise,EMEA
2,EnergyGrid_002,"Integrates with Shopify, Magento, and AWS. Rea...",retail,SMB,CIS
3,IoTAnalytics_003,Edge-device data aggregation. Industrial IoT a...,iot,SMB,CIS
4,BRACK_004,Automated feedback generation. Used by 50 univ...,edtech,SMB,Global


### Defining the ideal customer profile (ICP) text

In [4]:
# Free-text description of the ideal customer; every company description is
# scored against this text. Adjacent literals are concatenated by Python into
# a single string.
icp_text = (
    "AI-powered HR platform for talent acquisition, "
    "employee performance tracking, and predictive workforce analytics, "
    "focusing on data privacy and scalable HR workflows."
)
print(icp_text)

AI-powered HR platform for talent acquisition, employee performance tracking, and predictive workforce analytics, focusing on data privacy and scalable HR workflows.


#### Load Model

In [5]:
# Name of the pretrained sentence-transformers checkpoint used for every
# embedding in this notebook (company descriptions, ICP text, sentences).
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

### Convert company description into embeddings

In [6]:
# Missing descriptions become empty strings so the encoder always receives str.
company_texts = (
    df["company_description"]
    .fillna("")
    .astype(str)
    .tolist()
)

# One embedding per company, in the same row order as `df`.
# normalize_embeddings=True is requested so that a plain dot product with
# another normalized embedding can be used as the cosine similarity later.
company_embs = model.encode(
    company_texts,
    normalize_embeddings=True,
    show_progress_bar=True,
)

Batches: 100%|███████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 14.65it/s]


### Convert the ICP description into an embedding

In [7]:
# Encode the ICP text as a batch of one, then take its single row so that
# `icp_emb` is one embedding rather than a batch.
icp_batch = model.encode(
    [icp_text],
    normalize_embeddings=True,
    show_progress_bar=False,
)
icp_emb = icp_batch[0]

### Compute similarity scores and rank from best to worst

In [8]:
# Cosine similarity: the company and ICP embeddings were both produced with
# normalize_embeddings=True, so their dot product is used as the cosine.
sims = (company_embs @ icp_emb.reshape(-1, 1)).flatten()
sims = sims.clip(-1, 1)  # keep scores inside [-1, 1] despite float round-off

# Rank from best to worst. The same permutation is applied to the embeddings
# so that row i of `company_embs` keeps describing row i of `df`. Without
# this, re-running the cell would pair scores computed in the original order
# with an already re-sorted `df`, silently attaching scores to the wrong
# companies. The stable sort keeps tied companies in their current order.
rank_order = np.argsort(-sims, kind="stable")
company_embs = company_embs[rank_order]
sims = sims[rank_order]

# Build a new frame instead of mutating `df` in place; an existing
# `similarity_score` column (from a previous run) is simply overwritten.
df = (
    df.iloc[rank_order]
    .assign(similarity_score=sims)
    .reset_index(drop=True)
)

df.head(10)

Unnamed: 0,company_name,company_description,industry,size,region,similarity_score
0,Petronas_047,AI-powered talent acquisition and resume scree...,hrm,SMB,Global,0.73765
1,Retailio_167,Workforce planning and automated HR analytics ...,hrm,startup,EMEA,0.716345
2,HealthAI_179,AI-powered talent acquisition and resume scree...,hrm,SMB,Global,0.68105
3,Uthao_083,AI-powered talent acquisition and resume scree...,hrm,startup,APAC,0.657765
4,ShopSense_035,AI-powered talent acquisition and resume scree...,hrm,SMB,LATAM,0.638868
5,TelekomMalaysia_011,Resume parsing and automated interview schedul...,hrm,startup,LATAM,0.630343
6,ShopSense_095,Resume parsing and automated interview schedul...,hrm,SMB,Global,0.598385
7,IoTAnalytics_191,Workforce planning and automated HR analytics ...,hrm,startup,APAC,0.560232
8,Retailio_059,Resume parsing and automated interview schedul...,hrm,enterprise,APAC,0.545379
9,Medlytics_155,Used by 100+ companies for recruitment and ret...,hrm,SMB,EMEA,0.538407


### Explanation

#### Function to extract the top-matching sentences as an explanation

In [13]:
def top_matching_sentences(text, icp_emb, model, top_k=2):
    """Return the sentences of ``text`` that score highest against ``icp_emb``.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace, each
    sentence is encoded with ``model.encode(..., normalize_embeddings=True)``
    and scored by its dot product with ``icp_emb`` (which equals the cosine
    similarity when ``icp_emb`` is itself unit-normalized).

    Parameters
    ----------
    text : str
        Text to split into sentences. Any non-string value (for example the
        ``None``/``NaN`` of a missing DataFrame cell) is treated as empty.
    icp_emb : array-like with ``reshape``
        Embedding to compare against; reshaped to a column vector.
    model : object
        Encoder exposing ``encode(sentences, show_progress_bar=...,
        normalize_embeddings=...)`` and returning a 2-D array.
    top_k : int or None, default 2
        Maximum number of sentences to return. ``None`` returns all
        sentences; zero or a negative value returns an empty list.

    Returns
    -------
    list of str
        Up to ``top_k`` sentences, ordered from highest to lowest score.
    """
    # Missing descriptions arrive as None / float NaN when this function is
    # applied to a DataFrame column; re.split would raise TypeError on them.
    if not isinstance(text, str):
        return []
    # A negative top_k would otherwise slice from the wrong end of the ranking.
    if top_k is not None and top_k <= 0:
        return []

    # Split text into sentences, dropping empty fragments.
    sents = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    if not sents:
        return []

    sent_embs = model.encode(sents, show_progress_bar=False, normalize_embeddings=True)
    sims = (sent_embs @ icp_emb.reshape(-1, 1)).flatten()

    # argsort is ascending, so reverse it to get the best-scoring sentences first.
    topidx = sims.argsort()[::-1][:top_k]
    return [sents[i] for i in topidx]

#### Ranked list with explanation

In [10]:
# Attach a short, human-readable justification to each company: its two
# best-matching description sentences, joined with a single space.
df['explanation'] = [
    " ".join(top_matching_sentences(description, icp_emb, model, top_k=2))
    for description in df['company_description']
]

df.head(10)

Unnamed: 0,company_name,company_description,industry,size,region,similarity_score,explanation
0,Petronas_047,AI-powered talent acquisition and resume scree...,hrm,SMB,Global,0.73765,Workforce planning and automated HR analytics ...
1,Retailio_167,Workforce planning and automated HR analytics ...,hrm,startup,EMEA,0.716345,Workforce planning and automated HR analytics ...
2,HealthAI_179,AI-powered talent acquisition and resume scree...,hrm,SMB,Global,0.68105,Workforce planning and automated HR analytics ...
3,Uthao_083,AI-powered talent acquisition and resume scree...,hrm,startup,APAC,0.657765,AI-powered talent acquisition and resume scree...
4,ShopSense_035,AI-powered talent acquisition and resume scree...,hrm,SMB,LATAM,0.638868,AI-powered talent acquisition and resume scree...
5,TelekomMalaysia_011,Resume parsing and automated interview schedul...,hrm,startup,LATAM,0.630343,Employee engagement dashboards and AI-driven f...
6,ShopSense_095,Resume parsing and automated interview schedul...,hrm,SMB,Global,0.598385,Predictive attrition analytics for enterprises...
7,IoTAnalytics_191,Workforce planning and automated HR analytics ...,hrm,startup,APAC,0.560232,Workforce planning and automated HR analytics ...
8,Retailio_059,Resume parsing and automated interview schedul...,hrm,enterprise,APAC,0.545379,Workforce planning and automated HR analytics ...
9,Medlytics_155,Used by 100+ companies for recruitment and ret...,hrm,SMB,EMEA,0.538407,AI-powered talent acquisition and resume scree...


### Generating a JSON output 

In [11]:
# Persist the ranked results as JSON: one record per company holding its name,
# similarity score and explanation. `json` is already imported in the
# notebook's import cell, so it is not re-imported here.
results = df[['company_name', 'similarity_score', 'explanation']].to_dict(orient='records')

with open('results.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
    json.dump(results, f, indent=2, ensure_ascii=False)

print("Saved results.json with top company matches")


Saved results.json with top company matches
