In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [2]:
# Score a list of documents against one query using a sentence-embedding model
def sim_vec(model_path, query, docs):
    """Return the similarity of every entry in ``docs`` to ``query``.

    Parameters
    ----------
    model_path : name or path handed to ``SentenceTransformer``; the model is
        loaded anew on every call.
    query : the single text that all documents are compared against.
    docs : collection of texts to score.

    Returns
    -------
    The first (and only) row of the matrix produced by ``model.similarity``,
    i.e. one score per entry of ``docs``. Embeddings are requested with
    ``normalize_embeddings=True``; the similarity function itself is whatever
    the loaded model is configured with (presumably cosine - confirm for
    custom models).
    """
    encoder = SentenceTransformer(model_path)

    def embed(texts):
        # Both sides are encoded with identical settings so scores are comparable.
        return encoder.encode(texts, normalize_embeddings=True)

    # The query is wrapped in a list so the result has exactly one row.
    score_matrix = encoder.similarity(embed([query]), embed(docs))
    return score_matrix[0]



## Data Preparation

In [3]:
# Load the raw candidate table
df = pd.read_csv("potential-talents - Aspiring human resources - seeking human resources.csv")
# Clean and convert 'connection': strip the literal "+" (presumably from capped
# values such as "500+" - confirm against the CSV) and cast the rest to int
df["connection"] = df["connection"].astype(str).str.replace("+", "", regex=False).astype(int)
# Min-max scale the connection count so it can be blended with similarity scores
# (the stray `df.connection.min()` expression was removed: it was not the last
# expression of the cell, so its value was computed and silently discarded)
scaler = MinMaxScaler()
df["connection"] = scaler.fit_transform(df[["connection"]])
df

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",0.168337,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,1.000000,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",0.086172,
3,4,People Development Coordinator at Ryan,"Denton, Texas",1.000000,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",1.000000,
...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",0.204409,
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",1.000000,
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,0.096192,
102,103,Always set them up for Success,Greater Los Angeles Area,1.000000,


In [4]:
# De-duplicate on (job_title, location, connection), keeping the first occurrence,
# then discard the columns that are not used for scoring and renumber the rows.
df_nodup = (
    df.drop_duplicates(subset=["job_title", "location", "connection"], keep="first")
    .drop(columns=["location", "fit"])
    .reset_index(drop=True)
)

# Search phrase and the list of job titles it will be compared against
query = "Aspiring human resources"
doc = df_nodup["job_title"].tolist()

df_nodup.shape  # de-duplication removes roughly half of the rows

(53, 3)

In [5]:
# Similarity scores: one column per embedding model, each comparing `query`
# with every job title in `doc`
for sim_column, model_name in (
    ("mnet_sim", "sentence-transformers/all-mpnet-base-v2"),
    ("mLM_sim", "all-MiniLM-L6-v2"),
):
    df_nodup[sim_column] = sim_vec(model_name, query, doc)

In [6]:
# Third similarity column, from the Qwen3 embedding model
qwen_similarity = sim_vec("Qwen/Qwen3-Embedding-0.6B", query, doc)
df_nodup["quen_sim"] = qwen_similarity

# Work on an independent copy from here on so df_nodup keeps only the raw similarities
df_clean = df_nodup.copy()


In [7]:
# Blend each model's similarity with the scaled connection count:
# 90% weight on text similarity, 10% on connections.
SIM_WEIGHT = 0.90
CONNECTION_WEIGHT = 0.1

for model_tag in ("mnet", "mLM", "quen"):
    df_clean[f"{model_tag}_score"] = (
        (df_clean[f"{model_tag}_sim"] * SIM_WEIGHT)
        + (df_clean["connection"] * CONNECTION_WEIGHT)
    )


In [8]:
# Binary relevance label: 1 when at least one model places the candidate in the
# top quartile (>= 75th percentile) of its blended score, otherwise 0.
# NOTE(review): the label is derived purely from the *_score columns, so any
# model later trained on those same columns is learning a function of its inputs.
score_columns = ["mnet_score", "mLM_score", "quen_score"]

# Number of models (0-3) that put each row in their own top quartile
top_quartile_votes = sum(
    (df_clean[column] >= df_clean[column].quantile(0.75)).astype(int)
    for column in score_columns
)

df_clean["rank"] = (top_quartile_votes >= 1).astype(int)


In [9]:
# Show only the candidates flagged as relevant (rank == 1)
is_relevant = df_clean["rank"] == 1
df_clean.loc[is_relevant]

Unnamed: 0,id,job_title,connection,mnet_sim,mLM_sim,quen_sim,mnet_score,mLM_score,quen_score,rank
0,1,2019 C.T. Bauer College of Business Graduate (...,0.168337,0.721934,0.573268,0.635231,0.666574,0.532775,0.588542,1
2,3,Aspiring Human Resources Professional,0.086172,0.879234,0.949807,0.959759,0.799928,0.863444,0.872401,1
5,6,Aspiring Human Resources Specialist,0.0,0.864964,0.928035,0.945432,0.778467,0.835231,0.850889,1
6,7,Student at Humber College and Aspiring Human R...,0.12024,0.728648,0.757811,0.781263,0.667808,0.694054,0.715161,1
8,10,Seeking Human Resources HRIS and Generalist Po...,1.0,0.591348,0.744711,0.830001,0.632214,0.77024,0.847001,1
12,27,Aspiring Human Resources Management student se...,1.0,0.63158,0.720099,0.844989,0.668422,0.748089,0.86049,1
13,28,Seeking Human Resources Opportunities,0.779559,0.696302,0.799642,0.854704,0.704628,0.797633,0.847189,1
14,66,Experienced Retail Manager and aspiring Human ...,0.112224,0.652588,0.696065,0.842892,0.598551,0.637681,0.769826,1
15,67,"Human Resources, Staffing and Recruiting Profe...",1.0,0.697636,0.745484,0.801545,0.727872,0.770935,0.821391,1
16,68,Human Resources Specialist at Luxottica,1.0,0.565116,0.622302,0.712178,0.608604,0.660072,0.74096,1


In [10]:
# Features: the three blended scores; target: the binary relevance label.
# NOTE(review): `rank` was thresholded from these very columns - confirm this
# setup is intentional before trusting downstream metrics.
feature_columns = ["mnet_score", "mLM_score", "quen_score"]
X = df_clean[feature_columns]
y = df_clean["rank"]

## Modeling

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
X_train, X_test, y_train, y_test = split_parts

In [20]:
# There is a single search query, so each split forms one ranking group whose
# size is simply the number of rows in that split
query_train = [len(X_train)]
query_test = [len(X_test)]

In [21]:
# Wrap both splits as LightGBM datasets, attaching labels and group sizes
train_data = lgb.Dataset(
    X_train,
    label=y_train,
    group=query_train,
)
test_data = lgb.Dataset(
    X_test,
    label=y_test,
    group=query_test,
)

In [22]:
# LightGBM training configuration for a LambdaRank model evaluated with NDCG
params = dict(
    objective='lambdarank',   # learning-to-rank objective
    metric='ndcg',            # evaluation metric reported on the validation set
    learning_rate=0.1,
    num_leaves=31,
    min_data_in_leaf=1,       # permits single-sample leaves; the dataset is tiny
    verbose=-1,               # keep LightGBM's own logging quiet
)

In [23]:
# Train for up to 100 boosting rounds, stopping once the validation metric has
# not improved for 10 consecutive rounds.
# NOTE(review): the validation set here is the same held-out split that is
# scored afterwards, so the reported NDCG is not from untouched data.
early_stop = lgb.early_stopping(stopping_rounds=10)

gbm = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=100,
    valid_sets=[test_data],
    callbacks=[early_stop],
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1]	valid_0's ndcg@1: 1	valid_0's ndcg@2: 1	valid_0's ndcg@3: 1	valid_0's ndcg@4: 1	valid_0's ndcg@5: 1


In [24]:
# Score the held-out rows and compute NDCG, treating the whole test split as a
# single query (hence the one-element outer lists)
y_pred = gbm.predict(X_test)
score = ndcg_score(y_true=[y_test], y_score=[y_pred])
print("NDCG Score: {}".format(score))

NDCG Score: 1.0
