# Potential Talents: A Ranking Model

**Modules/Functions needed**

In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score, confusion_matrix, classification_report


**Quick and easy function to caluclate cosine similartity**

In [3]:
# Define a function to compute cosine similarity between two texts
def sim_vec(model_path, query, docs):
    model = SentenceTransformer(model_path)
    q = model.encode([query], normalize_embeddings=True)
    d = model.encode(docs, normalize_embeddings=True)
    sims = model.similarity(q, d)
    return sims[0]  



## Space Vector Model

In [4]:
# Load data
df = pd.read_csv("potential-talents - Aspiring human resources - seeking human resources.csv")
# Clean and convert 'connection' column
df["connection"] = df["connection"].astype(str).str.replace("+", "", regex=False).astype(int)
df.connection.min()
# Normalize 'connection' column
scaler = MinMaxScaler()
df["connection"] = scaler.fit_transform(df[["connection"]])
df

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",0.168337,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,1.000000,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",0.086172,
3,4,People Development Coordinator at Ryan,"Denton, Texas",1.000000,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",1.000000,
...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",0.204409,
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",1.000000,
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,0.096192,
102,103,Always set them up for Success,Greater Los Angeles Area,1.000000,


In [5]:
# Remove duplicate entries based on job_title, location, and connection
df_nodup = df.copy().drop_duplicates(subset=["job_title", "location", "connection"],keep='first')
df_nodup.drop(columns= ["location","fit"], inplace=True)
df_nodup.reset_index(drop=True, inplace=True)
doc = df_nodup['job_title'].tolist()
query1 = "Aspiring human resources"
qurey2 = "seeking human resources"

df_nodup.shape #cuts by half

(53, 3)

In [6]:
# Similarirty scores -query1
df_nodup["mnet_sim_q1"] = sim_vec("sentence-transformers/all-mpnet-base-v2",query1,doc)
df_nodup["mLM_sim_q1"] = sim_vec("all-MiniLM-L6-v2",query1,doc)
df_nodup["quen_sim_q1"] = sim_vec("Qwen/Qwen3-Embedding-0.6B", query1,doc)

In [7]:
# Similarirty scores -query2
df_nodup["mnet_sim_q2"] = sim_vec("sentence-transformers/all-mpnet-base-v2",qurey2,doc)
df_nodup["mLM_sim_q2"] = sim_vec("all-MiniLM-L6-v2",qurey2,doc)
df_nodup["quen_sim_q2"] = sim_vec("Qwen/Qwen3-Embedding-0.6B", qurey2,doc)  

In [16]:
df_nodup["rank_score_q1"] = (
    0.8 * df_nodup[["mnet_sim_q1", "mLM_sim_q1", "quen_sim_q1"]].mean(axis=1)
    + 0.2 * df_nodup["connection"]
)
df_nodup["rank_score_q2"] = (
    0.8 * df_nodup[["mnet_sim_q2", "mLM_sim_q2", "quen_sim_q2"]].mean(axis=1)
    + 0.2 * df_nodup["connection"]
)   
df_nodup["avg_rank_score"] = (df_nodup["rank_score_q1"] + df_nodup["rank_score_q2"]) / 2    
df_nodup.sort_values(by="avg_rank_score", ascending=False, inplace=True)
df_clean = df_nodup.copy()[["job_title", "connection", "avg_rank_score"]]
df_clean.reset_index(drop=True, inplace=True)
df_clean.to_csv("ranked_candidates.csv", index=False)

In [15]:
pd.set_option("display.max_colwidth", None)  # ensures full text is shown

top10 = df_clean.head(10)
top10[["job_title","avg_rank_score"]]

Unnamed: 0,job_title,avg_rank_score
0,Seeking Human Resources Opportunities,0.825267
1,"Human Resources, Staffing and Recruiting Professional",0.809404
2,Seeking Human Resources HRIS and Generalist Positions,0.807303
3,Aspiring Human Resources Management student seeking an internship,0.757966
4,Human Resources Generalist at Loparex,0.714796
5,Human Resources Generalist at Schwan's,0.714026
6,Human Resources Specialist at Luxottica,0.707437
7,Aspiring Human Resources Professional,0.702619
8,Aspiring Human Resources Professional,0.691798
9,"Human Resources Generalist at ScottMadden, Inc.",0.687342


## Learning-To-Rank Model

In [40]:
df_clean["rank"] = (
    (df_nodup["avg_rank_score"] >= df_nodup["avg_rank_score"].quantile(0.80))
    .astype(int)
)

In [41]:
X = df_nodup[["rank_score_q1", "rank_score_q2", "connection"]]
y = df_clean["rank"]

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [43]:
query_train = [X_train.shape[0]]
query_test = [X_test.shape[0]]

In [44]:
train_data = lgb.Dataset(X_train, label=y_train, group=query_train)
test_data = lgb.Dataset(X_test, label=y_test, group=query_test)

In [45]:
params = {
   'objective': 'lambdarank',
   'metric': ['ndcg', 'map'],
   'learning_rate': 0.05,
   'num_leaves': 31,
   'min_data_in_leaf': 1,
   'verbose': -1,
   "ndcg_eval_at": [5, 10, 20]
}

In [46]:
gbm = lgb.train(
   params,
   train_data,
   valid_sets=[test_data],
   num_boost_round=100,
   callbacks=[lgb.early_stopping(stopping_rounds=10)]
)

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[1]	valid_0's ndcg@5: 0.181542	valid_0's ndcg@10: 0.322809	valid_0's ndcg@20: 0.453711	valid_0's map@5: 0.0666667	valid_0's map@10: 0.140741	valid_0's map@20: 0.23165


In [47]:
y_pred = gbm.predict(X_test)
score = ndcg_score([y_test], [y_pred])
print(f"NDCG Score: {score}")

NDCG Score: 0.46370519382593256
