In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
url = 'https://docs.google.com/spreadsheets/d/117X6i53dKiO7w6kuA1g1TpdTlv1173h_dPlJt5cNNMU/export?format=csv'
df = pd.read_csv(url)

# Define the keywords related to searching
keywords = ["aspiring human resources", "seeking human resources"]

# Combine the keywords into a single string for comparison
keyword_str = " ".join(keywords)

# Step 1: Use TF-IDF to vectorize both job titles and keywords
tfidf = TfidfVectorizer()
job_titles_tfidf = tfidf.fit_transform(df['job_title'].fillna(''))
keyword_tfidf = tfidf.transform([keyword_str])

# Step 2: Calculate similarity between job titles and the keywords
similarity_scores = cosine_similarity(job_titles_tfidf, keyword_tfidf).flatten()

# Step 3: Normalize similarity scores
scores = (similarity_scores - similarity_scores.min()) / (similarity_scores.max() - similarity_scores.min())

# Step 4: Update the 'fit' column with the calculated fitness scores
df['fit'] = scores

# Step 5: Rank the candidates by fit score
df_ranked = df.sort_values(by='fit', ascending=False)

# Display the ranked dataset
print(df_ranked[['id', 'job_title', 'location', 'connection', 'fit']])

      id                                          job_title  \
72    73  Aspiring Human Resources Manager, seeking inte...   
45    46              Aspiring Human Resources Professional   
57    58              Aspiring Human Resources Professional   
16    17              Aspiring Human Resources Professional   
32    33              Aspiring Human Resources Professional   
..   ...                                                ...   
22    23    Advisory Board Member at Celal Bayar University   
21    22             People Development Coordinator at Ryan   
19    20  Native English Teacher at EPIK (English Progra...   
17    18             People Development Coordinator at Ryan   
103  104   Director Of Administration at Excellence Logging   

                                location connection       fit  
72                   Houston, Texas Area          7  1.000000  
45   Raleigh-Durham, North Carolina Area         44  0.936737  
57   Raleigh-Durham, North Carolina Area         44

In [3]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.1.0


In [16]:
from sentence_transformers import SentenceTransformer

# Pre-trained SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Convert keywords into embeddings
keyword_embeddings = model.encode(keywords)

# Convert job titles into embeddings
job_title_embeddings = model.encode(df['job_title'].fillna('').tolist())

# Calculate cosine similarity between job titles and the keywords
fit_scores = cosine_similarity(job_title_embeddings, np.mean(keyword_embeddings, axis=0).reshape(1, -1)).flatten()

# Update the 'fit' column with the calculated fitness scores
df['fit'] = fit_scores

# View the dataset with fitness scores
print(df[['id', 'job_title', 'fit']])

# Rank the candidates by fit score
df_ranked = df.sort_values(by='fit', ascending=False)

# Display the ranked dataset
print(df_ranked[['id', 'job_title', 'location', 'connection', 'fit']])




      id                                          job_title       fit
0      1  2019 C.T. Bauer College of Business Graduate (...  0.502562
1      2  Native English Teacher at EPIK (English Progra...  0.237043
2      3              Aspiring Human Resources Professional  0.870847
3      4             People Development Coordinator at Ryan  0.378838
4      5    Advisory Board Member at Celal Bayar University  0.252375
..   ...                                                ...       ...
99   100  Aspiring Human Resources Manager | Graduating ...  0.722427
100  101              Human Resources Generalist at Loparex  0.634931
101  102   Business Intelligence and Analytics at Travelers  0.256748
102  103                     Always set them up for Success  0.213875
103  104   Director Of Administration at Excellence Logging  0.275158

[104 rows x 3 columns]
      id                                          job_title  \
29    30              Seeking Human Resources Opportunities   
27    28  

In [17]:
# Identify the 7th candidate and set 'starred' to 1
df.loc[df_ranked.index[6], 'starred'] = 1
df['starred'] = df['starred'].fillna(0)

# Use the SentenceTransformer model (assuming it's still named 'model')
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Calculate the mean embedding of starred candidates
starred_embeddings = model.encode(df[df['starred'] == 1]['job_title'].fillna('').tolist())

if len(starred_embeddings) > 0:
  starred_embeddings = np.mean(starred_embeddings, axis=0)
else:
  starred_embeddings = np.zeros(model.vector_size)

# Calculate similarity with starred candidates
fit_scores = cosine_similarity(job_title_embeddings, starred_embeddings.reshape(1, -1)).flatten()

# Update the 'fit' column with the calculated fitness scores
df['fit'] = fit_scores

# Rank the candidates by fit score
df_ranked = df.sort_values(by='fit', ascending=False)

# Display the ranked dataset
print(df_ranked[['id', 'job_title', 'location', 'connection', 'fit']])



      id                                          job_title  \
20    21              Aspiring Human Resources Professional   
96    97              Aspiring Human Resources Professional   
2      3              Aspiring Human Resources Professional   
57    58              Aspiring Human Resources Professional   
16    17              Aspiring Human Resources Professional   
..   ...                                                ...   
1      2  Native English Teacher at EPIK (English Progra...   
15    16  Native English Teacher at EPIK (English Progra...   
19    20  Native English Teacher at EPIK (English Progra...   
84    85  RRP Brand Portfolio Executive at JTI (Japan To...   
102  103                     Always set them up for Success   

                                location connection       fit  
20   Raleigh-Durham, North Carolina Area         44  0.948653  
96                  Kokomo, Indiana Area         71  0.948653  
2    Raleigh-Durham, North Carolina Area         44

In [11]:
import gensim
from gensim.models import Word2Vec
from gensim.models import FastText


model = Word2Vec(sentences=df['job_title'].fillna('').tolist(), vector_size=100, window=5, min_count=1, workers=4)
# Calculate keyword embeddings, handling cases with no matching words
keyword_embeddings = []
for word in keywords:
    if word in model.wv:
        keyword_embeddings.append(model.wv[word])
# Calculate the mean of keyword embeddings only if keyword_embeddings is not empty
if keyword_embeddings:
    keyword_embeddings = np.mean(keyword_embeddings, axis=0)
else:
    # Use a zero vector if no keywords are found in the vocabulary
    keyword_embeddings = np.zeros(model.vector_size)

# Calculate job title embeddings, handling cases with no matching words
job_title_embeddings = []
for job_title in df['job_title'].fillna('').tolist():
  word_embeddings = [model.wv[word] for word in job_title.split() if word in model.wv]
  if word_embeddings:
    job_title_embeddings.append(np.mean(word_embeddings, axis=0))
  else:
    # Use a zero vector if no words are found in the vocabulary
    job_title_embeddings.append(np.zeros(model.vector_size))

# Convert job_title_embeddings to a NumPy array with consistent shape
job_title_embeddings = np.array(job_title_embeddings)

fit_scores = cosine_similarity(job_title_embeddings, keyword_embeddings.reshape(1, -1)).flatten()
df['fit'] = fit_scores
df_ranked = df.sort_values(by='fit', ascending=False)
print(df_ranked[['id', 'job_title', 'location', 'connection', 'fit']])



      id                                          job_title  \
0      1  2019 C.T. Bauer College of Business Graduate (...   
1      2  Native English Teacher at EPIK (English Progra...   
76    77  Human Resources|\nConflict Management|\nPolici...   
75    76  Aspiring Human Resources Professional | Passio...   
74    75  Nortia Staffing is seeking Human Resources, Pa...   
..   ...                                                ...   
31    32  Native English Teacher at EPIK (English Progra...   
30    31  2019 C.T. Bauer College of Business Graduate (...   
29    30              Seeking Human Resources Opportunities   
28    29  Aspiring Human Resources Management student se...   
103  104   Director Of Administration at Excellence Logging   

                   location connection  fit  
0            Houston, Texas         85  0.0  
1                    Kanada      500+   0.0  
76   Dallas/Fort Worth Area        409  0.0  
75       New York, New York        212  0.0  
74     San Jo

In [13]:
# Fasttext to calculate the similarity
model = FastText(sentences=df['job_title'].fillna('').tolist(), vector_size=100, window=5, min_count=1, workers=4)

keyword_embeddings = []
for word in keywords:
    if word in model.wv:
        keyword_embeddings.append(model.wv[word])
if keyword_embeddings:
    keyword_embeddings = np.mean(keyword_embeddings, axis=0)

job_title_embeddings = []
for job_title in df['job_title'].fillna('').tolist():
  word_embeddings = [model.wv[word] for word in job_title.split() if word in model.wv]
  if word_embeddings:
    job_title_embeddings.append(np.mean(word_embeddings, axis=0))

job_title_embeddings = np.array(job_title_embeddings)

fit_scores = cosine_similarity(job_title_embeddings, keyword_embeddings.reshape(1, -1)).flatten()
df['fit'] = fit_scores
df_ranked = df.sort_values(by='fit', ascending=False)
print(df_ranked[['id', 'job_title', 'location', 'connection', 'fit']])



    id                                          job_title  \
48  49                Aspiring Human Resources Specialist   
5    6                Aspiring Human Resources Specialist   
35  36                Aspiring Human Resources Specialist   
23  24                Aspiring Human Resources Specialist   
59  60                Aspiring Human Resources Specialist   
..  ..                                                ...   
54  55  SVP, CHRO, Marketing & Communications, CSR Off...   
63  64  SVP, CHRO, Marketing & Communications, CSR Off...   
85  86  Information Systems Specialist and Programmer ...   
76  77  Human Resources|\nConflict Management|\nPolici...   
95  96  Student at Indiana University Kokomo - Busines...   

                      location connection       fit  
48  Greater New York City Area          1  0.496755  
5   Greater New York City Area          1  0.496755  
35  Greater New York City Area          1  0.496755  
23  Greater New York City Area          1  0.496755

In [26]:
from sklearn.preprocessing import normalize

# Train the FastText model on job titles
model = FastText(sentences=df['job_title'].fillna('').tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Step 1: Calculate the embedding for job titles
job_title_embeddings = []
for job_title in df['job_title'].fillna('').tolist():
    word_embeddings = [model.wv[word] for word in job_title.split() if word in model.wv]
    if word_embeddings:
        job_title_embeddings.append(np.mean(word_embeddings, axis=0))
    else:
        job_title_embeddings.append(np.zeros(model.vector_size))

job_title_embeddings = np.array(job_title_embeddings)

# Normalize the job title embeddings for similarity comparison
job_title_embeddings = normalize(job_title_embeddings)

# Step 2: Star the 7th candidate (or any chosen candidate)
# We assume the 7th candidate is starred
starred_candidate_index = 6
starred_candidate_embedding = job_title_embeddings[starred_candidate_index].reshape(1, -1)

# Step 3: Re-rank the candidates based on similarity to the starred candidate
# Calculate cosine similarity between all job titles and the starred candidate's job title
fit_scores = cosine_similarity(job_title_embeddings, starred_candidate_embedding).flatten()

# Step 4: Add the fit scores to the DataFrame and sort by fit score
df['fit'] = fit_scores
df_ranked = df.sort_values(by='fit', ascending=False)

# Output the re-ranked results
print(df_ranked[['id', 'job_title', 'location', 'connection', 'fit']])





      id                                          job_title  \
36    37  Student at Humber College and Aspiring Human R...   
6      7  Student at Humber College and Aspiring Human R...   
38    39  Student at Humber College and Aspiring Human R...   
51    52  Student at Humber College and Aspiring Human R...   
49    50  Student at Humber College and Aspiring Human R...   
..   ...                                                ...   
41    42  SVP, CHRO, Marketing & Communications, CSR Off...   
63    64  SVP, CHRO, Marketing & Communications, CSR Off...   
74    75  Nortia Staffing is seeking Human Resources, Pa...   
76    77  Human Resources|\nConflict Management|\nPolici...   
102  103                     Always set them up for Success   

                     location connection       fit  
36                     Kanada         61  1.000000  
6                      Kanada         61  1.000000  
38                     Kanada         61  1.000000  
51                     Kanada  

We are interested in a robust algorithm, tell us how your solution works and show us how your ranking gets better with each starring action.

* After starred one of the candidate the re-rank model come to calculate the similerity more relevan to the data point, so suggest to mannual select the top candidate as the model criteria

How can we filter out candidates which in the first place should not be in this list?

* Set the condition such like location or connection over 500.
* Remove the candidates not meet the reqirement
* Train a model from historical data

Can we determine a cut-off point that would work for other roles without losing high potential candidates?

* Instead of manually setting a cut-off point, use historical data to learn what an effective cut-off point looks like for different roles. This approach can help minimize the chances of losing high-potential candidates.

Do you have any ideas that we should explore so that we can even automate this procedure to prevent human bias?

* Train the model multi time with slightly change of the criteria, or apply different technique for training