In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
potential_talents = pd.read_csv("potential-talents - Aspiring human resources - seeking human resources.csv")
df = potential_talents.copy()

In [3]:
# Tokenize the keywords
keywords = ["Aspiring human resources", "seeking human resources"]
tokenized_keywords = [word_tokenize(keyword.lower()) for keyword in keywords]

In [4]:
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model
embedding_model = load_glove_model('glove.6B.300d.txt')

Loading Glove Model
400000 words loaded!


In [5]:
# Initialize TF-IDF vectorizer
#vectorizer = TfidfVectorizer()
    
# Process job_title column using NLTK and TF-IDF
tokenized_titles = [word_tokenize(title.lower()) for title in df['job_title']]
#tfidf_matrix = vectorizer.fit_transform([' '.join(title) for title in tokenized_titles])

# Calculate the mean embedding for each keyword
keyword_embeddings = []
for keyword_tokens in tokenized_keywords:
    keyword_embedding = np.mean([embedding_model.get(word, np.zeros_like(embedding_model["a"])) for word in keyword_tokens], axis=0)
    keyword_embeddings.append(tuple(keyword_embedding))


In [6]:
# Calculate the embeddings for each candidate's job title
candidate_embeddings = []
for candidate_title in tokenized_titles:
    embedding_sum = np.zeros_like(embedding_model["a"])
    word_count = 0
    for word in candidate_title:
        if word in embedding_model:
            embedding_sum += embedding_model[word]
            word_count += 1
    if word_count > 0:
        candidate_embedding = embedding_sum / word_count
    else:
        candidate_embedding = np.zeros_like(embedding_model["a"])
    candidate_embeddings.append(candidate_embedding)
    
    
# Calculate similarity for each candidate
similarity_scores = []
for keyword_embedding in keyword_embeddings:
    candidate_similarities = []
    for candidate_embedding in candidate_embeddings:
        similarity = cosine_similarity([candidate_embedding], [keyword_embedding])[0, 0]
        candidate_similarities.append(similarity)
    similarity_scores.append(candidate_similarities)
    
    

# Combine the similarity scores for all keywords
combined_scores = np.mean(similarity_scores, axis=0)

# Check if the length of combined_scores matches the DataFrame length
if len(combined_scores) != len(df):
    raise ValueError("Length of 'combined_scores' does not match the length of the DataFrame.")

# Update the 'fit' column in the DataFrame
df['fit'] = combined_scores


# Combine the similarity scores for all keywords
combined_scores = np.mean(similarity_scores, axis=0)

# Update the 'fit' column in the DataFrame
df['fit'] = combined_scores

# Sort the DataFrame based on the initial fit scores
df.sort_values(by='fit', ascending=False, inplace=True)

# View the resulting DataFrame
df.head()

Unnamed: 0,id,job_title,location,connection,fit
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.899883
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.899883
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.888238
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.888238
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.888238


In [11]:
# Determine the index of the starred candidate
starred_ID = 10  # ID of the starred candidate
starred_index = df[df['id'] == starred_ID].index[0] #index of starred candidate

# Check if the starred_index is within the range of the DataFrame
if starred_index >= len(df):
    raise IndexError("The starred_index is out of range.")

# Get the fit score of the starred candidate
starred_fit = df.loc[starred_index, 'fit']

# Calculate the similarity between the starred candidate and other candidates
starred_candidate_embedding = candidate_embeddings[starred_index]

# Calculate the re-ranked fit scores based on the similarity to the starred candidate and the previous fit scores
#similarity_to_star_scores = []

re_ranked_fit_scores = []
for i, candidate_embedding in enumerate(candidate_embeddings):
    similarity = cosine_similarity([starred_candidate_embedding], [candidate_embedding])[0, 0]
    re_ranked_fit_score = similarity  
    re_ranked_fit_scores.append(re_ranked_fit_score)

# Update the 're_ranked' column with the re-ranked fit scores

df['re_ranked'] = re_ranked_fit_scores

# Sort the DataFrame based on the re-ranked fit scores in descending order
df = df.sort_values('re_ranked', ascending=False)

df.head()



Unnamed: 0,id,job_title,location,connection,fit,re_ranked
102,103,Always set them up for Success,Greater Los Angeles Area,500+,0.512251,0.826606
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.826606,0.826606
77,78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,0.700956,0.826606
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.888238,0.826606
6,7,Student at Humber College and Aspiring Human R...,Kanada,61,0.68262,0.802231
