In [1]:
import pandas as pd 
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch
from sentence_transformers import SentenceTransformer, util
from gensim.models import FastText

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\M-ODE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\M-ODE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\M-ODE\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
talents_df=pd.read_csv('C:/Users/M-ODE/Desktop/Apziva/projects/3rd Project/potential_talents/data/potential_talents_CSV.csv')

In [3]:
talents_df

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,
...,...,...,...,...,...
99,100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,
100,101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,
102,103,Always set them up for Success,Greater Los Angeles Area,500+,


In [4]:
talents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          104 non-null    int64  
 1   job_title   104 non-null    object 
 2   location    104 non-null    object 
 3   connection  104 non-null    object 
 4   fit         0 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.2+ KB


In [5]:
print('Total number of rows',len(talents_df))

Total number of rows 104


In [6]:
#check missing values
talents_df.isnull().sum()

id              0
job_title       0
location        0
connection      0
fit           104
dtype: int64

In [7]:
#merging location with job title
def merge_text(row):
    return f"{row['job_title']} ({row['location']})"

talents_df['job_title'] = talents_df.apply(merge_text, axis=1)

# After applying the function, you can drop the 'location' column if it's no longer needed.
talents_df = talents_df.drop('location', axis=1)

talents_df.head()



Unnamed: 0,id,job_title,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,85,
1,2,Native English Teacher at EPIK (English Progra...,500+,
2,3,Aspiring Human Resources Professional (Raleigh...,44,
3,4,People Development Coordinator at Ryan (Denton...,500+,
4,5,Advisory Board Member at Celal Bayar Universit...,500+,


In [8]:
# Preprocessing 
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

talents_cleaned=talents_df.copy()

def preprocess_text(review):
    # Remove non-alphabet characters, convert to lowercase, and tokenize
    review = word_tokenize(re.sub('[^a-zA-Z]', ' ', review.lower()))
    # Remove stopwords
    review = [w for w in review if w not in stop_words]
    # Apply stemming 
    # review = [stemmer.stem(w) for w in review]
    # Apply lemmatization
    review = [lemma.lemmatize(w, pos='v') for w in review]
    # Join the processed words back into a string
    review = ' '.join(review)
    return review

talents_cleaned['job_title'] = talents_cleaned['job_title'].apply(preprocess_text)
talents_cleaned.head()


Unnamed: 0,id,job_title,connection,fit
0,1,c bauer college business graduate magna cum la...,85,
1,2,native english teacher epik english program ko...,500+,
2,3,aspire human resources professional raleigh du...,44,
3,4,people development coordinator ryan denton texas,500+,
4,5,advisory board member celal bayar university z...,500+,


##  FastText 

In [9]:

data = talents_df
# Preprocess the data
data['job_title'] = data['job_title'].apply(lambda x: x.lower())  # Convert to lowercase

# Train a FastText model
fasttext_model = FastText(sentences=[str(title).split() for title in data['job_title']], vector_size=100, window=5, min_count=1, workers=4, sg=1)

# Keywords
keywords = ["aspiring human resources", "seeking human resources"]

# Generate embeddings for keywords
keyword_embeddings = [fasttext_model.wv[keyword] for keyword in keywords]

# Calculate similarity scores
job_title_embeddings = [fasttext_model.wv[title] for title in data['job_title']]
similarity_scores = cosine_similarity(np.stack(job_title_embeddings), np.stack(keyword_embeddings))

# Assign fit scores
data['fit'] = np.mean(similarity_scores, axis=1)

# Sort 
ranked_data = talents_df.sort_values(by='fit', ascending=False)

ranked_data.head()


Unnamed: 0,id,job_title,connection,fit
72,73,"aspiring human resources manager, seeking inte...",7,0.988569
99,100,aspiring human resources manager | graduating ...,103,0.984133
28,29,aspiring human resources management student se...,500+,0.983524
26,27,aspiring human resources management student se...,500+,0.983524
59,60,aspiring human resources specialist (greater n...,1,0.980825


## word to vec

In [10]:
model_name = "word2vec-google-news-300"
word_vectors = api.load(model_name)

keywords = ["Aspiring human resources", "seeking human resources"]

keyword_similarities = []

# Tokenize job titles and obtain their Word2Vec embeddings
job_title_tokens = [title.split() for title in talents_cleaned['job_title']]
job_title_embeddings = []

for keyword in keywords:
    # Tokenize the current keyword and obtain its Word2Vec embeddings
    keyword_tokens = keyword.split()
    keyword_embedding = np.mean([word_vectors[word] for word in keyword_tokens if word in word_vectors], axis=0)
    
    # Calculate cosine similarities between the current keyword and job title embeddings
    similarities = []
    for tokens in job_title_tokens:
        title_vector = np.mean([word_vectors[word] for word in tokens if word in word_vectors], axis=0)
        similarity = cosine_similarity([keyword_embedding], [title_vector])[0][0]
        similarities.append(similarity)
    
    keyword_similarities.append(similarities)

# Combine or average the similarity scores based on all keywords for each job title
average_similarities = np.mean(keyword_similarities, axis=0)

# Add the average similarity scores to the DataFrame for ranking
talents_cleaned['fit'] = average_similarities

# Sort 
ranked_data = talents_cleaned.sort_values(by='fit', ascending=False)

ranked_data.head()


Unnamed: 0,id,job_title,connection,fit
72,73,aspire human resources manager seek internship...,7,0.682205
52,53,seek human resources hris generalist position ...,500+,0.657578
9,10,seek human resources hris generalist position ...,500+,0.657578
39,40,seek human resources hris generalist position ...,500+,0.657578
61,62,seek human resources hris generalist position ...,500+,0.657578


## TF-IDF  

In [11]:
keywords = ["Aspiring human resources", "seeking human resources"]

# Combine the job titles into a list for TF-IDF processing
job_titles = talents_cleaned['job_title'].tolist()

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(job_titles)

# Calculate cosine similarities for each keyword
keyword_similarities = []
for keyword in keywords:
    keyword_tfidf = tfidf_vectorizer.transform([keyword])
    similarities = cosine_similarity(keyword_tfidf, tfidf_matrix)
    keyword_similarities.append(similarities[0])

# Calculate the average similarity for each job title based on all keywords
combined_scores = np.mean(keyword_similarities, axis=0)
talents_cleaned['fit'] = combined_scores

# Sort 
ranked_data = talents_cleaned.sort_values(by='fit', ascending=False)

ranked_data.head()


Unnamed: 0,id,job_title,connection,fit
72,73,aspire human resources manager seek internship...,7,0.484356
73,74,human resources professional greater boston area,16,0.316795
38,39,student humber college aspire human resources ...,61,0.287056
24,25,student humber college aspire human resources ...,61,0.287056
51,52,student humber college aspire human resources ...,61,0.287056


## #Glovec

In [12]:
# Tokenize the keywords
keywords = ["Aspiring human resources", "seeking human resources"]
tokenized_keywords = [word_tokenize(keyword.lower()) for keyword in keywords]


#Load Glove model
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model
embedding_model = load_glove_model('glove.6B.300d.txt')

# Process job_title column using NLTK 
tokenized_titles = [word_tokenize(title.lower()) for title in talents_df['job_title']]

# Calculate the mean embedding for each keyword
keyword_embeddings = []
for keyword_tokens in tokenized_keywords:
    keyword_embedding = np.mean([embedding_model.get(word, np.zeros_like(embedding_model["a"])) for word in keyword_tokens], axis=0)
    keyword_embeddings.append(tuple(keyword_embedding))

Loading Glove Model
400001 words loaded!


In [13]:
# Calculate the embeddings for each candidate's job title
candidate_embeddings = []
for candidate_title in tokenized_titles:
    embedding_sum = np.zeros_like(embedding_model["a"])
    word_count = 0
    for word in candidate_title:
        if word in embedding_model:
            embedding_sum += embedding_model[word]
            word_count += 1
    if word_count > 0:
        candidate_embedding = embedding_sum / word_count
    else:
        candidate_embedding = np.zeros_like(embedding_model["a"])
    candidate_embeddings.append(candidate_embedding)
    
    
# Calculate similarity for each candidate
similarity_scores = []
for keyword_embedding in keyword_embeddings:
    candidate_similarities = []
    for candidate_embedding in candidate_embeddings:
        similarity = cosine_similarity([candidate_embedding], [keyword_embedding])[0, 0]
        candidate_similarities.append(similarity)
    similarity_scores.append(candidate_similarities)
    

# Combine the similarity scores for all keywords
combined_scores = np.mean(similarity_scores, axis=0)

talents_df['fit'] = combined_scores

# Sort 
ranked_data=talents_df.sort_values(by='fit', ascending=False)

ranked_data.head()

Unnamed: 0,id,job_title,connection,fit
72,73,"aspiring human resources manager, seeking inte...",7,0.749722
29,30,seeking human resources opportunities (chicago...,390,0.707636
27,28,seeking human resources opportunities (chicago...,390,0.707636
52,53,seeking human resources hris and generalist po...,500+,0.700583
9,10,seeking human resources hris and generalist po...,500+,0.700583


## Bert

In [14]:
# Load a pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

keywords = ["Aspiring human resources", "seeking human resources"]

keyword_similarities = []

# Tokenize job titles and obtain their BERT embeddings
job_title_tokens = [tokenizer(title, return_tensors="pt") for title in talents_df['job_title']]
job_title_embeddings = []

for keyword in keywords:
    # Tokenize the current keyword and obtain its BERT embeddings
    keyword_tokens = tokenizer(keyword, return_tensors="pt")
    with torch.no_grad():
        keyword_embeddings = model(**keyword_tokens).last_hidden_state.mean(dim=1)
    
    # Calculate cosine similarities between the current keyword and job title embeddings
    similarities = []
    with torch.no_grad():
        for tokens in job_title_tokens:
            embeddings = model(**tokens).last_hidden_state.mean(dim=1)
            similarity = cosine_similarity(keyword_embeddings, embeddings).item()
            similarities.append(similarity)

    keyword_similarities.append(similarities)

# Combine or average the similarity scores based on all keywords for each job title
average_similarities = np.mean(keyword_similarities, axis=0)

# Add the average similarity scores to the DataFrame for ranking
talents_df['fit'] = average_similarities

# Sort 
ranked_data = talents_df.sort_values(by='fit', ascending=False)

ranked_data.head()


Unnamed: 0,id,job_title,connection,fit
29,30,seeking human resources opportunities (chicago...,390,0.661207
27,28,seeking human resources opportunities (chicago...,390,0.661207
72,73,"aspiring human resources manager, seeking inte...",7,0.658726
52,53,seeking human resources hris and generalist po...,500+,0.654759
39,40,seeking human resources hris and generalist po...,500+,0.654759


## Sbert

In [15]:
# Load a pre-trained SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

keywords = ["Aspiring human resources", "seeking human resources"]

keyword_similarities = []

# Tokenize job titles and obtain their SBERT embeddings
job_title_embeddings = model.encode(talents_df['job_title'].tolist(), convert_to_tensor=True)

for keyword in keywords:
    # Tokenize the current keyword and obtain its SBERT embedding
    keyword_embedding = model.encode(keyword, convert_to_tensor=True)
    
    # Calculate cosine similarities between the current keyword and job title embeddings
    similarities = util.pytorch_cos_sim(keyword_embedding, job_title_embeddings)
    
    # Convert the PyTorch tensor to a NumPy array before appending
    keyword_similarities.append(similarities.cpu().numpy())

# Combine or average the similarity scores based on all keywords for each job title
average_similarities = np.mean(keyword_similarities, axis=0)

# Reshape the average_similarities array to (104,)
average_similarities = average_similarities[0]

talents_df['fit'] = average_similarities.tolist()

# Sort 
ranked_data = talents_df.sort_values(by='fit', ascending=False)

ranked_data.head()


Unnamed: 0,id,job_title,connection,fit
29,30,seeking human resources opportunities (chicago...,390,0.730671
27,28,seeking human resources opportunities (chicago...,390,0.730671
16,17,aspiring human resources professional (raleigh...,44,0.705262
2,3,aspiring human resources professional (raleigh...,44,0.705262
57,58,aspiring human resources professional (raleigh...,44,0.705262


# Re Rannking

There is two methods to re rank the list:
- First one: get the starred candidate embedding; get the keyword embedding; average both; then calculate cosine similarity 

- Second Method: Merge the starred candidate with the keyword then calculate its embedding;calculate cosine similarity 


In [17]:
# Add cell to manually input starred 

#starred =    # fixed

star_index = int(input("Enter the index of the starred job title: "))
# Check if the input index is valid
if star_index < 0 or star_index >= len(talents_df):
    print("Invalid index. Please enter a valid index.")
    exit()
starred_candidate = talents_df['job_title'][star_index]

Enter the index of the starred job title:  98


In [18]:
starred_candidate

'seeking human resources position (las vegas, nevada area)'

## Method 1

### wor2vec 

In [19]:
# Load the Word2Vec model
model_name = "word2vec-google-news-300"
word_vectors = api.load(model_name)

keywords = ["Aspiring human resources", "seeking human resources"]
star = starred_candidate #example; id=98

keyword_similarities = []

# Tokenize job titles and obtain their Word2Vec embeddings
job_title_tokens = [title.split() for title in talents_cleaned['job_title']]
job_title_embeddings = []

for keyword in keywords:
    # Tokenize the current keyword and obtain its Word2Vec embeddings
    keyword_tokens = keyword.split()
    keyword_vector = np.mean([word_vectors[word] for word in keyword_tokens if word in word_vectors], axis=0)
    
    # Tokenize the "star" snippet and obtain its Word2Vec embeddings
    star_tokens = star.split()
    star_vector = np.mean([word_vectors[word] for word in star_tokens if word in word_vectors], axis=0)
    
    # Calculate the mean embedding
    combined_embedding = (keyword_vector + star_vector) / 2
    
    # Calculate cosine similarities between the current keyword and job title embeddings
    similarities = []
    for tokens in job_title_tokens:
        title_vector = np.mean([word_vectors[word] for word in tokens if word in word_vectors], axis=0)
        similarity = cosine_similarity([combined_embedding], [title_vector])[0][0]
        similarities.append(similarity)

    keyword_similarities.append(similarities)

# Combine or average the similarity scores based on all keywords for each job title
average_similarities = np.mean(keyword_similarities, axis=0)

# Add the average similarity scores to the DataFrame for ranking
talents_cleaned['fit'] = average_similarities

# Sort 
ranked_data = talents_cleaned.sort_values(by='fit', ascending=False)

ranked_data.head()


Unnamed: 0,id,job_title,connection,fit
72,73,aspire human resources manager seek internship...,7,0.764217
52,53,seek human resources hris generalist position ...,500+,0.759148
9,10,seek human resources hris generalist position ...,500+,0.759148
61,62,seek human resources hris generalist position ...,500+,0.759148
39,40,seek human resources hris generalist position ...,500+,0.759148


### TF IDF

In [20]:
# Add cell to manually input starred 

#starred =    # fixed

star_index = int(input("Enter the index of the starred job title: "))
# Check if the input index is valid
if star_index < 0 or star_index >= len(talents_df):
    print("Invalid index. Please enter a valid index.")
    exit()
starred_candidate = talents_cleaned['job_title'][star_index]

Enter the index of the starred job title:  45


In [21]:
keywords = ["Aspiring human resources", "seeking human resources"]
starred = starred_candidate  #example; id=45

# Combine the job titles into a list for TF-IDF processing
job_titles = talents_cleaned['job_title'].tolist()

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(job_titles)

# Calculate cosine similarities for each keyword
keyword_similarities = []
starred_tfidf = tfidf_vectorizer.transform([starred_candidate]) # Transform starred once

for keyword in keywords:
    keyword_tfidf = tfidf_vectorizer.transform([keyword])
    mean_tfidf = np.mean([keyword_tfidf.toarray(), starred_tfidf.toarray()], axis=0)
    similarities = cosine_similarity(mean_tfidf, tfidf_matrix)
    keyword_similarities.append(similarities[0])

# Calculate the average similarity for each job title based on all keywords
combined_scores = np.mean(keyword_similarities, axis=0)
talents_cleaned['fit'] = combined_scores

# Sort 
ranked_data = talents_cleaned.sort_values(by='fit', ascending=False)

ranked_data.head()


Unnamed: 0,id,job_title,connection,fit
32,33,aspire human resources professional raleigh du...,44,0.796495
45,46,aspire human resources professional raleigh du...,44,0.796495
57,58,aspire human resources professional raleigh du...,44,0.796495
16,17,aspire human resources professional raleigh du...,44,0.796495
20,21,aspire human resources professional raleigh du...,44,0.796495


### glovec 

In [22]:
# Add cell to manually input starred 

#starred =    # fixed

star_index = int(input("Enter the index of the starred job title: "))
# Check if the input index is valid
if star_index < 0 or star_index >= len(talents_df):
    print("Invalid index. Please enter a valid index.")
    exit()
starred_candidate = talents_df['job_title'][star_index]

Enter the index of the starred job title:  23


In [23]:
keywords = ["Aspiring human resources", "seeking human resources"]
tokenized_keywords = [word_tokenize(keyword.lower()) for keyword in keywords]

#Load Glove model
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model
embedding_model = load_glove_model('glove.6B.300d.txt')


# Process job_title column using NLTK 
tokenized_titles = [word_tokenize(title.lower()) for title in talents_df['job_title']]

#embedding for starred 
star = starred_candidate                 #example; id= 23
tokenized_star = word_tokenize(star.lower())

word_embeddings = [embedding_model.get(word, np.zeros_like(embedding_model["a"])) for word in tokenized_star]
starx = np.mean(word_embeddings, axis=0)

# Calculate the mean embedding for each keyword
keyword_embeddings = []
for keyword_tokens in tokenized_keywords:
    keyword_embedding = np.mean([embedding_model.get(word, np.zeros_like(embedding_model["a"])) for word in keyword_tokens], axis=0)
    keyword_embeddings.append(tuple(keyword_embedding))

Loading Glove Model
400001 words loaded!


In [24]:
# Combine the keyword embeddings with starx
combined_embeddings = [embedding + starx for embedding in keyword_embeddings]

In [25]:
# Calculate the embeddings for each candidate's job title
candidate_embeddings = []
for candidate_title in tokenized_titles:
    embedding_sum = np.zeros_like(embedding_model["a"])
    word_count = 0
    for word in candidate_title:
        if word in embedding_model:
            embedding_sum += embedding_model[word]
            word_count += 1
    if word_count > 0:
        candidate_embedding = embedding_sum / word_count
    else:
        candidate_embedding = np.zeros_like(embedding_model["a"])
    candidate_embeddings.append(candidate_embedding)
    
    
# Calculate similarity for each candidate
similarity_scores = []
for keyword_embedding in combined_embeddings:
    candidate_similarities = []
    for candidate_embedding in candidate_embeddings:
        similarity = cosine_similarity([candidate_embedding], [keyword_embedding])[0, 0]
        candidate_similarities.append(similarity)
    similarity_scores.append(candidate_similarities)
    

# Combine the similarity scores for all keywords
combined_scores = np.mean(similarity_scores, axis=0)

talents_df['fit'] = combined_scores

ranked_data=talents_df.sort_values(by='fit', ascending=False)

ranked_data.head()

Unnamed: 0,id,job_title,connection,fit
23,24,aspiring human resources specialist (greater n...,1,0.894733
59,60,aspiring human resources specialist (greater n...,1,0.894733
48,49,aspiring human resources specialist (greater n...,1,0.894733
5,6,aspiring human resources specialist (greater n...,1,0.894733
35,36,aspiring human resources specialist (greater n...,1,0.894733


### Bert 

In [26]:
# Add cell to manually input starred 

#starred =    # fixed

star_index = int(input("Enter the index of the starred job title: "))
# Check if the input index is valid
if star_index < 0 or star_index >= len(talents_df):
    print("Invalid index. Please enter a valid index.")
    exit()
starred_candidate = talents_df['job_title'][star_index]

Enter the index of the starred job title:  98


In [27]:
# Load a pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

keywords = ["Aspiring human resources", "seeking human resources"]

star = starred_candidate   #example; id=98

keyword_similarities = []

# Tokenize job titles and obtain their BERT embeddings
job_title_tokens = [tokenizer(title, return_tensors="pt") for title in talents_df['job_title']]
job_title_embeddings = []

for keyword in keywords:
    # Tokenize the current keyword and obtain its BERT embeddings
    keyword_tokens = tokenizer(keyword, return_tensors="pt")
    with torch.no_grad():
        keyword_embeddings = model(**keyword_tokens).last_hidden_state.mean(dim=1).detach().numpy()  # Convert to NumPy
        
    # Tokenize the "star" snippet and obtain its BERT embeddings
    star_tokens = tokenizer(star, return_tensors="pt")
    with torch.no_grad():
        star_embeddings = model(**star_tokens).last_hidden_state.mean(dim=1).detach().numpy()  # Convert to NumPy
    
    # Calculate the mean embedding
    combined_embeddings = np.mean([keyword_embeddings, star_embeddings], axis=0)
    
    # Calculate cosine similarities between the current keyword and job title embeddings
    similarities = []
    with torch.no_grad():
        for tokens in job_title_tokens:
            embeddings = model(**tokens).last_hidden_state.mean(dim=1).detach().numpy()  # Convert to NumPy
            similarity = cosine_similarity(combined_embeddings, embeddings).item()
            similarities.append(similarity)
    
    keyword_similarities.append(similarities)


# Combine or average the similarity scores based on all keywords for each job title
average_similarities = np.mean(keyword_similarities, axis=0)

# Add the average similarity scores to the DataFrame for ranking
talents_df['fit'] = average_similarities

# Sort 
ranked_data = talents_df.sort_values(by='fit', ascending=False)

ranked_data.head()

Unnamed: 0,id,job_title,connection,fit
98,99,"seeking human resources position (las vegas, n...",48,0.908844
72,73,"aspiring human resources manager, seeking inte...",7,0.834817
29,30,seeking human resources opportunities (chicago...,390,0.819818
27,28,seeking human resources opportunities (chicago...,390,0.819818
65,66,experienced retail manager and aspiring human ...,57,0.816122


### Sbert 

In [28]:
# Add cell to manually input starred 

#starred =    # fixed

star_index = int(input("Enter the index of the starred job title: "))
# Check if the input index is valid
if star_index < 0 or star_index >= len(talents_df):
    print("Invalid index. Please enter a valid index.")
    exit()
starred_candidate = talents_df['job_title'][star_index]

Enter the index of the starred job title:  32


In [29]:
# Load a pre-trained SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

keywords = ["Aspiring human resources", "seeking human resources"]
star = starred_candidate           #example; id= 32

keyword_similarities = []

# Tokenize job titles and obtain their SBERT embeddings
job_title_embeddings = model.encode(talents_df['job_title'].tolist(), convert_to_tensor=True)

for keyword in keywords:
    # Tokenize the current keyword and obtain its SBERT embedding
    keyword_embedding = model.encode(keyword, convert_to_tensor=True)
    # Tokenize the "star" snippet and obtain its BERT embeddings
    star_embedding = model.encode(star, convert_to_tensor=True)
    mean_embedding = (keyword_embedding + star_embedding) / 2
    
    # Calculate cosine similarities between the current keyword and job title embeddings
    similarities = util.pytorch_cos_sim(mean_embedding, job_title_embeddings)
    
    # Convert the PyTorch tensor to a NumPy array before appending
    keyword_similarities.append(similarities.cpu().numpy())

# Combine or average the similarity scores based on all keywords for each job title
average_similarities = np.mean(keyword_similarities, axis=0)

# Reshape the average_similarities array to (104,)
average_similarities = average_similarities[0]

talents_df['fit'] = average_similarities.tolist()

ranked_data = talents_df.sort_values(by='fit', ascending=False)

ranked_data.head()


Unnamed: 0,id,job_title,connection,fit
57,58,aspiring human resources professional (raleigh...,44,0.906051
2,3,aspiring human resources professional (raleigh...,44,0.906051
32,33,aspiring human resources professional (raleigh...,44,0.906051
45,46,aspiring human resources professional (raleigh...,44,0.906051
20,21,aspiring human resources professional (raleigh...,44,0.906051


## Method 2

### Word2Vec

In [30]:
# Add cell to manually input starred 

#starred =    # fixed

star_index = int(input("Enter the index of the starred job title: "))
# Check if the input index is valid
if star_index < 0 or star_index >= len(talents_df):
    print("Invalid index. Please enter a valid index.")
    exit()
starred_candidate = talents_df['job_title'][star_index]

Enter the index of the starred job title:  98


In [31]:
model_name = "word2vec-google-news-300"
word_vectors = api.load(model_name)

keywords = ["Aspiring human resources", "seeking human resources"]
starred = starred_candidate   #example; id=98
keywords = ["Aspiring human resources", "seeking human resources"] + [starred]

# Create an empty list to store the similarity scores for each keyword
keyword_similarities = []

# Tokenize job titles and obtain their Word2Vec embeddings
job_title_tokens = [title.split() for title in talents_cleaned['job_title']]
job_title_embeddings = []

for keyword in keywords:
    # Tokenize the current keyword and obtain its Word2Vec embeddings
    keyword_tokens = keyword.split()
    keyword_embedding = np.mean([word_vectors[word] for word in keyword_tokens if word in word_vectors], axis=0)
    
    # Calculate cosine similarities between the current keyword and job title embeddings
    similarities = []
    for tokens in job_title_tokens:
        title_vector = np.mean([word_vectors[word] for word in tokens if word in word_vectors], axis=0)
        similarity = cosine_similarity([keyword_embedding], [title_vector])[0][0]
        similarities.append(similarity)
    
    keyword_similarities.append(similarities)

# Combine or average the similarity scores based on all keywords for each job title
average_similarities = np.mean(keyword_similarities, axis=0)

# Add the average similarity scores to the DataFrame for ranking
talents_cleaned['fit'] = average_similarities

# Sort 
ranked_data = talents_cleaned.sort_values(by='fit', ascending=False)

ranked_data.head()

Unnamed: 0,id,job_title,connection,fit
72,73,aspire human resources manager seek internship...,7,0.707149
52,53,seek human resources hris generalist position ...,500+,0.698815
61,62,seek human resources hris generalist position ...,500+,0.698815
39,40,seek human resources hris generalist position ...,500+,0.698815
9,10,seek human resources hris generalist position ...,500+,0.698815


### TF-IDF  

In [32]:
# Add cell to manually input starred 

#starred =    # fixed

star_index = int(input("Enter the index of the starred job title: "))
# Check if the input index is valid
if star_index < 0 or star_index >= len(talents_df):
    print("Invalid index. Please enter a valid index.")
    exit()
starred_candidate = talents_cleaned['job_title'][star_index]

Enter the index of the starred job title:  45


In [33]:
keywords = ["Aspiring human resources", "seeking human resources"]
starred = starred_candidate  #example; id= 45
keywords = ["Aspiring human resources", "seeking human resources"] + [starred]

# Combine the job titles into a list for TF-IDF processing
job_titles = talents_cleaned['job_title'].tolist()

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(job_titles)

# Calculate cosine similarities for each keyword
keyword_similarities = []
for keyword in keywords:
    keyword_tfidf = tfidf_vectorizer.transform([keyword])
    similarities = cosine_similarity(keyword_tfidf, tfidf_matrix)
    keyword_similarities.append(similarities[0])

# Calculate the average similarity for each job title based on all keywords
combined_scores = np.mean(keyword_similarities, axis=0)
talents_cleaned['fit'] = combined_scores

# Sort 
ranked_data = talents_cleaned.sort_values(by='fit', ascending=False)

ranked_data.head()


Unnamed: 0,id,job_title,connection,fit
57,58,aspire human resources professional raleigh du...,44,0.512539
2,3,aspire human resources professional raleigh du...,44,0.512539
32,33,aspire human resources professional raleigh du...,44,0.512539
16,17,aspire human resources professional raleigh du...,44,0.512539
45,46,aspire human resources professional raleigh du...,44,0.512539


### Glovec

In [34]:
# Add cell to manually input starred 

#starred =    # fixed

star_index = int(input("Enter the index of the starred job title: "))
# Check if the input index is valid
if star_index < 0 or star_index >= len(talents_df):
    print("Invalid index. Please enter a valid index.")
    exit()
starred_candidate = talents_df['job_title'][star_index]

Enter the index of the starred job title:  23


In [35]:
keywords = ["Aspiring human resources", "seeking human resources"]
starred = starred_candidate    #example; id= 23
# Merge all 
keywords = ["Aspiring human resources", "seeking human resources"] + [starred]

tokenized_keywords = [word_tokenize(keyword.lower()) for keyword in keywords]

#Load Glove model
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model
embedding_model = load_glove_model('glove.6B.300d.txt')

# Process job_title column using NLTK 
tokenized_titles = [word_tokenize(title.lower()) for title in talents_df['job_title']]

# Calculate the mean embedding for each keyword
keyword_embeddings = []
for keyword_tokens in tokenized_keywords:
    keyword_embedding = np.mean([embedding_model.get(word, np.zeros_like(embedding_model["a"])) for word in keyword_tokens], axis=0)
    keyword_embeddings.append(tuple(keyword_embedding))

Loading Glove Model
400001 words loaded!


In [36]:
# Calculate the embeddings for each candidate's job title
candidate_embeddings = []
for candidate_title in tokenized_titles:
    embedding_sum = np.zeros_like(embedding_model["a"])
    word_count = 0
    for word in candidate_title:
        if word in embedding_model:
            embedding_sum += embedding_model[word]
            word_count += 1
    if word_count > 0:
        candidate_embedding = embedding_sum / word_count
    else:
        candidate_embedding = np.zeros_like(embedding_model["a"])
    candidate_embeddings.append(candidate_embedding)
    
# Calculate similarity for each candidate
similarity_scores = []
for keyword_embedding in keyword_embeddings:
    candidate_similarities = []
    for candidate_embedding in candidate_embeddings:
        similarity = cosine_similarity([candidate_embedding], [keyword_embedding])[0, 0]
        candidate_similarities.append(similarity)
    similarity_scores.append(candidate_similarities)

# Combine the similarity scores for all keywords
combined_scores = np.mean(similarity_scores, axis=0)

talents_df['fit'] = combined_scores

ranked_data=talents_df.sort_values(by='fit', ascending=False)

ranked_data.head()

Unnamed: 0,id,job_title,connection,fit
72,73,"aspiring human resources manager, seeking inte...",7,0.801039
23,24,aspiring human resources specialist (greater n...,1,0.789056
59,60,aspiring human resources specialist (greater n...,1,0.789056
48,49,aspiring human resources specialist (greater n...,1,0.789056
5,6,aspiring human resources specialist (greater n...,1,0.789056


### Bert

In [37]:
# Add cell to manually input starred 

#starred =    # fixed

star_index = int(input("Enter the index of the starred job title: "))
# Check if the input index is valid
if star_index < 0 or star_index >= len(talents_df):
    print("Invalid index. Please enter a valid index.")
    exit()
starred_candidate = talents_df['job_title'][star_index]

Enter the index of the starred job title:  98


In [38]:
# Load a pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

keywords = ["Aspiring human resources", "seeking human resources"]
starred = starred_candidate      #example; id=98
keywords = ["Aspiring human resources", "seeking human resources"] + [starred]

keyword_similarities = []

# Tokenize job titles and obtain their BERT embeddings
job_title_tokens = [tokenizer(title, return_tensors="pt") for title in talents_df['job_title']]
job_title_embeddings = []

for keyword in keywords:
    # Tokenize the current keyword and obtain its BERT embeddings
    keyword_tokens = tokenizer(keyword, return_tensors="pt")
    with torch.no_grad():
        keyword_embeddings = model(**keyword_tokens).last_hidden_state.mean(dim=1)
    
    # Calculate cosine similarities between the current keyword and job title embeddings
    similarities = []
    with torch.no_grad():
        for tokens in job_title_tokens:
            embeddings = model(**tokens).last_hidden_state.mean(dim=1)
            similarity = cosine_similarity(keyword_embeddings, embeddings).item()
            similarities.append(similarity)
    
    keyword_similarities.append(similarities)

# Combine or average the similarity scores based on all keywords for each job title
average_similarities = np.mean(keyword_similarities, axis=0)

# Add the average similarity scores to the DataFrame for ranking
talents_df['fit'] = average_similarities

# Sort 
ranked_data = talents_df.sort_values(by='fit', ascending=False)

ranked_data.head()

Unnamed: 0,id,job_title,connection,fit
98,99,"seeking human resources position (las vegas, n...",48,0.75745
72,73,"aspiring human resources manager, seeking inte...",7,0.721521
29,30,seeking human resources opportunities (chicago...,390,0.713726
27,28,seeking human resources opportunities (chicago...,390,0.713726
66,67,"human resources, staffing and recruiting profe...",500+,0.705985


### Sbert

In [39]:
# Add cell to manually input starred 

#starred =    # fixed

star_index = int(input("Enter the index of the starred job title: "))
# Check if the input index is valid
if star_index < 0 or star_index >= len(talents_df):
    print("Invalid index. Please enter a valid index.")
    exit()
starred_candidate = talents_df['job_title'][star_index]

Enter the index of the starred job title:  32


In [40]:
# Load a pre-trained SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Keywords for ranking
keywords = ["Aspiring human resources", "seeking human resources"]
starred = starred_candidate  #example; id= 32
keywords = ["Aspiring human resources", "seeking human resources"] + [starred]

keyword_similarities = []

# Tokenize job titles and obtain their SBERT embeddings
job_title_embeddings = model.encode(talents_df['job_title'].tolist(), convert_to_tensor=True)

for keyword in keywords:
    # Tokenize the current keyword and obtain its SBERT embedding
    keyword_embedding = model.encode(keyword, convert_to_tensor=True)
    
    # Calculate cosine similarities between the current keyword and job title embeddings
    similarities = util.pytorch_cos_sim(keyword_embedding, job_title_embeddings)
    
    # Convert the PyTorch tensor to a NumPy array before appending
    keyword_similarities.append(similarities.cpu().numpy())

# Combine or average the similarity scores based on all keywords for each job title
average_similarities = np.mean(keyword_similarities, axis=0)

# Reshape the average_similarities array to (104,)
average_similarities = average_similarities[0]

# Add the similarity scores to the DataFrame for ranking
talents_df['fit'] = average_similarities.tolist()

# Sort 
ranked_data = talents_df.sort_values(by='fit', ascending=False)

ranked_data.head()

Unnamed: 0,id,job_title,connection,fit
16,17,aspiring human resources professional (raleigh...,44,0.803508
2,3,aspiring human resources professional (raleigh...,44,0.803508
57,58,aspiring human resources professional (raleigh...,44,0.803508
32,33,aspiring human resources professional (raleigh...,44,0.803508
45,46,aspiring human resources professional (raleigh...,44,0.803508
