In [1]:
from google.colab import files
uploaded = files.upload()

Saving prj3_cleaned_data.csv to prj3_cleaned_data (9).csv


In [2]:
import numpy as np
import pandas as pd

data = pd.read_csv("prj3_cleaned_data.csv")
data.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 ct bauer college of business graduate mag...,houston texas,85,
1,2,native english teacher at epik english program...,kanada,500+,
2,3,aspiring human resources professional,raleigh-durham north carolina area,44,
3,4,people development coordinator at ryan,denton texas,500+,
4,5,advisory board member at celal bayar university,i̇zmir türkiye,500+,


# [Bag-of-Words Model](https://spotintelligence.com/2022/12/20/bag-of-words-python/)


### Extract words

In [3]:
# joining all excisting columns
data_concat = pd.DataFrame()
data_concat["data_concat"] = data[["job_title", "location", "connection"]].apply(" ".join, axis=1)

data_concat.head()

Unnamed: 0,data_concat
0,2019 ct bauer college of business graduate mag...
1,native english teacher at epik english program...
2,aspiring human resources professional raleigh-...
3,people development coordinator at ryan denton ...
4,advisory board member at celal bayar universit...


In [4]:
# making dataframe into a list of srings
data_concat_list = data_concat["data_concat"]
data_concat_list = data_concat_list.tolist()
print(data_concat_list)

['2019 ct bauer college of business graduate magna cum laude and aspiring human resources professional houston texas 85', 'native english teacher at epik english program in korea kanada 500+ ', 'aspiring human resources professional raleigh-durham north carolina area 44', 'people development coordinator at ryan denton texas 500+ ', 'advisory board member at celal bayar university i̇zmir türkiye 500+ ', 'aspiring human resources specialist greater new york city area 1', 'student at humber college and aspiring human resources generalist kanada 61', 'hr senior specialist san francisco bay area 500+ ', 'student at humber college and aspiring human resources generalist kanada 61', 'seeking human resources hris and generalist positions greater philadelphia area 500+ ', 'student at chapman university lake forest california 2', 'svp chro marketing & communications csr officer  engie  houston  the woodlands  energy  gphr  sphr houston texas area 500+ ', 'human resources coordinator at intercont

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_1 = CountVectorizer(ngram_range=(1,3), max_features=200, stop_words='english')
bow_countvec = vectorizer_1.fit(data_concat_list)

vectorizer_2 = CountVectorizer()
query = ["aspiring  human resources", "seeking human resources"]
query_countvec = vectorizer_2.fit(query)

print(sorted(bow_countvec.vocabulary_))
print(sorted(query_countvec.vocabulary_))

['2019', '2019 ct', '2019 ct bauer', '44', '500', '61', '85', 'area', 'area 44', 'area 500', 'aspiring', 'aspiring human', 'aspiring human resources', 'atlanta', 'bauer', 'bauer college', 'bauer college business', 'bay', 'bay area', 'bay area 500', 'business', 'business graduate', 'business graduate magna', 'california', 'carolina', 'carolina area', 'carolina area 44', 'city', 'city area', 'college', 'college aspiring', 'college aspiring human', 'college business', 'college business graduate', 'communications', 'communications csr', 'communications csr officer', 'coordinator', 'coordinator intercontinental', 'coordinator intercontinental buckhead', 'coordinator ryan', 'coordinator ryan denton', 'ct', 'ct bauer', 'ct bauer college', 'cum', 'cum laude', 'cum laude aspiring', 'denton', 'denton texas', 'denton texas 500', 'development', 'development coordinator', 'development coordinator ryan', 'durham', 'durham north', 'durham north carolina', 'energy gphr', 'energy gphr sphr', 'engie', '

### Score words and create vector

In [6]:
vector_1 = bow_countvec.transform(data_concat_list)
vector_bow = vector_1.toarray()
print('Bag of Words Vector:')
print('shape: ',vector_bow.shape)
print(vector_bow)

Bag of Words Vector:
shape:  (104, 200)
[[1 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
vector_2 = bow_countvec.transform(query)
vector_bow_query = vector_2.toarray()
print('Bag of Words Query Vector:')
print('shape: ',vector_bow_query.shape)
print(vector_bow_query)

###Get Cosine Similarity Score

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(vector_bow_query, vector_bow)

print("array of cosine similarities: ")
print("shape: ", similarities.shape)
print(similarities)

###Top 7 Profiles

In [9]:
ranking = np.argsort(similarities[0])[::-1]

top_n = 7
#top_profiles = data_concat.iloc[ranking[:top_n]]
print('Top 7 Profiles: \n', data_concat.iloc[ranking[:top_n]])

Top 7 Profiles: 
                                           data_concat
78  liberal arts major aspiring human resources an...
99  aspiring human resources manager  graduating m...
71  business management major and aspiring human r...
96  aspiring human resources professional kokomo i...
72  aspiring human resources manager seeking inter...
81  aspiring human resources professional  an ener...
65  experienced retail manager and aspiring human ...


###Re-ranking Based on Preference

In [10]:
# display available profiles
print("Available Profiles:")
print(data_concat.iloc[ranking[:top_n]], '\n')

# prompt user to select a profile
selected_index = input("Enter the index of the profile you want to select: ")

# convert the input to an integer
selected_index = int(selected_index)

# filter the DataFrame based on the selected ID
##selected_profile = top_profiles[top_profiles['id'] == selected_id]
selected_profile = data_concat['data_concat'].iloc[selected_index]

# display the selected profile
print("\nSelected Profile:")
print(selected_profile)

Available Profiles:
                                          data_concat
78  liberal arts major aspiring human resources an...
99  aspiring human resources manager  graduating m...
71  business management major and aspiring human r...
96  aspiring human resources professional kokomo i...
72  aspiring human resources manager seeking inter...
81  aspiring human resources professional  an ener...
65  experienced retail manager and aspiring human ... 

Enter the index of the profile you want to select: 96

Selected Profile:
aspiring human resources professional kokomo indiana area 71


In [11]:
new_query = [selected_profile]
print(new_query)

['aspiring human resources professional kokomo indiana area 71']


In [None]:
vector_2 = bow_countvec.transform(new_query)
vector_bow_query = vector_2.toarray()
print('Bag of Words Query Vector:')
print('shape: ',vector_bow_query.shape)
print(vector_bow_query)

In [None]:
similarities = cosine_similarity(vector_bow_query, vector_bow)
print("array of cosine similarities: ")
print("shape: ", similarities.shape)
print(similarities)

In [14]:
ranking = np.argsort(similarities[0])[::-1]

top_n = 7
#top_profiles = data_concat.iloc[ranking[:top_n]]
print('Top 7 Profiles: \n', data[['id', 'job_title']].iloc[ranking[:top_n]])

Top 7 Profiles: 
     id                                          job_title
96  97              aspiring human resources professional
81  82  aspiring human resources professional  an ener...
65  66  experienced retail manager and aspiring human ...
78  79  liberal arts major aspiring human resources an...
73  74                       human resources professional
83  84  human resources professional for the world lea...
71  72  business management major and aspiring human r...


#[TF IDF Model](https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/)

###Preprocess

In [3]:
# joining all excisting columns
data_concat = pd.DataFrame()
data_concat["data_concat"] = data[["job_title", "location", "connection"]].apply(" ".join, axis=1)

data_concat.head()

Unnamed: 0,data_concat
0,2019 ct bauer college of business graduate mag...
1,native english teacher at epik english program...
2,aspiring human resources professional raleigh-...
3,people development coordinator at ryan denton ...
4,advisory board member at celal bayar universit...


In [4]:
# making dataframe into a list of srings
data_concat_list = data_concat["data_concat"]
data_concat_list = data_concat_list.tolist()
print(data_concat_list)

['2019 ct bauer college of business graduate magna cum laude and aspiring human resources professional houston texas 85', 'native english teacher at epik english program in korea kanada 500+ ', 'aspiring human resources professional raleigh-durham north carolina area 44', 'people development coordinator at ryan denton texas 500+ ', 'advisory board member at celal bayar university i̇zmir türkiye 500+ ', 'aspiring human resources specialist greater new york city area 1', 'student at humber college and aspiring human resources generalist kanada 61', 'hr senior specialist san francisco bay area 500+ ', 'student at humber college and aspiring human resources generalist kanada 61', 'seeking human resources hris and generalist positions greater philadelphia area 500+ ', 'student at chapman university lake forest california 2', 'svp chro marketing & communications csr officer  engie  houston  the woodlands  energy  gphr  sphr houston texas area 500+ ', 'human resources coordinator at intercont

###Get TF IDF Values

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

query_tfidf = ["aspiring  human resources", "seeking human resources"]

tfidf = TfidfVectorizer(stop_words='english')

tfidf_value = tfidf.fit_transform(data_concat_list + query_tfidf)

terms = tfidf.get_feature_names_out()
idf_scores = tfidf.idf_
terms_idf_scores = list(zip(terms, idf_scores))
terms_idf_scores.sort(key=lambda x: x[1], reverse=True)

# get idf values
print('\nidf values:')
for ele1, ele2 in terms_idf_scores:
    print(ele1, ':', ele2)

# get indexing
print('\nWord indexes:')
print(tfidf.vocabulary_)

# display tf-idf values
print('\n tf-idf value:')
print(tfidf_value)
print(tfidf_value.shape)

###Get Cosine Similarity Score

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
similarities_tfidf = cosine_similarity(tfidf_value[-len(query_tfidf):], tfidf_value[:-len(query_tfidf)])

print("array of cosine similarities: ")
print("shape: ", similarities_tfidf.shape)
print(similarities_tfidf)

###Top 7 Profiles

In [7]:
ranking_tfidf = np.argsort(similarities_tfidf[0])[::-1]

top_n = 7
print('Top 7 Profiles: \n', data_concat.iloc[ranking_tfidf[:top_n]])

Top 7 Profiles: 
                                           data_concat
72  aspiring human resources manager seeking inter...
59  aspiring human resources specialist greater ne...
5   aspiring human resources specialist greater ne...
48  aspiring human resources specialist greater ne...
35  aspiring human resources specialist greater ne...
23  aspiring human resources specialist greater ne...
51  student at humber college and aspiring human r...


###Re-ranking Based on Preference

In [8]:
# display available profiles
print("Available Profiles:")
print(data_concat.iloc[ranking_tfidf[:top_n]], '\n')

# prompt user to select a profile
selected_index = input("Enter the index of the profile you want to select: ")

# convert the input to an integer
selected_index = int(selected_index)

# filter the DataFrame based on the selected index
selected_profile_tfidf = data_concat['data_concat'].iloc[selected_index]

# display the selected profile
print("\nSelected Profile:")
print(selected_profile_tfidf)

Available Profiles:
                                          data_concat
72  aspiring human resources manager seeking inter...
59  aspiring human resources specialist greater ne...
5   aspiring human resources specialist greater ne...
48  aspiring human resources specialist greater ne...
35  aspiring human resources specialist greater ne...
23  aspiring human resources specialist greater ne...
51  student at humber college and aspiring human r... 

Enter the index of the profile you want to select: 96

Selected Profile:
aspiring human resources professional kokomo indiana area 71


In [9]:
new_query = [selected_profile_tfidf]
print(new_query)

['aspiring human resources professional kokomo indiana area 71']


In [None]:
tfidf_value = tfidf.fit_transform(data_concat_list + new_query)

similarities_tfidf = cosine_similarity(tfidf_value[-len(new_query):], tfidf_value[:-len(new_query)])

print("array of cosine similarities: ", similarities_tfidf)
print("shape: ", similarities_tfidf.shape)

In [11]:
ranking_tfidf = np.argsort(similarities_tfidf[0])[::-1]

top_n = 7
print('Top 7 Profiles: \n', data[['id', 'job_title']].iloc[ranking_tfidf[:top_n]])

Top 7 Profiles: 
     id                                          job_title
96  97              aspiring human resources professional
95  96  student at indiana university kokomo - busines...
16  17              aspiring human resources professional
45  46              aspiring human resources professional
2    3              aspiring human resources professional
32  33              aspiring human resources professional
20  21              aspiring human resources professional


# [Word2Vec Model](https://medium.com/@dilip.voleti/classification-using-word2vec-b1d79d375381)

###Preprocessing for Word2Vec

In [None]:
import nltk
nltk.download()

In [4]:
# joining all excisting columns
data_concat = pd.DataFrame()
data_concat["data_concat"] = data[["job_title", "location", "connection"]].apply(" ".join, axis=1)

data_concat.head()

Unnamed: 0,data_concat
0,2019 ct bauer college of business graduate mag...
1,native english teacher at epik english program...
2,aspiring human resources professional raleigh-...
3,people development coordinator at ryan denton ...
4,advisory board member at celal bayar universit...


In [5]:
data_concat_w2v = pd.DataFrame()
data_concat_w2v["text"] = data_concat["data_concat"]

data_concat_w2v.head()

Unnamed: 0,text
0,2019 ct bauer college of business graduate mag...
1,native english teacher at epik english program...
2,aspiring human resources professional raleigh-...
3,people development coordinator at ryan denton ...
4,advisory board member at celal bayar universit...


In [6]:
# remove all punctuation, remove stop words and tokenize using built in data cleaner in gensim
import gensim

data_concat_w2v['clean_text'] = data_concat_w2v['text'].apply(lambda x: gensim.parsing.preprocessing.remove_stopwords(x),
                                                                    lambda x: gensim.parsing.preprocessing.remove_strip_punctuation(x))

data_concat_w2v.head()

Unnamed: 0,text,clean_text
0,2019 ct bauer college of business graduate mag...,2019 ct bauer college business graduate magna ...
1,native english teacher at epik english program...,native english teacher epik english program ko...
2,aspiring human resources professional raleigh-...,aspiring human resources professional raleigh-...
3,people development coordinator at ryan denton ...,people development coordinator ryan denton tex...
4,advisory board member at celal bayar universit...,advisory board member celal bayar university i...


In [7]:
from gensim.parsing.preprocessing import preprocess_string

data_concat_w2v['processed_text'] = [preprocess_string(sentence, [lambda x: x.lower()]) for sentence in data_concat_w2v['clean_text']]
data_concat_w2v.head()

Unnamed: 0,text,clean_text,processed_text
0,2019 ct bauer college of business graduate mag...,2019 ct bauer college business graduate magna ...,"[2019, ct, bauer, college, business, graduate,..."
1,native english teacher at epik english program...,native english teacher epik english program ko...,"[native, english, teacher, epik, english, prog..."
2,aspiring human resources professional raleigh-...,aspiring human resources professional raleigh-...,"[aspiring, human, resources, professional, ral..."
3,people development coordinator at ryan denton ...,people development coordinator ryan denton tex...,"[people, development, coordinator, ryan, dento..."
4,advisory board member at celal bayar universit...,advisory board member celal bayar university i...,"[advisory, board, member, celal, bayar, univer..."


In [8]:
data_vectors_w2v = data_concat_w2v['processed_text'].tolist()
print(data_vectors_w2v)

[['2019', 'ct', 'bauer', 'college', 'business', 'graduate', 'magna', 'cum', 'laude', 'aspiring', 'human', 'resources', 'professional', 'houston', 'texas', '85'], ['native', 'english', 'teacher', 'epik', 'english', 'program', 'korea', 'kanada', '500+'], ['aspiring', 'human', 'resources', 'professional', 'raleigh-durham', 'north', 'carolina', 'area', '44'], ['people', 'development', 'coordinator', 'ryan', 'denton', 'texas', '500+'], ['advisory', 'board', 'member', 'celal', 'bayar', 'university', 'i̇zmir', 'türkiye', '500+'], ['aspiring', 'human', 'resources', 'specialist', 'greater', 'new', 'york', 'city', 'area', '1'], ['student', 'humber', 'college', 'aspiring', 'human', 'resources', 'generalist', 'kanada', '61'], ['hr', 'senior', 'specialist', 'san', 'francisco', 'bay', 'area', '500+'], ['student', 'humber', 'college', 'aspiring', 'human', 'resources', 'generalist', 'kanada', '61'], ['seeking', 'human', 'resources', 'hris', 'generalist', 'positions', 'greater', 'philadelphia', 'area',

In [9]:
query_w2v = ["aspiring  human resources", "seeking human resources"]
print(query_w2v)

['aspiring  human resources', 'seeking human resources']


In [10]:
query_w2v_df = pd.DataFrame(query_w2v, columns=['query'])
query_w2v_df['processed_query'] = [preprocess_string(sentence, [lambda x: x.lower()]) for sentence in query_w2v_df['query']]
query_w2v_df.head()

Unnamed: 0,query,processed_query
0,aspiring human resources,"[aspiring, human, resources]"
1,seeking human resources,"[seeking, human, resources]"


In [11]:
query_vectors_w2v = query_w2v_df['processed_query']
print(type(query_vectors_w2v))

<class 'pandas.core.series.Series'>


In [12]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(data_vectors_w2v, min_count=0)

#query_vectors_w2v = query_vectors_w2v.apply(lambda x: w2v_model.wv[x])
#data_vectors_w2v = data_vectors_w2v.apply(lambda x: w2v_model.wv[x])

data_concat_w2v['processed_text_vec'] = data_concat_w2v['processed_text'].apply(lambda tokens: np.mean([w2v_model.wv[word] for word in tokens], axis=0))
sentence_vectors_w2v = [np.mean([w2v_model.wv[word] for word in sentence.split()], axis=0) for sentence in query_w2v]
query_vector_w2v = np.mean(sentence_vectors_w2v, axis=0)

In [None]:
print(data_concat_w2v)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
data_concat_w2v['cosine_similarity'] = data_concat_w2v['processed_text'].apply(lambda profile_tokens: cosine_similarity([query_vector_w2v], [np.mean([w2v_model.wv[word] for word in profile_tokens], axis=0)])[0][0])

In [15]:
data_concat_w2v.head()

Unnamed: 0,text,clean_text,processed_text,processed_text_vec,cosine_similarity
0,2019 ct bauer college of business graduate mag...,2019 ct bauer college business graduate magna ...,"[2019, ct, bauer, college, business, graduate,...","[0.00096714054, -0.002497302, 0.0017096938, 0....",0.579338
1,native english teacher at epik english program...,native english teacher epik english program ko...,"[native, english, teacher, epik, english, prog...","[0.0011231703, 0.0028241435, -0.0007436086, 0....",0.163021
2,aspiring human resources professional raleigh-...,aspiring human resources professional raleigh-...,"[aspiring, human, resources, professional, ral...","[0.00070970313, 0.0031459108, -1.7731762e-05, ...",0.559604
3,people development coordinator at ryan denton ...,people development coordinator ryan denton tex...,"[people, development, coordinator, ryan, dento...","[-0.0037920654, 0.0027148367, -0.004551845, -0...",0.051331
4,advisory board member at celal bayar universit...,advisory board member celal bayar university i...,"[advisory, board, member, celal, bayar, univer...","[-0.0017228854, 0.0063283537, 0.00059926434, -...",0.081977


###Top 7 Profiles

In [16]:
cols_w2v = ['text', 'cosine_similarity']
sorted_profiles = data_concat_w2v[cols_w2v].copy()
sorted_profiles.sort_values(by='cosine_similarity', ascending=False).head(7)

Unnamed: 0,text,cosine_similarity
72,aspiring human resources manager seeking inter...,0.833893
99,aspiring human resources manager graduating m...,0.755304
36,student at humber college and aspiring human r...,0.704878
38,student at humber college and aspiring human r...,0.704878
6,student at humber college and aspiring human r...,0.704878
8,student at humber college and aspiring human r...,0.704878
24,student at humber college and aspiring human r...,0.704878


###Reranking Based on Preference

In [17]:
# display available profiles
print("Available Profiles:")
print(sorted_profiles.sort_values(by='cosine_similarity', ascending=False).head(7)['text'], '\n')

# prompt user to select a profile
selected_index = input("Enter the index of the profile you want to select: ")

# convert the input to an integer
selected_index = int(selected_index)

# filter the DataFrame based on the selected ID
selected_profile_w2v = sorted_profiles['text'].iloc[selected_index]

# display the selected profile
print("\nSelected Profile:")
print(selected_profile_w2v)

Available Profiles:
72    aspiring human resources manager seeking inter...
99    aspiring human resources manager  graduating m...
36    student at humber college and aspiring human r...
38    student at humber college and aspiring human r...
6     student at humber college and aspiring human r...
8     student at humber college and aspiring human r...
24    student at humber college and aspiring human r...
Name: text, dtype: object 

Enter the index of the profile you want to select: 96

Selected Profile:
aspiring human resources professional kokomo indiana area 71


In [18]:
query_w2v = [selected_profile_w2v]
print(query_w2v)

['aspiring human resources professional kokomo indiana area 71']


In [19]:
sentence_vectors_w2v = [
    np.mean([w2v_model.wv[word] for word in sentence.split() if word in w2v_model.wv], axis=0)
    for sentence in query_w2v
]
query_vector_w2v = np.mean(sentence_vectors_w2v, axis=0)

data_concat_w2v['cosine_similarity'] = data_concat_w2v['processed_text'].apply(lambda profile_tokens: cosine_similarity([query_vector_w2v], [np.mean([w2v_model.wv[word] for word in profile_tokens], axis=0)])[0][0])
data_concat_w2v.head()

Unnamed: 0,text,clean_text,processed_text,processed_text_vec,cosine_similarity
0,2019 ct bauer college of business graduate mag...,2019 ct bauer college business graduate magna ...,"[2019, ct, bauer, college, business, graduate,...","[0.00096714054, -0.002497302, 0.0017096938, 0....",0.621924
1,native english teacher at epik english program...,native english teacher epik english program ko...,"[native, english, teacher, epik, english, prog...","[0.0011231703, 0.0028241435, -0.0007436086, 0....",0.139142
2,aspiring human resources professional raleigh-...,aspiring human resources professional raleigh-...,"[aspiring, human, resources, professional, ral...","[0.00070970313, 0.0031459108, -1.7731762e-05, ...",0.752929
3,people development coordinator at ryan denton ...,people development coordinator ryan denton tex...,"[people, development, coordinator, ryan, dento...","[-0.0037920654, 0.0027148367, -0.004551845, -0...",0.133088
4,advisory board member at celal bayar universit...,advisory board member celal bayar university i...,"[advisory, board, member, celal, bayar, univer...","[-0.0017228854, 0.0063283537, 0.00059926434, -...",0.074076


In [22]:
cols_w2v = ['text', 'cosine_similarity']
sorted_profiles = data_concat_w2v[cols_w2v].copy()
sorted_profiles.sort_values(by='cosine_similarity', ascending=False).head(7)

Unnamed: 0,text,cosine_similarity
96,aspiring human resources professional kokomo i...,1.0
16,aspiring human resources professional raleigh-...,0.752929
2,aspiring human resources professional raleigh-...,0.752929
45,aspiring human resources professional raleigh-...,0.752929
32,aspiring human resources professional raleigh-...,0.752929
57,aspiring human resources professional raleigh-...,0.752929
20,aspiring human resources professional raleigh-...,0.752929


#BERT Model

###Preprocess

In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
tqdm.pandas()

In [5]:
# loading BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


In [6]:
# joining all excisting columns
data_concat = pd.DataFrame()
data_concat["data_concat"] = data[["job_title", "location", "connection"]].apply(" ".join, axis=1)

data_concat.head()

Unnamed: 0,data_concat
0,2019 ct bauer college of business graduate mag...
1,native english teacher at epik english program...
2,aspiring human resources professional raleigh-...
3,people development coordinator at ryan denton ...
4,advisory board member at celal bayar universit...


In [7]:
# Preprocess query sentences
query_sentences = ["aspiring human resources", "seeking human resources"]
query_embeddings = [model(**tokenizer(sentence, return_tensors='pt'))['last_hidden_state'] for sentence in query_sentences]

# Preprocess profiles
profile_embeddings = data_concat["data_concat"].progress_apply(lambda title: model(**tokenizer(title, return_tensors='pt'))['last_hidden_state'])

100%|██████████| 104/104 [00:10<00:00,  9.97it/s]


In [8]:
# Calculate cosine similarity
similarities = profile_embeddings.apply(lambda profile: cosine_similarity(profile.mean(axis=1).detach().numpy(), query_embeddings[0].mean(axis=1).detach().numpy()).item())
similarities += profile_embeddings.apply(lambda profile: cosine_similarity(profile.mean(axis=1).detach().numpy(), query_embeddings[1].mean(axis=1).detach().numpy()).item())
data_concat['cosine_similarity'] = similarities

###Top 7 Profiles

In [9]:
# Get the 7 most similar profiles
##print(data_concat.nlargest(7, 'cosine_similarity'))
data_concat.sort_values(by='cosine_similarity', ascending=False).head(7)

Unnamed: 0,data_concat,cosine_similarity
27,seeking human resources opportunities chicago ...,1.548901
29,seeking human resources opportunities chicago ...,1.548901
72,aspiring human resources manager seeking inter...,1.425465
61,seeking human resources hris and generalist po...,1.379925
9,seeking human resources hris and generalist po...,1.379925
52,seeking human resources hris and generalist po...,1.379925
39,seeking human resources hris and generalist po...,1.379925


###Reranking Based on Prefernece

In [10]:
# display available profiles
print("Available Profiles:")
print(data_concat.sort_values(by='cosine_similarity', ascending=False).head(7)['data_concat'], '\n')

# prompt user to select a profile
selected_index = input("Enter the index of the profile you want to select: ")

# convert the input to an integer
selected_index = int(selected_index)

# filter the DataFrame based on the selected ID
selected_profile_bert = data_concat['data_concat'].iloc[selected_index]

# display the selected profile
print("\nSelected Profile:")
print(selected_profile_bert)

Available Profiles:
27    seeking human resources opportunities chicago ...
29    seeking human resources opportunities chicago ...
72    aspiring human resources manager seeking inter...
61    seeking human resources hris and generalist po...
9     seeking human resources hris and generalist po...
52    seeking human resources hris and generalist po...
39    seeking human resources hris and generalist po...
Name: data_concat, dtype: object 

Enter the index of the profile you want to select: 96

Selected Profile:
aspiring human resources professional kokomo indiana area 71


In [11]:
query_sentences = [selected_profile_bert]
query_embeddings = [model(**tokenizer(sentence, return_tensors='pt'))['last_hidden_state'] for sentence in query_sentences]

In [12]:
# Calculate cosine similarity
similarities = profile_embeddings.apply(lambda profile: cosine_similarity(profile.mean(axis=1).detach().numpy(), query_embeddings[0].mean(axis=1).detach().numpy()).item())
data_concat['cosine_similarity'] = similarities

In [13]:
data_concat.sort_values(by='cosine_similarity', ascending=False).head(7)

Unnamed: 0,data_concat,cosine_similarity
96,aspiring human resources professional kokomo i...,1.0
65,experienced retail manager and aspiring human ...,0.809298
72,aspiring human resources manager seeking inter...,0.794323
88,director human resources at ey greater atlant...,0.787946
80,senior human resources business partner at hei...,0.787779
20,aspiring human resources professional raleigh-...,0.780534
2,aspiring human resources professional raleigh-...,0.780534


#SBERT Model

###Preprocess

In [None]:
!pip install pandas sentence-transformers

In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [5]:
# loading SBERT model
sbert_model_name = "paraphrase-distilroberta-base-v1"
sbert_model = SentenceTransformer(sbert_model_name)

In [6]:
# joining all excisting columns
data_concat = pd.DataFrame()
data_concat["data_concat"] = data[["job_title", "location", "connection"]].apply(" ".join, axis=1)

data_concat.head()

Unnamed: 0,data_concat
0,2019 ct bauer college of business graduate mag...
1,native english teacher at epik english program...
2,aspiring human resources professional raleigh-...
3,people development coordinator at ryan denton ...
4,advisory board member at celal bayar universit...


In [7]:
# Preprocess query sentences
query_sentences_sbert = ["aspiring human resources", "seeking human resources"]
query_embeddings_sbert = sbert_model.encode(query_sentences_sbert)

# Preprocess profiles
profile_embeddings_sbert = sbert_model.encode(data_concat['data_concat'])

In [8]:
import torch

# Calculate cosine similarity for each query sentence separately
similarities_query_1 = util.pytorch_cos_sim(torch.tensor(query_embeddings_sbert[0]), torch.tensor(profile_embeddings_sbert))
similarities_query_2 = util.pytorch_cos_sim(torch.tensor(query_embeddings_sbert[1]), torch.tensor(profile_embeddings_sbert))

# Combine the cosine similarities (e.g., taking the average)
similarities_combined = (similarities_query_1 + similarities_query_2) / 2

similarities_combined = similarities_combined.squeeze()

# Assign the combined similarities to the DataFrame
data_concat['cosine_similarity_sbert'] = similarities_combined.cpu().numpy().tolist()

###Top 7 Profiles

In [9]:
# Ranking profiles based on cosine similarity score
##print(data_concat.nlargest(7, 'cosine_similarity_sbert'))
data_concat.sort_values(by='cosine_similarity_sbert', ascending=False).head(7)

Unnamed: 0,data_concat,cosine_similarity_sbert
96,aspiring human resources professional kokomo i...,0.55828
98,seeking human resources position las vegas nev...,0.519909
20,aspiring human resources professional raleigh-...,0.499703
2,aspiring human resources professional raleigh-...,0.499703
57,aspiring human resources professional raleigh-...,0.499703
16,aspiring human resources professional raleigh-...,0.499703
45,aspiring human resources professional raleigh-...,0.499703


###Reranking Based on Preference

In [10]:
# display available profiles
print("Available Profiles:")
print(data_concat.sort_values(by='cosine_similarity_sbert', ascending=False).head(7)['data_concat'], '\n')

# prompt user to select a profile
selected_index = input("Enter the index of the profile you want to select: ")

# convert the input to an integer
selected_index = int(selected_index)

# filter the DataFrame based on the selected ID
selected_profile_sbert = data_concat['data_concat'].iloc[selected_index]

# display the selected profile
print("\nSelected Profile:")
print(selected_profile_sbert)

Available Profiles:
96    aspiring human resources professional kokomo i...
98    seeking human resources position las vegas nev...
20    aspiring human resources professional raleigh-...
2     aspiring human resources professional raleigh-...
57    aspiring human resources professional raleigh-...
16    aspiring human resources professional raleigh-...
45    aspiring human resources professional raleigh-...
Name: data_concat, dtype: object 

Enter the index of the profile you want to select: 96

Selected Profile:
aspiring human resources professional kokomo indiana area 71


In [11]:
# Preprocess query sentences
query_sentences_sbert = [selected_profile_sbert]
query_embeddings_sbert = sbert_model.encode(query_sentences_sbert)

In [12]:
# Calculate cosine similarity for query sentence
similarities = util.pytorch_cos_sim(torch.tensor(query_embeddings_sbert[0]), torch.tensor(profile_embeddings_sbert))

similarities = similarities.squeeze()

# Assign the combined similarities to the DataFrame
data_concat['cosine_similarity_sbert'] = similarities.cpu().numpy().tolist()

In [13]:
# Ranking profiles based on cosine similarity score
data_concat.sort_values(by='cosine_similarity_sbert', ascending=False).head(7)

Unnamed: 0,data_concat,cosine_similarity_sbert
96,aspiring human resources professional kokomo i...,1.0
2,aspiring human resources professional raleigh-...,0.588413
57,aspiring human resources professional raleigh-...,0.588413
20,aspiring human resources professional raleigh-...,0.588413
16,aspiring human resources professional raleigh-...,0.588413
32,aspiring human resources professional raleigh-...,0.588413
45,aspiring human resources professional raleigh-...,0.588413
