In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the candidate dataset (a Google Sheet exported as CSV).
url = 'https://docs.google.com/spreadsheets/d/117X6i53dKiO7w6kuA1g1TpdTlv1173h_dPlJt5cNNMU/export?format=csv'
df = pd.read_csv(url)

# Search phrases describing the kind of candidate being looked for.
keywords = ["aspiring human resources", "seeking human resources"]

# Merge the phrases into one query document so that every job title is
# compared against a single TF-IDF vector.
keyword_str = " ".join(keywords)

# Step 1: Use TF-IDF to vectorize both job titles and keywords.
# The vocabulary is fitted on the job titles only; the query is then projected
# into that same space with transform(). Missing titles are replaced with ''
# so the vectorizer only ever receives strings.
tfidf = TfidfVectorizer()
job_titles_tfidf = tfidf.fit_transform(df['job_title'].fillna(''))
keyword_tfidf = tfidf.transform([keyword_str])

# Step 2: Calculate similarity between each job title and the query.
# cosine_similarity returns one column (one query); flatten() turns it into a
# 1-D array with one score per row of df.
similarity_scores = cosine_similarity(job_titles_tfidf, keyword_tfidf).flatten()

# Step 3: Min-max normalize the similarity scores.
# When every title has the same similarity (e.g. no title shares a term with
# the query), max == min and the division would be 0/0, filling 'fit' with
# NaN. In that degenerate case keep the raw cosine scores instead.
score_min = similarity_scores.min()
score_range = similarity_scores.max() - score_min
if score_range > 0:
    scores = (similarity_scores - score_min) / score_range
else:
    scores = similarity_scores.copy()

# Step 4: Update the 'fit' column with the calculated fitness scores
df['fit'] = scores

# Step 5: Rank the candidates by fit score, best match first
df_ranked = df.sort_values(by='fit', ascending=False)

# Display the ranked dataset
print(df_ranked[['id', 'job_title', 'location', 'connection', 'fit']])

      id                                          job_title  \
72    73  Aspiring Human Resources Manager, seeking inte...   
45    46              Aspiring Human Resources Professional   
57    58              Aspiring Human Resources Professional   
16    17              Aspiring Human Resources Professional   
32    33              Aspiring Human Resources Professional   
..   ...                                                ...   
22    23    Advisory Board Member at Celal Bayar University   
21    22             People Development Coordinator at Ryan   
19    20  Native English Teacher at EPIK (English Progra...   
17    18             People Development Coordinator at Ryan   
103  104   Director Of Administration at Excellence Logging   

                                location connection       fit  
72                   Houston, Texas Area          7  1.000000  
45   Raleigh-Durham, North Carolina Area         44  0.936737  
57   Raleigh-Durham, North Carolina Area         44