# SBERT Model

## Installing and Importing Packages

In [None]:
!pip install pandas sentence-transformers

In [2]:
# Importing packages
from google.colab import files
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch

## Uploading Raw Data

In [3]:
# Function for loading the data
def load_data(file_path):
    """Load the raw profiles CSV into a DataFrame.

    If ``file_path`` already exists on disk it is read directly, so the
    notebook can be re-run without prompting for another upload. Otherwise
    the Colab upload widget is opened and the uploaded file is read.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Raises
    ------
    FileNotFoundError
        If, after the upload, ``file_path`` still does not exist and the
        upload did not contain exactly one file to fall back on.
    """
    import io
    import os

    # Fast path: nothing to upload when the file is already present.
    if os.path.exists(file_path):
        return pd.read_csv(file_path)

    # files.upload() returns a dict mapping uploaded file names to their bytes.
    uploaded = files.upload()

    if os.path.exists(file_path):
        return pd.read_csv(file_path)

    if len(uploaded) == 1:
        # The uploaded file was stored under a different name than file_path;
        # read its bytes directly rather than failing on the name mismatch.
        (content,) = uploaded.values()
        return pd.read_csv(io.BytesIO(content))

    raise FileNotFoundError(
        "Expected '{}' but the upload contained: {}".format(file_path, sorted(uploaded))
    )

## Preprocessing Raw Data

In [4]:
def preprocess_data(data):
  """Build one normalized text string per profile.

  The ``job_title``, ``location`` and ``connection`` columns are lowercased,
  stripped of the punctuation characters ``, . | ( ) !`` and joined with
  single spaces.

  The input frame is not modified. Missing values are treated as empty
  strings and non-string values are converted with ``str`` so that the
  concatenation cannot fail on NaN or numeric cells.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain the columns ``job_title``, ``location`` and ``connection``.

  Returns
  -------
  pandas.DataFrame
      A frame with the same index as ``data`` and a single column
      ``data_concat`` holding the joined text.
  """
  text_columns = ["job_title", "location", "connection"]

  # Selecting the columns yields a new frame, so the caller's data stays intact.
  text = data[text_columns].fillna("").astype(str)

  for column in text_columns:
    lowered = text[column].str.lower()
    # Raw string: the original non-raw pattern relied on invalid escape sequences.
    text[column] = lowered.str.replace(r'[,\.\|\()\!]', '', regex=True)

  # Vectorized concatenation; also well-defined for an empty frame.
  joined = text["job_title"] + " " + text["location"] + " " + text["connection"]

  data_concat = pd.DataFrame({"data_concat": joined})
  return data_concat

## Encoding Data and Query with Model

In [5]:
def encoding_data_and_query(model_name, query_sentences, profiles=None):
  """Load a sentence-embedding model and encode profiles and queries.

  Parameters
  ----------
  model_name : str
      Name passed to ``SentenceTransformer`` to load the model.
  query_sentences : list of str
      Query sentences to encode; passed to ``model.encode`` unchanged.
  profiles : iterable of str, optional
      Profile texts to encode. When omitted, the module-level
      ``data_concat['data_concat']`` column is used, which keeps existing
      two-argument calls working.

  Returns
  -------
  tuple
      ``(model, profile_embeddings, query_embeddings)`` where the embeddings
      are whatever ``model.encode`` returns for the respective inputs.
  """
  if profiles is None:
    # Backward-compatible fallback to the notebook-level frame. Prefer passing
    # `profiles` explicitly so the function does not depend on hidden state.
    profiles = data_concat['data_concat']

  model = SentenceTransformer(model_name)

  # Encoding profiles (as a plain list, independent of any pandas index)
  profile_embeddings = model.encode(list(profiles))

  # Encoding query sentences
  query_embeddings = model.encode(query_sentences)

  return model, profile_embeddings, query_embeddings

## Calculating Cosine Similarities

In [6]:
def initial_cosine_similarities(query_embeddings, profile_embeddings):
  """Average the cosine similarity of every query against every profile.

  Works for any number of query embeddings (the original handled exactly
  two and silently ignored any further ones). For two queries the result is
  the same average as before.

  Parameters
  ----------
  query_embeddings : array-like
      One embedding per query, stacked along the first axis. A single 1-D
      embedding is also accepted and treated as one query.
  profile_embeddings : array-like
      One embedding per profile, stacked along the first axis.

  Returns
  -------
  torch.Tensor
      The per-profile mean similarity with singleton dimensions squeezed
      out (one value per profile).

  Raises
  ------
  ValueError
      If ``query_embeddings`` contains no embeddings.
  """
  queries = torch.as_tensor(query_embeddings)
  profiles = torch.as_tensor(profile_embeddings)

  if queries.dim() == 1:
    # A lone embedding vector: treat it as a batch of one query.
    queries = queries.unsqueeze(0)
  if queries.shape[0] == 0:
    raise ValueError("query_embeddings must contain at least one embedding")

  # One row of similarities per query, one column per profile.
  similarities = util.pytorch_cos_sim(queries, profiles)

  # Combine the cosine similarities by averaging over the queries
  similarities_combined = similarities.mean(dim=0)

  similarities_combined = similarities_combined.squeeze()
  return similarities_combined

## Ranking and Displaying Profiles

In [10]:
# Function for ranking and displaying profiles
def rank_and_display_profiles(data_concat, column_name, top_n=7):
    """Print and return the highest-scoring profiles.

    Rows are ordered by ``column_name`` from largest to smallest and the
    first ``top_n`` of them are kept. The ``data_concat`` text of those rows
    is printed under a short header, and the kept rows are returned.
    """
    ordered = data_concat.sort_values(by=column_name, ascending=False)
    best = ordered.head(top_n)

    header = "Top {} Profiles:".format(top_n)
    print(header)
    print(best['data_concat'], '\n')

    return best

## Re-ranking Based on User Preference

In [11]:
# Function for prompting user to select a profile
def select_and_display_profile(data_concat):
    """Ask the user for a profile index and return that profile's text.

    The number entered is interpreted as an index *label* of ``data_concat``
    (the value printed next to each profile in the ranking), not as a row
    position. Invalid or unknown entries are reported and the prompt is
    repeated until a valid label is given.

    Parameters
    ----------
    data_concat : pandas.DataFrame
        Frame with a ``data_concat`` column holding the profile texts.

    Returns
    -------
    The ``data_concat`` value stored under the selected index label.
    """
    while True:
        # Prompt user to select a profile
        raw = input("Enter the index of the profile you want to select: ")
        try:
            selected_index = int(raw)
        except ValueError:
            print("'{}' is not a whole number; please try again.".format(raw))
            continue

        if selected_index in data_concat.index:
            break
        print("Index {} is not in the data; please try again.".format(selected_index))

    # Look up by label so the choice matches the index shown to the user
    selected_profile = data_concat.loc[selected_index, 'data_concat']

    # Display the selected profile
    print("\nSelected Profile:")
    print(selected_profile)
    return selected_profile

In [12]:
# Function for encoding new query
def encoding_new_query(model, selected_profile):
  """Encode one selected profile so it can serve as a new query.

  The profile is wrapped in a one-element list before being handed to
  ``model.encode``; the encoder's result for that list is returned as is.
  """
  return model.encode([selected_profile])

## Call Functions

In [13]:
# End-to-end run: load -> preprocess -> embed -> rank -> pick a profile -> re-rank.
# The statements below depend on each other in order and on the module-level
# names they create, so the cell is meant to be run top to bottom.

# Load data (load_data opens the Colab upload widget before reading the CSV)
file_path = "potential-talents - Aspiring human resources - seeking human resources.csv"
data = load_data(file_path)

# Preprocess data: one lowercased, punctuation-free text string per profile.
# NOTE: `data_concat` must exist at module level before the next call, because
# encoding_data_and_query reads it as a global.
data_concat = preprocess_data(data)

# Encode data and query with SBERT model (two query phrases)
model, profile_embeddings_sbert, query_embeddings_sbert = encoding_data_and_query('paraphrase-distilroberta-base-v1', ['aspiring human resources', 'seeking human resources'])

# Calculate cosine similarities (average over the two query phrases)
similarities_combined = initial_cosine_similarities(query_embeddings_sbert, profile_embeddings_sbert)

# Assign the combined similarities to the DataFrame as plain Python floats
data_concat['cosine_similarity_sbert'] = similarities_combined.cpu().numpy().tolist()

# Rank and display profiles based on cosine similarity scores
ranked_profiles = rank_and_display_profiles(data_concat, 'cosine_similarity_sbert')

# Select and display a profile (interactive: waits for user input)
selected_profile = select_and_display_profile(data_concat)

# Encode query sentence for the selected profile
new_query_embedding = encoding_new_query(model, selected_profile)

# Calculate cosine similarity for the new query sentence.
# new_query_embedding holds a single encoded sentence, hence the [0].
similarities = util.pytorch_cos_sim(torch.tensor(new_query_embedding[0]), torch.tensor(profile_embeddings_sbert))

# Drop the singleton query dimension so there is one score per profile
similarities = similarities.squeeze()

# Assign the new similarities to the DataFrame.
# This reuses the same column name, so the first-pass scores are overwritten.
data_concat['cosine_similarity_sbert'] = similarities.cpu().numpy().tolist()

# Rank and display profiles based on the new cosine similarity score.
# As the last expression of the cell, the returned frame is also rendered.
rank_and_display_profiles(data_concat, 'cosine_similarity_sbert')

Saving potential-talents - Aspiring human resources - seeking human resources.csv to potential-talents - Aspiring human resources - seeking human resources.csv
Top 7 Profiles:
96    aspiring human resources professional kokomo i...
98    seeking human resources position las vegas nev...
20    aspiring human resources professional raleigh-...
2     aspiring human resources professional raleigh-...
57    aspiring human resources professional raleigh-...
16    aspiring human resources professional raleigh-...
45    aspiring human resources professional raleigh-...
Name: data_concat, dtype: object 

Enter the index of the profile you want to select: 98

Selected Profile:
seeking human resources position las vegas nevada area 48
Top 7 Profiles:
98    seeking human resources position las vegas nev...
52    seeking human resources hris and generalist po...
39    seeking human resources hris and generalist po...
61    seeking human resources hris and generalist po...
9     seeking human resour

Unnamed: 0,data_concat,cosine_similarity_sbert
98,seeking human resources position las vegas nev...,1.0
52,seeking human resources hris and generalist po...,0.561073
39,seeking human resources hris and generalist po...,0.561073
61,seeking human resources hris and generalist po...,0.561073
9,seeking human resources hris and generalist po...,0.561073
29,seeking human resources opportunities chicago ...,0.539279
27,seeking human resources opportunities chicago ...,0.539279
