<a href="https://colab.research.google.com/github/philadelphia24/Thesis-Job-Recommender-System-/blob/main/Content-Based%20Filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CBRS: Upload Libraries and files

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
!pip install num2words
from num2words import num2words
import re
from sklearn.metrics.pairwise import linear_kernel
from scipy.stats import kendalltau, spearmanr
import random
from nltk.corpus import wordnet
import nltk
nltk.download('wordnet')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from google.colab import files
uploaded = files.upload()

import io
jobseekers_df= pd.read_csv(io.BytesIO(uploaded['jobseekers.csv']))
vacancies_df= pd.read_csv(io.BytesIO(uploaded['vacancies_description.csv']))
matches_df= pd.read_csv(io.BytesIO(uploaded['matches.csv']))

Saving matches.csv to matches (2).csv
Saving jobseekers.csv to jobseekers (2).csv
Saving vacancies_description.csv to vacancies_description (2).csv


In [None]:
display(vacancies_df)

Unnamed: 0,VacancyID,Job Title,Language,Education Level,Industry,Experience,VacancyDescription
0,49442,Cashier,French,High School,Wholesale and retail,5-10,We are looking for an additional Cashier to st...
1,31482,Data Analyst,Dutch,Bachelor,Administration,0-1,We are looking for an additional Data Analyst ...
2,39069,Janitor,French,High School,Cleaning,10+,We are looking for an additional Janitor to st...
3,31129,Finance Manager,Dutch,Master,Cleaning,5-10,We are looking for an additional Finance Manag...
4,44258,Data Analyst,German,Master,Wholesale and retail,0-1,We are looking for an additional Data Analyst ...
...,...,...,...,...,...,...,...
495,48744,Data Analyst,Spanish,Bachelor,Financial intermediation,5-10,We are looking for an additional Data Analyst ...
496,49498,Finance Manager,German,Bachelor,Manufacturing,5-10,We are looking for an additional Finance Manag...
497,30794,Professor,German,PhD,Research,0-1,We are looking for an additional Professor to ...
498,34852,Janitor,English,High School,Cleaning,5-10,We are looking for an additional Janitor to st...


In [None]:
# Merge the matches_df with the vacancies_df and jobseekers_df
matches_vacancies_df = pd.merge(matches_df, vacancies_df, left_on='VacancyID', right_on='VacancyID')
matches_jobseekers_df = pd.merge(matches_df, jobseekers_df, left_on='JobseekerID', right_on='JobseekerID')

In [None]:
display(matches_vacancies_df)

Unnamed: 0,JobseekerID,VacancyID,Rating,Job Title,Language,Education Level,Industry,Experience,VacancyDescription
0,20473,30356,4,Finance Manager,Dutch,Master,Technology,10+,We are looking for an additional Finance Manag...
1,24472,30356,4,Finance Manager,Dutch,Master,Technology,10+,We are looking for an additional Finance Manag...
2,16658,30356,4,Finance Manager,Dutch,Master,Technology,10+,We are looking for an additional Finance Manag...
3,10182,30356,5,Finance Manager,Dutch,Master,Technology,10+,We are looking for an additional Finance Manag...
4,13789,30356,4,Finance Manager,Dutch,Master,Technology,10+,We are looking for an additional Finance Manag...
...,...,...,...,...,...,...,...,...,...
45312,10629,31068,4,Finance Manager,English,Bachelor,Financial intermediation,5-10,We are looking for an additional Finance Manag...
45313,10007,31068,4,Finance Manager,English,Bachelor,Financial intermediation,5-10,We are looking for an additional Finance Manag...
45314,12063,31068,1,Finance Manager,English,Bachelor,Financial intermediation,5-10,We are looking for an additional Finance Manag...
45315,13127,31068,1,Finance Manager,English,Bachelor,Financial intermediation,5-10,We are looking for an additional Finance Manag...


In [None]:
display(matches_jobseekers_df)

Unnamed: 0,JobseekerID,VacancyID,Rating,Name,Language,Education Level,Industry,Experience
0,20473,30356,4,Joren Van Campenhout,Dutch,PhD,Technology,2-4
1,22762,43396,1,Romy Derycke,Dutch,PhD,Hotels and restaurants,0-1
2,10466,34808,4,Gabriel Lammens,German,PhD,Administration,10+
3,10466,36153,2,Gabriel Lammens,German,PhD,Administration,10+
4,10466,38808,3,Gabriel Lammens,German,PhD,Administration,10+
...,...,...,...,...,...,...,...,...
45312,21446,40802,3,Sylvia Blommaert,French,High school,Cleaning,5-10
45313,21446,40283,5,Sylvia Blommaert,French,High school,Cleaning,5-10
45314,21446,42948,2,Sylvia Blommaert,French,High school,Cleaning,5-10
45315,21446,27350,2,Sylvia Blommaert,French,High school,Cleaning,5-10


## Clean Data Preprocessing

In [None]:
# Define a function to preprocess the text data
def preprocess_text(text):
    # Convert "High School" to "high_school"
    text = re.sub(r'\bHigh School\b', 'high_school', text) #added so that school is not extracted twice, once as upper, once lower case
    # Convert "High school" to "high_school"
    text = re.sub(r'\bHigh\s+school\b', 'high_school', text)
    # Replace integer ranges like "5-10" with "five_to_ten"
    text = re.sub(r'\b(\d+)-(\d+)\b', lambda match: f"{num2words(int(match.group(1)))}_to_{num2words(int(match.group(2)))}", text)
    # Replace "10+" with "ten_plus"
    text = re.sub(r'10\+', 'ten_plus', text)
    return text

### TF-IDF

In [None]:
# Create a TfidfVectorizer object with your preferred settings
tfidf = TfidfVectorizer(max_df=0.7, min_df=2, stop_words='english', preprocessor=preprocess_text)

# Use the vectorizer to transform your text data
vacancies_tfidf = tfidf.fit_transform(vacancies_df['VacancyDescription'])

In [None]:
# Extract the feature names from the fitted vectorizer object
feature_names = list(tfidf.vocabulary_.keys())

# Print the feature names that have been extracted from the text data
print(feature_names)

['Cashier', 'Wholesale', 'retail', 'French', 'five_to_ten', 'high_school', 'Data', 'Analyst', 'Administration', 'Dutch', 'zero_to_one', 'Bachelor', 'Janitor', 'Cleaning', 'ten_plus', 'Finance', 'Manager', 'Master', 'German', 'two_to_four', 'Manufacturing', 'Spanish', 'Research', 'Logistics', 'Cleaner', 'Administrative', 'Clerk', 'Truck', 'Driver', 'Transport', 'Professor', 'PhD', 'Education', 'Pharmaceutical', 'English', 'Agriculture', 'Technology', 'Hotels', 'restaurants', 'Construction', 'Financial', 'intermediation', 'Health', 'social', 'services', 'Energy']


In [None]:
vacancies_tfidf.shape

(500, 46)

*  For the 500 job vacancies there are 46 unique terms across all job vacancies. 

*  These dimensions are useful to understand the size of the tf-idf matrices that are being used to compute the cosine similarity between the job vacancies and job seekers. The larger the number of documents and terms, the more computationally expensive it can be to compute the cosine similarity.

=> 8 (job titles) + 5 (languages) + 4(education levels) + 16 (industries) + 4 (years of experience) =37

46, because 
1. Wholesale, retail +1
2. Data, Analyst +1
3. Finance, Manager +1
4. Administrative, Clerk +1
5. Truck, Driver +1
6. Hotels, restaurants +1
7. Financial, intermediation +1
8. Health, social, services +2



####Top-matched vacancy 


Before recommending, the similarity matrix has to be computed.

In [None]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(vacancies_tfidf, vacancies_tfidf)

# Since you have used the TF-IDF vectorizer, calculating the dot product between each vector will directly give you the cosine similarity score. 
# Therefore, you will use sklearn's linear_kernel() instead of cosine_similarities() since it is faster.
# Each vacancy will be a 1x500 column vector where each column will be a similarity score with each vacancy.

In [None]:
cosine_sim.shape

(500, 500)

In [None]:
cosine_sim[1]

# cosine_sim[1] gives you an array containing the cosine similarity scores between the first vacancy and all other vacancies in the vacancies_df. 
# The length of the array will be the same as the number of rows in vacancies_df.

array([0.        , 1.        , 0.        , 0.16138958, 0.47102728,
       0.        , 0.        , 0.45574472, 0.12491067, 0.        ,
       0.        , 0.30244206, 0.16019249, 0.57764401, 0.        ,
       0.61510243, 0.        , 0.        , 0.        , 0.        ,
       0.85784152, 0.17114662, 0.        , 0.        , 0.14446734,
       0.        , 0.59704211, 0.33163754, 0.        , 0.15317963,
       0.        , 0.17352448, 0.        , 0.15317963, 0.46861712,
       0.14841769, 0.        , 0.        , 0.        , 0.        ,
       0.14825406, 0.32390423, 0.        , 0.        , 0.        ,
       0.59690621, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.35664681, 0.        , 0.15259666, 0.30505161,
       0.        , 0.46573486, 0.        , 0.14463733, 0.        ,
       0.46669897, 0.        , 0.12281211, 0.        , 0.34152662,
       0.        , 0.16174791, 0.        , 0.18634159, 0.17352448,
       0.        , 0.        , 0.28032171, 0.        , 0.46225

Recommends the top 10 most similar vacancies to the top-rated vacancy that the jobseeker has already rated

In [None]:
#Prompt the user to input the jobseeker ID
jobseeker_id = int(input("Enter the jobseeker ID: "))
       
# Retrieve the VacancyID of the job vacancies matched with the given JobseekerID
matches_for_jobseeker = matches_df[matches_df['JobseekerID'] == jobseeker_id]

# Sort the matches by rating in descending order
sorted_matches = matches_for_jobseeker.sort_values(by='Rating', ascending=False)

# Select the VacancyID of the top match for the jobseeker
top_match_vacancy_id = sorted_matches.iloc[0]['VacancyID']
print(f"The top match for jobseeker {jobseeker_id} is Vacancy {top_match_vacancy_id}")

# Find the row with Jobseeker and print the JobTitle and Industry columns
print(f'The characteristics of Jobseeker {jobseeker_id} are:')
row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_j)

# Find the row with the highest rated VacancyID and print the JobTitle and Industry columns
print(f'The characteristics of the highest rated Vacancy {top_match_vacancy_id} are:')
row_v = vacancies_df.loc[vacancies_df['VacancyID'] == top_match_vacancy_id, ['VacancyID','Job Title', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_v)

# Check if there are any matches for the jobseeker
if not sorted_matches.empty:
    # Get the VacancyIDs that the jobseeker has already rated
    rated_vacancy_ids = sorted_matches['VacancyID'].values

    # Define the modified get_recommendations function
    def get_recommendations(VacancyID, cosine_sim=cosine_sim, min_similarity=0.4): #ADDED: min_similarity=...

        # Construct a reverse map of indices and VacancyIDs
        indices = pd.Series(vacancies_df.index, index=vacancies_df['VacancyID']).drop_duplicates()
            
        # Get the index of the movie that matches the VacancyID (the highest rated one)
        idx = indices[VacancyID]

        # Get the pairwise similarity scores of all vacancies with that vacancy
        sim_scores = list(enumerate(cosine_sim[idx]))
        # ADDED: Filter the similarity scores by the minimum threshold
        sim_scores = [score for score in sim_scores if score[1] >= min_similarity]
       
        # Sort the vacancies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores and indices of the 10 most similar vacancies
        sim_scores = sim_scores[1:11]
        vacancy_indices = [i[0] for i in sim_scores] #NEW
        cosine_sim_scores = [i[1] for i in sim_scores] #NE
      
        # Get the details of the most similar vacancies, including the cosine similarity scores
        vacancies = pd.DataFrame(list(zip(vacancy_indices, cosine_sim_scores)), columns=['index', 'cosine_similarity_score'])
        vacancies = vacancies.merge(vacancies_df[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level']], how='left', left_on='index', right_index=True)

        # Return the top 10 most similar vacancies
        return vacancies[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level', 'cosine_similarity_score']]

    # Get the recommendations for the jobseeker
    recommendations = get_recommendations(top_match_vacancy_id, cosine_sim=cosine_sim, min_similarity=0.4)
  

    print(f"Recommended vacancies based on Vacancy {top_match_vacancy_id} are:")
    display(recommendations)
else:
    print(f"No matches found for jobseeker {jobseeker_id}")

Enter the jobseeker ID: 10466
The top match for jobseeker 10466 is Vacancy 46131
The characteristics of Jobseeker 10466 are:


Unnamed: 0,JobseekerID,Industry,Language,Experience,Education Level
2,10466,Administration,German,10+,PhD


The characteristics of the highest rated Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level
225,46131,Administrative Clerk,Administration,German,10+,Bachelor


Recommended vacancies based on Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level,cosine_similarity_score
0,46131,Administrative Clerk,Administration,German,10+,Bachelor,1.0
1,29753,Administrative Clerk,Administration,German,10+,Bachelor,1.0
2,25023,Administrative Clerk,Administration,German,10+,High school,0.895055
3,34743,Administrative Clerk,Administration,German,10+,High school,0.895055
4,38808,Administrative Clerk,Administration,German,0-1,Bachelor,0.878606
5,31567,Administrative Clerk,Administration,German,0-1,Bachelor,0.878606
6,48966,Administrative Clerk,Administration,French,10+,Bachelor,0.851457
7,31106,Administrative Clerk,Administration,French,10+,Bachelor,0.851457
8,38375,Administrative Clerk,Administration,English,10+,Bachelor,0.850547
9,44216,Administrative Clerk,Administration,English,10+,Bachelor,0.850547


###Evaluation

> 






Kendall_corr and spearman_corr

In [None]:
#For particular jobseeker

jobseeker_id = 10466
from scipy.stats import kendalltau, spearmanr

# Get the actual vacancies that the jobseeker applied for
actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
actual_vacancies = actual_vacancies[indices]  
print("actual_vacancies", actual_vacancies)

# Get the recommended vacancies
recommended_vacancies = recommendations['VacancyID'].values
print("recommended_vacancies", recommended_vacancies)

# Find the common vacancies in actual and recommended vacancies
common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
print("common", common_vacancies)

# Keep only the common vacancies in actual and recommended vacancies
actual_vacancies2 = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
recommended_vacancies2 = recommended_vacancies[np.isin(recommended_vacancies, common_vacancies)]
print("actual_vacancies2", actual_vacancies2)
print("recommended_vacancies2", recommended_vacancies2)

# Calculate Kendall rank correlation
kendall_corr, kendall_pvalue = kendalltau(actual_vacancies2, recommended_vacancies2)
kendall_corr = round(kendall_corr, 4)
kendall_pvalue = round(kendall_pvalue, 4)
print(f"Kendall rank correlation: {kendall_corr}, p-value: {kendall_pvalue}")

# Calculate Spearman rank correlation
spearman_corr, spearman_pvalue = spearmanr(actual_vacancies2, recommended_vacancies2)
spearman_corr = round(spearman_corr, 4)
spearman_pvalue = round(spearman_pvalue, 4)
print(f"Spearman rank correlation: {spearman_corr}, p-value: {spearman_pvalue}")

actual_vacancies [46131 34808 33811 25023 34743 38808 36153 45516 38183 31567 35983 28764]
recommended_vacancies [46131 29753 25023 34743 38808 31567 48966 31106 38375 44216]
common [25023 31567 34743 38808 46131]
actual_vacancies2 [46131 25023 34743 38808 31567]
recommended_vacancies2 [46131 25023 34743 38808 31567]
Kendall rank correlation: 1.0, p-value: 0.0167
Spearman rank correlation: 1.0, p-value: 0.0


In [None]:
#For whole model
# Create empty lists to store the correlations and p-values
kendall_corrs = []
kendall_pvalues = []
spearman_corrs = []
spearman_pvalues = []

# Iterate over each unique jobseeker ID
for jobseeker_id in matches_df['JobseekerID'].unique():
    
    # Get the actual vacancies that the jobseeker applied for
    actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
    indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])

    # Select the VacancyID of the top match for the jobseeker
    top_match_vacancy_id= matches_df.loc[matches_df['JobseekerID'] == jobseeker_id, 'VacancyID'].max()
    
    # Get the recommended vacancies based on the top-vacancy for each jobseeker
    recommended_vacancies = get_recommendations(top_match_vacancy_id, cosine_sim=cosine_sim, min_similarity=0.4)['VacancyID'].values

    # Find the common vacancies in actual and recommended vacancies
    common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
    
     # Only calculate correlations if there are at least 2 common vacancies //explanation on this in doc notes inside model
    if len(common_vacancies) >= 2:

        # Keep only the common vacancies in actual and recommended vacancies
        actual_vacancies = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
        recommended_vacancies = recommended_vacancies[np.isin(recommended_vacancies, common_vacancies)]

        # Calculate Kendall rank correlation
        kendall_corr, kendall_pvalue = kendalltau(actual_vacancies, recommended_vacancies)
        kendall_corrs.append(kendall_corr)
        kendall_pvalues.append(kendall_pvalue)

        # Calculate Spearman rank correlation
        spearman_corr, spearman_pvalue = spearmanr(actual_vacancies, recommended_vacancies)
        spearman_corrs.append(spearman_corr)
        spearman_pvalues.append(spearman_pvalue)


# Print the average correlations and p-values
print(f"Average Kendall rank correlation: {np.mean(kendall_corrs)}, p-value: {np.mean(kendall_pvalues)}")
print(f"Average Spearman rank correlation: {np.mean(spearman_corrs)}, p-value: {np.mean(spearman_pvalues)}")

Average Kendall rank correlation: 0.10762236601957856, p-value: 0.8037760863152849
Average Spearman rank correlation: 0.09473203915712626, p-value: nan


###User Profile




In [None]:
def get_top_n_vacancies(jobseeker_id, n=10):
    # Retrieve the VacancyID of the job vacancies matched with the given JobseekerID
    matches_for_jobseeker = matches_df[matches_df['JobseekerID'] == jobseeker_id]
    matched_vacancies = matches_for_jobseeker['VacancyID'].tolist()

    # Find the indices of the matched vacancies in the vacancies_df
    matched_indices = [vacancies_df[vacancies_df['VacancyID'] == vid].index[0] for vid in matched_vacancies]

    # Extract the vectorized data for the matched vacancies
    matched_vectors = vacancies_tfidf[matched_indices]

    # Compute the average values in the vectorized data for each feature.
    user_profile = np.asarray(np.mean(matched_vectors, axis=0))

    # Compute the cosine similarity between the user profile and the vacancies
    similarity_scores = cosine_similarity(user_profile.reshape(1, -1), vacancies_tfidf)

    # Sort the similarity scores from high to low
    sorted_scores_indices = np.argsort(similarity_scores)[0][::-1]

    # Find the row with Jobseeker and print the JobTitle and Industry columns
    row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]

    # Get the top N recommended vacancies
    top_matches = sorted_scores_indices[:n]

    # Create a dataframe of the recommended vacancies
    recommendations = pd.DataFrame({'VacancyID': [vacancies_df.iloc[i]['VacancyID'] for i in top_matches],
                                     'Job Title': [vacancies_df.iloc[i]['Job Title'] for i in top_matches],
                                     'Industry': [vacancies_df.iloc[i]['Industry'] for i in top_matches],
                                     'Language': [vacancies_df.iloc[i]['Language'] for i in top_matches],
                                     'Experience': [vacancies_df.iloc[i]['Experience'] for i in top_matches],
                                     'Education Level': [vacancies_df.iloc[i]['Education Level'] for i in top_matches],
                                     'cosine_similarity_score': [similarity_scores[0][i] for i in top_matches]})
    recommendations = recommendations[['VacancyID', 'Job Title', 'Industry', 'Language', 'Experience', 'Education Level', 'cosine_similarity_score']]

    return recommendations

# Prompt the user to input the jobseeker ID
jobseeker_id = int(input("Enter the jobseeker ID: "))

print(f'The characteristics of Jobseeker {jobseeker_id} are:')
row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_j)

# Print the top 10 most similar vacancies
print(f"Top 10 job matches for jobseeker with ID {jobseeker_id}:")
recommended_vacancies = get_top_n_vacancies(jobseeker_id)
print(recommended_vacancies)
#display(recommendations)

Enter the jobseeker ID: 10466
The characteristics of Jobseeker 10466 are:


Unnamed: 0,JobseekerID,Industry,Language,Experience,Education Level
2,10466,Administration,German,10+,PhD


Top 10 job matches for jobseeker with ID 10466:
   VacancyID             Job Title        Industry Language Experience  \
0      26181  Administrative Clerk  Administration   German        0-1   
1      34808  Administrative Clerk  Administration   German        0-1   
2      45516  Administrative Clerk  Administration   German        0-1   
3      28764  Administrative Clerk  Administration   German        0-1   
4      25023  Administrative Clerk  Administration   German        10+   
5      34743  Administrative Clerk  Administration   German        10+   
6      38808  Administrative Clerk  Administration   German        0-1   
7      31567  Administrative Clerk  Administration   German        0-1   
8      35983  Administrative Clerk  Administration   German       5-10   
9      33811  Administrative Clerk  Administration   German        10+   

  Education Level  cosine_similarity_score  
0     High school                 0.949353  
1     High school                 0.949353  
2 

###Evaluation

Kendall_corr and spearman_corr

In [None]:
#one particular jobseeker
jobseeker_id = 10466
from scipy.stats import kendalltau, spearmanr

# Get the actual vacancies that the jobseeker applied for
actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
actual_vacancies = actual_vacancies[indices]  
print("actual_vacancies", actual_vacancies)

# Get the recommended vacancies
recommended_vacancies = recommendations['VacancyID'].values.tolist()
print("recommended_vacancies", recommended_vacancies)

# Find the common vacancies in actual and recommended vacancies
common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
print("common", common_vacancies)

# Keep only the common vacancies in actual and recommended vacancies
actual_vacancies2 = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
print("actual_vacancies2", actual_vacancies2)
recommended_vacancies2 = [v for v in recommended_vacancies if v in common_vacancies]
print("recommended_vacancies2", recommended_vacancies2)

# Calculate Kendall rank correlation
kendall_corr, kendall_pvalue = kendalltau(actual_vacancies2, recommended_vacancies2)
kendall_corr = round(kendall_corr, 4)
kendall_pvalue = round(kendall_pvalue, 4)
print(f"Kendall rank correlation: {kendall_corr}, p-value: {kendall_pvalue}")

# Calculate Spearman rank correlation
spearman_corr, spearman_pvalue = spearmanr(actual_vacancies2, recommended_vacancies2)
spearman_corr = round(spearman_corr, 4)
spearman_pvalue = round(spearman_pvalue, 4)
print(f"Spearman rank correlation: {spearman_corr}, p-value: {spearman_pvalue}")

actual_vacancies [46131 34808 33811 25023 34743 38808 36153 45516 38183 31567 35983 28764]
recommended_vacancies [26181, 34808, 45516, 28764, 25023, 34743, 38808, 31567, 35983, 33811]
common [25023 28764 31567 33811 34743 34808 35983 38808 45516]
actual_vacancies2 [34808 33811 25023 34743 38808 45516 31567 35983 28764]
recommended_vacancies2 [34808, 45516, 28764, 25023, 34743, 38808, 31567, 35983, 33811]
Kendall rank correlation: 0.3889, p-value: 0.1802
Spearman rank correlation: 0.5333, p-value: 0.1392


In [None]:
#whole model
#Correct
# Create empty lists to store the correlations and p-values
kendall_corrs = []
kendall_pvalues = []
spearman_corrs = []
spearman_pvalues = []

# Iterate over each unique jobseeker ID
for jobseeker_id in matches_df['JobseekerID'].unique():
    
    # Get the actual vacancies that the jobseeker applied for
    actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
    indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
    actual_vacancies = actual_vacancies[indices][1:]  # remove the highest rated vacancyID

    # Get the recommended vacancies
    recommended_vacancies = get_top_n_vacancies(jobseeker_id)['VacancyID'].values.tolist() #list of the vacancyID of the recommended vacancies for a particular jobseeker

    # Find the common vacancies in actual and recommended vacancies
    common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
    
     # Only calculate correlations if there are at least 2 common vacancies //explanation on this in doc notes inside model
    if len(common_vacancies) >= 2:

        # Keep only the common vacancies in actual and recommended vacancies
        actual_vacancies = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
        recommended_vacancies = [v for v in recommended_vacancies if v in common_vacancies]

        # Calculate Kendall rank correlation
        kendall_corr, kendall_pvalue = kendalltau(actual_vacancies, recommended_vacancies)
        kendall_corrs.append(kendall_corr)
        kendall_pvalues.append(kendall_pvalue)

        # Calculate Spearman rank correlation
        spearman_corr, spearman_pvalue = spearmanr(actual_vacancies, recommended_vacancies)
        spearman_corrs.append(spearman_corr)
        spearman_pvalues.append(spearman_pvalue)


# Print the average correlations and p-values
print(f"Average Kendall rank correlation: {np.mean(kendall_corrs)}, p-value: {np.mean(kendall_pvalues)}")
print(f"Average Spearman rank correlation: {np.mean(spearman_corrs)}, p-value: {np.mean(spearman_pvalues)}")

Average Kendall rank correlation: 0.05116144018583044, p-value: 0.7059941852202287
Average Spearman rank correlation: 0.05761639888006582, p-value: nan


##NOISE 10%

This preprocess_textnoise function is similar to the previous preprocess_text function we discussed, but with the addition of adding noise to the text data by replacing some words with synonyms or similar words. The function uses the Natural Language Toolkit (NLTK) library to access WordNet, which is a large lexical database of English words

In [None]:

job_title = ['Truck Driver', 'Janitor', 'Cleaner', 'Administrative Clerk', 'Cashier', 'Data Analyst', 'Finance Manager', 'Professor']
education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
industry = ['Transport', 'Logistics', 'Cleaning', 'Hotels and restaurants', 'Construction', 'Manufacturing', 'Financial intermediation', 'Pharmaceutical', 'Health and social services', 'Wholesale and retail', 'Technology', 'Research', 'Administration', 'Education', 'Energy', 'Agriculture']

# Define a function to preprocess the text data
random.seed(42)
def preprocess_textnoise(text, noise_level=0.1):
    # Remove "High School" to "high_school"
    # Replace integer ranges like "5-10" with "five_to_ten"
    text = re.sub(r'\b(\d+)-(\d+)\b', lambda match: f"{num2words(int(match.group(1)))}_to_{num2words(int(match.group(2)))}", text)
    # Replace "10+" with "ten_plus"
    text = re.sub(r'10\+', 'ten_plus', text)
    # Split the text into words
    words = text.split()
    # Replace some words with synonyms 
    for i, word in enumerate(words):
        # Randomly replace some job titles, education levels, or industries with a synonym
        if random.random() < noise_level and word in job_title + education_levels + industry:
            synsets = wordnet.synsets(word)
            if synsets:
                synonyms = synsets[0].lemma_names()
                if synonyms:
                    words[i] = random.choice(synonyms)
    # Join the words back into a string
    text_with_noise = ' '.join(words)
    return text_with_noise

In [None]:

for text in vacancies_df['VacancyDescription']:
    preprocessed_textnoise = preprocess_textnoise(text)
    print(preprocessed_textnoise[:150])

We are looking for an additional Cashier to strengthen our team in the Wholesale and retail industry with knowledge of French and five_to_ten years of
We are looking for an additional Data Analyst to strengthen our team in the Administration industry with knowledge of Dutch and zero_to_one years of e
We are looking for an additional Janitor to strengthen our team in the Cleaning industry with knowledge of French and ten_plus years of experience, pr
We are looking for an additional Finance Manager to strengthen our team in the Cleaning industry with knowledge of Dutch and five_to_ten years of expe
We are looking for an additional Data Analyst to strengthen our team in the Wholesale and retail industry with knowledge of German and zero_to_one yea
We are looking for an additional janitor to strengthen our team in the Cleaning industry with knowledge of German and two_to_four years of experience,
We are looking for an additional Cashier to strengthen our team in the Wholesale and retail in

### TF-IDF

In [None]:
# Create a TfidfVectorizer object with your preferred settings
tfidf = TfidfVectorizer(max_df=0.7, min_df=2, stop_words='english', preprocessor=preprocess_textnoise)

# Use the vectorizer to transform your text data
vacancies_tfidf = tfidf.fit_transform(vacancies_df['VacancyDescription'])

In [None]:
# Extract the feature names from the fitted vectorizer object
feature_names = list(tfidf.vocabulary_.keys())

# Print the feature names that have been extracted from the text data
print(feature_names)

['Cashier', 'Wholesale', 'retail', 'French', 'five_to_ten', 'High', 'School', 'Data', 'Analyst', 'Administration', 'Dutch', 'zero_to_one', 'Bachelor', 'Janitor', 'Cleaning', 'ten_plus', 'Finance', 'Manager', 'Master', 'German', 'cleanup', 'two_to_four', 'Manufacturing', 'Spanish', 'maestro', 'teller', 'Research', 'logistics', 'Cleaner', 'Administrative', 'Clerk', 'Truck', 'Driver', 'Transport', 'cleaner', 'Professor', 'research', 'PhD', 'Education', 'Pharmaceutical', 'English', 'prof', 'Agriculture', 'Technology', 'school', 'Hotels', 'restaurants', 'Construction', 'unmarried_man', 'Financial', 'intermediation', 'Health', 'social', 'services', 'professor', 'janitor', 'bachelor', 'Energy', 'cleanser', 'cleansing', 'master', 'cleaning', 'Logistics', 'transport', 'educational_activity']


In [None]:
vacancies_tfidf.shape

(500, 65)

####Top-matched vacancy 


Before recommending, the similarity matrix has to be computed.

In [None]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(vacancies_tfidf, vacancies_tfidf)

# Since you have used the TF-IDF vectorizer, calculating the dot product between each vector will directly give you the cosine similarity score. 
# Therefore, you will use sklearn's linear_kernel() instead of cosine_similarities() since it is faster.
# Each vacancy will be a 1x500 column vector where each column will be a similarity score with each vacancy.

Recommends the top 10 most similar vacancies to the top-rated vacancy that the jobseeker has already rated

In [None]:
#Prompt the user to input the jobseeker ID
jobseeker_id = int(input("Enter the jobseeker ID: "))
       
# Retrieve the VacancyID of the job vacancies matched with the given JobseekerID
matches_for_jobseeker = matches_df[matches_df['JobseekerID'] == jobseeker_id]

# Sort the matches by rating in descending order
sorted_matches = matches_for_jobseeker.sort_values(by='Rating', ascending=False)

# Select the VacancyID of the top match for the jobseeker
top_match_vacancy_id = sorted_matches.iloc[0]['VacancyID']
print(f"The top match for jobseeker {jobseeker_id} is Vacancy {top_match_vacancy_id}")

# Find the row with Jobseeker and print the JobTitle and Industry columns
print(f'The characteristics of Jobseeker {jobseeker_id} are:')
row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_j)

# Find the row with the highest rated VacancyID and print the JobTitle and Industry columns
print(f'The characteristics of the highest rated Vacancy {top_match_vacancy_id} are:')
row_v = vacancies_df.loc[vacancies_df['VacancyID'] == top_match_vacancy_id, ['VacancyID','Job Title', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_v)

# Check if there are any matches for the jobseeker
if not sorted_matches.empty:
    # Get the VacancyIDs that the jobseeker has already rated
    rated_vacancy_ids = sorted_matches['VacancyID'].values

    # Define the modified get_recommendations function
    def get_recommendations(VacancyID, cosine_sim=cosine_sim, min_similarity=0.4): #ADDED: min_similarity=...

        # Construct a reverse map of indices and VacancyIDs
        indices = pd.Series(vacancies_df.index, index=vacancies_df['VacancyID']).drop_duplicates()
            
        # Get the index of the movie that matches the VacancyID (the highest rated one)
        idx = indices[VacancyID]

        # Get the pairwise similarity scores of all vacancies with that vacancy
        sim_scores = list(enumerate(cosine_sim[idx]))
        # ADDED: Filter the similarity scores by the minimum threshold
        sim_scores = [score for score in sim_scores if score[1] >= min_similarity]
       
        # Sort the vacancies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores and indices of the 10 most similar vacancies
        sim_scores = sim_scores[1:11]
        vacancy_indices = [i[0] for i in sim_scores] #NEW
        cosine_sim_scores = [i[1] for i in sim_scores] #NE
      
        # Get the details of the most similar vacancies, including the cosine similarity scores
        vacancies = pd.DataFrame(list(zip(vacancy_indices, cosine_sim_scores)), columns=['index', 'cosine_similarity_score'])
        vacancies = vacancies.merge(vacancies_df[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level']], how='left', left_on='index', right_index=True)

        # Return the top 10 most similar vacancies
        return vacancies[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level', 'cosine_similarity_score']]

    # Get the recommendations for the jobseeker
    recommendations = get_recommendations(top_match_vacancy_id, cosine_sim=cosine_sim, min_similarity=0.4)
  

    print(f"Recommended vacancies based on Vacancy {top_match_vacancy_id} are:")
    display(recommendations)
else:
    print(f"No matches found for jobseeker {jobseeker_id}")

Enter the jobseeker ID: 10466
The top match for jobseeker 10466 is Vacancy 46131
The characteristics of Jobseeker 10466 are:


Unnamed: 0,JobseekerID,Industry,Language,Experience,Education Level
2,10466,Administration,German,10+,PhD


The characteristics of the highest rated Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level
225,46131,Administrative Clerk,Administration,German,10+,Bachelor


Recommended vacancies based on Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level,cosine_similarity_score
0,46131,Administrative Clerk,Administration,German,10+,Bachelor,1.0
1,29753,Administrative Clerk,Administration,German,10+,Bachelor,1.0
2,38808,Administrative Clerk,Administration,German,0-1,Bachelor,0.880752
3,31567,Administrative Clerk,Administration,German,0-1,Bachelor,0.880752
4,48966,Administrative Clerk,Administration,French,10+,Bachelor,0.854161
5,31106,Administrative Clerk,Administration,French,10+,Bachelor,0.854161
6,38375,Administrative Clerk,Administration,English,10+,Bachelor,0.853265
7,44216,Administrative Clerk,Administration,English,10+,Bachelor,0.853265
8,46996,Administrative Clerk,Administration,English,10+,Bachelor,0.853265
9,30961,Administrative Clerk,Administration,Spanish,10+,Bachelor,0.852348


###Evaluation

> 






Kendall_corr and spearman_corr

In [None]:
#For particular jobseeker

jobseeker_id = 10466
from scipy.stats import kendalltau, spearmanr

# Get the actual vacancies that the jobseeker applied for
actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
actual_vacancies = actual_vacancies[indices]  
print("actual_vacancies", actual_vacancies)

# Get the recommended vacancies
recommended_vacancies = recommendations['VacancyID'].values
print("recommended_vacancies", recommended_vacancies)

# Find the common vacancies in actual and recommended vacancies
common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
print("common", common_vacancies)

# Keep only the common vacancies in actual and recommended vacancies
actual_vacancies2 = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
recommended_vacancies2 = recommended_vacancies[np.isin(recommended_vacancies, common_vacancies)]
print("actual_vacancies2", actual_vacancies2)
print("recommended_vacancies2", recommended_vacancies2)

# Calculate Kendall rank correlation
kendall_corr, kendall_pvalue = kendalltau(actual_vacancies2, recommended_vacancies2)
kendall_corr = round(kendall_corr, 4)
kendall_pvalue = round(kendall_pvalue, 4)
print(f"Kendall rank correlation: {kendall_corr}, p-value: {kendall_pvalue}")

# Calculate Spearman rank correlation
spearman_corr, spearman_pvalue = spearmanr(actual_vacancies2, recommended_vacancies2)
spearman_corr = round(spearman_corr, 4)
spearman_pvalue = round(spearman_pvalue, 4)
print(f"Spearman rank correlation: {spearman_corr}, p-value: {spearman_pvalue}")

actual_vacancies [46131 34808 33811 25023 34743 38808 36153 45516 38183 31567 35983 28764]
recommended_vacancies [46131 29753 38808 31567 48966 31106 38375 44216 46996 30961]
common [31567 38808 46131]
actual_vacancies2 [46131 38808 31567]
recommended_vacancies2 [46131 38808 31567]
Kendall rank correlation: 1.0, p-value: 0.3333
Spearman rank correlation: 1.0, p-value: 0.0


In [None]:
#For whole model
# Create empty lists to store the correlations and p-values
kendall_corrs = []
kendall_pvalues = []
spearman_corrs = []
spearman_pvalues = []

# Iterate over each unique jobseeker ID
for jobseeker_id in matches_df['JobseekerID'].unique():
    
    # Get the actual vacancies that the jobseeker applied for
    actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
    indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])

    # Select the VacancyID of the top match for the jobseeker
    top_match_vacancy_id= matches_df.loc[matches_df['JobseekerID'] == jobseeker_id, 'VacancyID'].max()
    
    # Get the recommended vacancies based on the top-vacancy for each jobseeker
    recommended_vacancies = get_recommendations(top_match_vacancy_id, cosine_sim=cosine_sim, min_similarity=0.4)['VacancyID'].values

    # Find the common vacancies in actual and recommended vacancies
    common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
    
     # Only calculate correlations if there are at least 2 common vacancies //explanation on this in doc notes inside model
    if len(common_vacancies) >= 2:

        # Keep only the common vacancies in actual and recommended vacancies
        actual_vacancies = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
        recommended_vacancies = recommended_vacancies[np.isin(recommended_vacancies, common_vacancies)]

        # Calculate Kendall rank correlation
        kendall_corr, kendall_pvalue = kendalltau(actual_vacancies, recommended_vacancies)
        kendall_corrs.append(kendall_corr)
        kendall_pvalues.append(kendall_pvalue)

        # Calculate Spearman rank correlation
        spearman_corr, spearman_pvalue = spearmanr(actual_vacancies, recommended_vacancies)
        spearman_corrs.append(spearman_corr)
        spearman_pvalues.append(spearman_pvalue)


# Print the average correlations and p-values
print(f"Average Kendall rank correlation: {np.mean(kendall_corrs)}, p-value: {np.mean(kendall_pvalues)}")
print(f"Average Spearman rank correlation: {np.mean(spearman_corrs)}, p-value: {np.mean(spearman_pvalues)}")

Average Kendall rank correlation: -0.09813662859220373, p-value: 0.854082599155854
Average Spearman rank correlation: -0.11709695204535375, p-value: nan


###User Profile




In [None]:
def get_top_n_vacancies(jobseeker_id, n=10):
    # Retrieve the VacancyID of the job vacancies matched with the given JobseekerID
    matches_for_jobseeker = matches_df[matches_df['JobseekerID'] == jobseeker_id]
    matched_vacancies = matches_for_jobseeker['VacancyID'].tolist()

    # Find the indices of the matched vacancies in the vacancies_df
    matched_indices = [vacancies_df[vacancies_df['VacancyID'] == vid].index[0] for vid in matched_vacancies]

    # Extract the vectorized data for the matched vacancies
    matched_vectors = vacancies_tfidf[matched_indices]

    # Compute the average values in the vectorized data for each feature.
    user_profile = np.asarray(np.mean(matched_vectors, axis=0))

    # Compute the cosine similarity between the user profile and the vacancies
    similarity_scores = cosine_similarity(user_profile.reshape(1, -1), vacancies_tfidf)

    # Sort the similarity scores from high to low
    sorted_scores_indices = np.argsort(similarity_scores)[0][::-1]

    # Find the row with Jobseeker and print the JobTitle and Industry columns
    row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]

    # Get the top N recommended vacancies
    top_matches = sorted_scores_indices[:n]

    # Create a dataframe of the recommended vacancies
    recommendations = pd.DataFrame({'VacancyID': [vacancies_df.iloc[i]['VacancyID'] for i in top_matches],
                                     'Job Title': [vacancies_df.iloc[i]['Job Title'] for i in top_matches],
                                     'Industry': [vacancies_df.iloc[i]['Industry'] for i in top_matches],
                                     'Language': [vacancies_df.iloc[i]['Language'] for i in top_matches],
                                     'Experience': [vacancies_df.iloc[i]['Experience'] for i in top_matches],
                                     'Education Level': [vacancies_df.iloc[i]['Education Level'] for i in top_matches],
                                     'cosine_similarity_score': [similarity_scores[0][i] for i in top_matches]})
    recommendations = recommendations[['VacancyID', 'Job Title', 'Industry', 'Language', 'Experience', 'Education Level', 'cosine_similarity_score']]

    return recommendations

# Prompt the user to input the jobseeker ID
jobseeker_id = int(input("Enter the jobseeker ID: "))

print(f'The characteristics of Jobseeker {jobseeker_id} are:')
row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_j)

# Print the top 10 most similar vacancies
print(f"Top 10 job matches for jobseeker with ID {jobseeker_id}:")
recommended_vacancies = get_top_n_vacancies(jobseeker_id)
print(recommended_vacancies)
#display(recommendations)

Enter the jobseeker ID: 10466
The characteristics of Jobseeker 10466 are:


Unnamed: 0,JobseekerID,Industry,Language,Experience,Education Level
2,10466,Administration,German,10+,PhD


Top 10 job matches for jobseeker with ID 10466:
   VacancyID             Job Title        Industry Language Experience  \
0      26181  Administrative Clerk  Administration   German        0-1   
1      34808  Administrative Clerk  Administration   German        0-1   
2      45516  Administrative Clerk  Administration   German        0-1   
3      28764  Administrative Clerk  Administration   German        0-1   
4      25023  Administrative Clerk  Administration   German        10+   
5      34743  Administrative Clerk  Administration   German        10+   
6      36153  Administrative Clerk  Administration   German        2-4   
7      38808  Administrative Clerk  Administration   German        0-1   
8      31567  Administrative Clerk  Administration   German        0-1   
9      33811  Administrative Clerk  Administration   German        10+   

  Education Level  cosine_similarity_score  
0     High school                 0.941206  
1     High school                 0.941206  
2 

###Evaluation

Kendall_corr and spearman_corr

In [None]:
#one particular jobseeker
jobseeker_id = 10466
from scipy.stats import kendalltau, spearmanr

# Get the actual vacancies that the jobseeker applied for
actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
actual_vacancies = actual_vacancies[indices]  
print("actual_vacancies", actual_vacancies)

# Get the recommended vacancies
recommended_vacancies = recommendations['VacancyID'].values.tolist()
print("recommended_vacancies", recommended_vacancies)

# Find the common vacancies in actual and recommended vacancies
common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
print("common", common_vacancies)

# Keep only the common vacancies in actual and recommended vacancies
actual_vacancies2 = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
print("actual_vacancies2", actual_vacancies2)
recommended_vacancies2 = [v for v in recommended_vacancies if v in common_vacancies]
print("recommended_vacancies2", recommended_vacancies2)

# Calculate Kendall rank correlation
kendall_corr, kendall_pvalue = kendalltau(actual_vacancies2, recommended_vacancies2)
kendall_corr = round(kendall_corr, 4)
kendall_pvalue = round(kendall_pvalue, 4)
print(f"Kendall rank correlation: {kendall_corr}, p-value: {kendall_pvalue}")

# Calculate Spearman rank correlation
spearman_corr, spearman_pvalue = spearmanr(actual_vacancies2, recommended_vacancies2)
spearman_corr = round(spearman_corr, 4)
spearman_pvalue = round(spearman_pvalue, 4)
print(f"Spearman rank correlation: {spearman_corr}, p-value: {spearman_pvalue}")

actual_vacancies [46131 34808 33811 25023 34743 38808 36153 45516 38183 31567 35983 28764]
recommended_vacancies [46131, 29753, 38808, 31567, 48966, 31106, 38375, 44216, 46996, 30961]
common [31567 38808 46131]
actual_vacancies2 [46131 38808 31567]
recommended_vacancies2 [46131, 38808, 31567]
Kendall rank correlation: 1.0, p-value: 0.3333
Spearman rank correlation: 1.0, p-value: 0.0


In [None]:
# Create empty lists to store the correlations and p-values
kendall_corrs = []
kendall_pvalues = []
spearman_corrs = []
spearman_pvalues = []

# Iterate over each unique jobseeker ID
for jobseeker_id in matches_df['JobseekerID'].unique():
    
    # Get the actual vacancies that the jobseeker applied for
    actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
    indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
    actual_vacancies = actual_vacancies[indices][1:]  # remove the highest rated vacancyID

    # Get the recommended vacancies
    recommended_vacancies = get_top_n_vacancies(jobseeker_id)['VacancyID'].values.tolist() #list of the vacancyID of the recommended vacancies for a particular jobseeker

    # Find the common vacancies in actual and recommended vacancies
    common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
    
     # Only calculate correlations if there are at least 2 common vacancies //explanation on this in doc notes inside model
    if len(common_vacancies) >= 2:

        # Keep only the common vacancies in actual and recommended vacancies
        actual_vacancies = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
        recommended_vacancies = [v for v in recommended_vacancies if v in common_vacancies]

        # Calculate Kendall rank correlation
        kendall_corr, kendall_pvalue = kendalltau(actual_vacancies, recommended_vacancies)
        kendall_corrs.append(kendall_corr)
        kendall_pvalues.append(kendall_pvalue)

        # Calculate Spearman rank correlation
        spearman_corr, spearman_pvalue = spearmanr(actual_vacancies, recommended_vacancies)
        spearman_corrs.append(spearman_corr)
        spearman_pvalues.append(spearman_pvalue)


# Print the average correlations and p-values
print(f"Average Kendall rank correlation: {np.mean(kendall_corrs)}, p-value: {np.mean(kendall_pvalues)}")
print(f"Average Spearman rank correlation: {np.mean(spearman_corrs)}, p-value: {np.mean(spearman_pvalues)}")

Average Kendall rank correlation: 0.024462521199565348, p-value: 0.7061129256979918
Average Spearman rank correlation: 0.027859298588665194, p-value: nan


##NOISE 20%

In [None]:
job_title = ['Truck Driver', 'Janitor', 'Cleaner', 'Administrative Clerk', 'Cashier', 'Data Analyst', 'Finance Manager', 'Professor']
education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
industry = ['Transport', 'Logistics', 'Cleaning', 'Hotels and restaurants', 'Construction', 'Manufacturing', 'Financial intermediation', 'Pharmaceutical', 'Health and social services', 'Wholesale and retail', 'Technology', 'Research', 'Administration', 'Education', 'Energy', 'Agriculture']

# Define a function to preprocess the text data
random.seed(42)
def preprocess_textnoise(text, noise_level=0.2):
    # Remove "High School" to "high_school"
    # Replace integer ranges like "5-10" with "five_to_ten"
    text = re.sub(r'\b(\d+)-(\d+)\b', lambda match: f"{num2words(int(match.group(1)))}_to_{num2words(int(match.group(2)))}", text)
    # Replace "10+" with "ten_plus"
    text = re.sub(r'10\+', 'ten_plus', text)
    # Split the text into words
    words = text.split()
    # Replace some words with synonyms 
    for i, word in enumerate(words):
        # Randomly replace some job titles, education levels, or industries with a synonym
        if random.random() < noise_level and word in job_title + education_levels + industry:
            synsets = wordnet.synsets(word)
            if synsets:
                synonyms = synsets[0].lemma_names()
                if synonyms:
                    words[i] = random.choice(synonyms)
    # Join the words back into a string
    text_with_noise = ' '.join(words)
    return text_with_noise

In [None]:

for text in vacancies_df['VacancyDescription']:
    preprocessed_textnoise = preprocess_textnoise(text)
    print(preprocessed_textnoise[:150])

We are looking for an additional Cashier to strengthen our team in the Wholesale and retail industry with knowledge of French and five_to_ten years of
We are looking for an additional Data Analyst to strengthen our team in the disposal industry with knowledge of Dutch and zero_to_one years of experie
We are looking for an additional Janitor to strengthen our team in the Cleaning industry with knowledge of French and ten_plus years of experience, pr
We are looking for an additional Finance Manager to strengthen our team in the Cleaning industry with knowledge of Dutch and five_to_ten years of expe
We are looking for an additional Data Analyst to strengthen our team in the Wholesale and retail industry with knowledge of German and zero_to_one yea
We are looking for an additional Janitor to strengthen our team in the Cleaning industry with knowledge of German and two_to_four years of experience,
We are looking for an additional Cashier to strengthen our team in the Wholesale and retail in

###TF-IDF

In [None]:

# Create a TfidfVectorizer object with your preferred settings
tfidf = TfidfVectorizer(max_df=0.7, min_df=2, stop_words='english', preprocessor=preprocess_textnoise)

# Use the vectorizer to transform your text data
vacancies_tfidf = tfidf.fit_transform(vacancies_df['VacancyDescription'])

In [None]:
# Extract the feature names from the fitted vectorizer object
feature_names = list(tfidf.vocabulary_.keys())

# Print the feature names that have been extracted from the text data
print(feature_names)

['Cashier', 'Wholesale', 'retail', 'French', 'five_to_ten', 'High', 'School', 'Data', 'Analyst', 'Administration', 'Dutch', 'zero_to_one', 'bachelor', 'Janitor', 'Cleaning', 'ten_plus', 'Finance', 'Manager', 'Master', 'German', 'janitor', 'two_to_four', 'bank_clerk', 'Manufacturing', 'Spanish', 'Bachelor', 'Research', 'Logistics', 'Cleaner', 'Administrative', 'Clerk', 'Truck', 'Driver', 'transport', 'conveyance', 'cashier', 'Professor', 'Ph', 'Education', 'unmarried_man', 'Pharmaceutical', 'maestro', 'teller', 'PhD', 'Transport', 'English', 'research', 'cleaning', 'Agriculture', 'master', 'cleaner', 'administration', 'Technology', 'school', 'Hotels', 'restaurants', 'Construction', 'Financial', 'intermediation', 'cleanup', 'factory_farm', 'Health', 'social', 'services', 'Energy', 'cleansing', 'prof', 'professor', 'pharmaceutic', 'logistics', 'cleanser', 'disposal']


In [None]:
vacancies_tfidf.shape

(500, 72)

####Top-matched vacancy 


Before recommending, the similarity matrix has to be computed.

In [None]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(vacancies_tfidf, vacancies_tfidf)

# Since you have used the TF-IDF vectorizer, calculating the dot product between each vector will directly give you the cosine similarity score. 
# Therefore, you will use sklearn's linear_kernel() instead of cosine_similarities() since it is faster.
# Each vacancy will be a 1x500 column vector where each column will be a similarity score with each vacancy.

Recommends the top 10 most similar vacancies to the top-rated vacancy that the jobseeker has already rated

In [None]:
#Prompt the user to input the jobseeker ID
jobseeker_id = int(input("Enter the jobseeker ID: "))
       
# Retrieve the VacancyID of the job vacancies matched with the given JobseekerID
matches_for_jobseeker = matches_df[matches_df['JobseekerID'] == jobseeker_id]

# Sort the matches by rating in descending order
sorted_matches = matches_for_jobseeker.sort_values(by='Rating', ascending=False)

# Select the VacancyID of the top match for the jobseeker
top_match_vacancy_id = sorted_matches.iloc[0]['VacancyID']
print(f"The top match for jobseeker {jobseeker_id} is Vacancy {top_match_vacancy_id}")

# Find the row with Jobseeker and print the JobTitle and Industry columns
print(f'The characteristics of Jobseeker {jobseeker_id} are:')
row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_j)

# Find the row with the highest rated VacancyID and print the JobTitle and Industry columns
print(f'The characteristics of the highest rated Vacancy {top_match_vacancy_id} are:')
row_v = vacancies_df.loc[vacancies_df['VacancyID'] == top_match_vacancy_id, ['VacancyID','Job Title', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_v)

# Check if there are any matches for the jobseeker
if not sorted_matches.empty:
    # Get the VacancyIDs that the jobseeker has already rated
    rated_vacancy_ids = sorted_matches['VacancyID'].values

    # Define the modified get_recommendations function
    def get_recommendations(VacancyID, cosine_sim=cosine_sim, min_similarity=0.4): #ADDED: min_similarity=...

        # Construct a reverse map of indices and VacancyIDs
        indices = pd.Series(vacancies_df.index, index=vacancies_df['VacancyID']).drop_duplicates()
            
        # Get the index of the movie that matches the VacancyID (the highest rated one)
        idx = indices[VacancyID]

        # Get the pairwise similarity scores of all vacancies with that vacancy
        sim_scores = list(enumerate(cosine_sim[idx]))
        # ADDED: Filter the similarity scores by the minimum threshold
        sim_scores = [score for score in sim_scores if score[1] >= min_similarity]
       
        # Sort the vacancies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores and indices of the 10 most similar vacancies
        sim_scores = sim_scores[1:11]
        vacancy_indices = [i[0] for i in sim_scores] #NEW
        cosine_sim_scores = [i[1] for i in sim_scores] #NE
      
        # Get the details of the most similar vacancies, including the cosine similarity scores
        vacancies = pd.DataFrame(list(zip(vacancy_indices, cosine_sim_scores)), columns=['index', 'cosine_similarity_score'])
        vacancies = vacancies.merge(vacancies_df[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level']], how='left', left_on='index', right_index=True)

        # Return the top 10 most similar vacancies
        return vacancies[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level', 'cosine_similarity_score']]

    # Get the recommendations for the jobseeker
    recommendations = get_recommendations(top_match_vacancy_id, cosine_sim=cosine_sim, min_similarity=0.4)
  

    print(f"Recommended vacancies based on Vacancy {top_match_vacancy_id} are:")
    display(recommendations)
else:
    print(f"No matches found for jobseeker {jobseeker_id}")

Enter the jobseeker ID: 10466
The top match for jobseeker 10466 is Vacancy 46131
The characteristics of Jobseeker 10466 are:


Unnamed: 0,JobseekerID,Industry,Language,Experience,Education Level
2,10466,Administration,German,10+,PhD


The characteristics of the highest rated Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level
225,46131,Administrative Clerk,Administration,German,10+,Bachelor


Recommended vacancies based on Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level,cosine_similarity_score
0,29753,Administrative Clerk,Administration,German,10+,Bachelor,1.0
1,38808,Administrative Clerk,Administration,German,0-1,Bachelor,0.88491
2,31106,Administrative Clerk,Administration,French,10+,Bachelor,0.859394
3,38375,Administrative Clerk,Administration,English,10+,Bachelor,0.858525
4,46996,Administrative Clerk,Administration,English,10+,Bachelor,0.858525
5,30961,Administrative Clerk,Administration,Spanish,10+,Bachelor,0.857635
6,41829,Administrative Clerk,Administration,Dutch,10+,Bachelor,0.852848
7,46802,Administrative Clerk,Administration,English,5-10,Bachelor,0.757459
8,43115,Administrative Clerk,Administration,English,5-10,Bachelor,0.757459
9,40537,Administrative Clerk,Administration,English,5-10,Bachelor,0.757459


###Evaluation

> 






Kendall_corr and spearman_corr

In [None]:
#For particular jobseeker

jobseeker_id = 10466
from scipy.stats import kendalltau, spearmanr

# Get the actual vacancies that the jobseeker applied for
actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
actual_vacancies = actual_vacancies[indices]  
print("actual_vacancies", actual_vacancies)

# Get the recommended vacancies
recommended_vacancies = recommendations['VacancyID'].values
print("recommended_vacancies", recommended_vacancies)

# Find the common vacancies in actual and recommended vacancies
common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
print("common", common_vacancies)

# Keep only the common vacancies in actual and recommended vacancies
actual_vacancies2 = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
recommended_vacancies2 = recommended_vacancies[np.isin(recommended_vacancies, common_vacancies)]
print("actual_vacancies2", actual_vacancies2)
print("recommended_vacancies2", recommended_vacancies2)

# Calculate Kendall rank correlation
kendall_corr, kendall_pvalue = kendalltau(actual_vacancies2, recommended_vacancies2)
kendall_corr = round(kendall_corr, 4)
kendall_pvalue = round(kendall_pvalue, 4)
print(f"Kendall rank correlation: {kendall_corr}, p-value: {kendall_pvalue}")

# Calculate Spearman rank correlation
spearman_corr, spearman_pvalue = spearmanr(actual_vacancies2, recommended_vacancies2)
spearman_corr = round(spearman_corr, 4)
spearman_pvalue = round(spearman_pvalue, 4)
print(f"Spearman rank correlation: {spearman_corr}, p-value: {spearman_pvalue}")

actual_vacancies [46131 34808 33811 25023 34743 38808 36153 45516 38183 31567 35983 28764]
recommended_vacancies [29753 38808 31106 38375 46996 30961 41829 46802 43115 40537]
common [38808]
actual_vacancies2 [38808]
recommended_vacancies2 [38808]
Kendall rank correlation: nan, p-value: nan
Spearman rank correlation: nan, p-value: nan


In [None]:
#For whole model
# Create empty lists to store the correlations and p-values
kendall_corrs = []
kendall_pvalues = []
spearman_corrs = []
spearman_pvalues = []

# Iterate over each unique jobseeker ID
for jobseeker_id in matches_df['JobseekerID'].unique():
    
    # Get the actual vacancies that the jobseeker applied for
    actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
    indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])

    # Select the VacancyID of the top match for the jobseeker
    top_match_vacancy_id= matches_df.loc[matches_df['JobseekerID'] == jobseeker_id, 'VacancyID'].max()
    
    # Get the recommended vacancies based on the top-vacancy for each jobseeker
    recommended_vacancies = get_recommendations(top_match_vacancy_id, cosine_sim=cosine_sim, min_similarity=0.4)['VacancyID'].values

    # Find the common vacancies in actual and recommended vacancies
    common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
    
     # Only calculate correlations if there are at least 2 common vacancies //explanation on this in doc notes inside model
    if len(common_vacancies) >= 2:

        # Keep only the common vacancies in actual and recommended vacancies
        actual_vacancies = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
        recommended_vacancies = recommended_vacancies[np.isin(recommended_vacancies, common_vacancies)]

        # Calculate Kendall rank correlation
        kendall_corr, kendall_pvalue = kendalltau(actual_vacancies, recommended_vacancies)
        kendall_corrs.append(kendall_corr)
        kendall_pvalues.append(kendall_pvalue)

        # Calculate Spearman rank correlation
        spearman_corr, spearman_pvalue = spearmanr(actual_vacancies, recommended_vacancies)
        spearman_corrs.append(spearman_corr)
        spearman_pvalues.append(spearman_pvalue)


# Print the average correlations and p-values
print(f"Average Kendall rank correlation: {np.mean(kendall_corrs)}, p-value: {np.mean(kendall_pvalues)}")
print(f"Average Spearman rank correlation: {np.mean(spearman_corrs)}, p-value: {np.mean(spearman_pvalues)}")

Average Kendall rank correlation: -0.05391746031746033, p-value: 0.7875762169312169
Average Spearman rank correlation: -0.07666920634920635, p-value: nan


###User Profile




In [None]:
def get_top_n_vacancies(jobseeker_id, n=10):
    # Retrieve the VacancyID of the job vacancies matched with the given JobseekerID
    matches_for_jobseeker = matches_df[matches_df['JobseekerID'] == jobseeker_id]
    matched_vacancies = matches_for_jobseeker['VacancyID'].tolist()

    # Find the indices of the matched vacancies in the vacancies_df
    matched_indices = [vacancies_df[vacancies_df['VacancyID'] == vid].index[0] for vid in matched_vacancies]

    # Extract the vectorized data for the matched vacancies
    matched_vectors = vacancies_tfidf[matched_indices]

    # Compute the average values in the vectorized data for each feature.
    user_profile = np.asarray(np.mean(matched_vectors, axis=0))

    # Compute the cosine similarity between the user profile and the vacancies
    similarity_scores = cosine_similarity(user_profile.reshape(1, -1), vacancies_tfidf)

    # Sort the similarity scores from high to low
    sorted_scores_indices = np.argsort(similarity_scores)[0][::-1]

    # Find the row with Jobseeker and print the JobTitle and Industry columns
    row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]

    # Get the top N recommended vacancies
    top_matches = sorted_scores_indices[:n]

    # Create a dataframe of the recommended vacancies
    recommendations = pd.DataFrame({'VacancyID': [vacancies_df.iloc[i]['VacancyID'] for i in top_matches],
                                     'Job Title': [vacancies_df.iloc[i]['Job Title'] for i in top_matches],
                                     'Industry': [vacancies_df.iloc[i]['Industry'] for i in top_matches],
                                     'Language': [vacancies_df.iloc[i]['Language'] for i in top_matches],
                                     'Experience': [vacancies_df.iloc[i]['Experience'] for i in top_matches],
                                     'Education Level': [vacancies_df.iloc[i]['Education Level'] for i in top_matches],
                                     'cosine_similarity_score': [similarity_scores[0][i] for i in top_matches]})
    recommendations = recommendations[['VacancyID', 'Job Title', 'Industry', 'Language', 'Experience', 'Education Level', 'cosine_similarity_score']]

    return recommendations

# Prompt the user to input the jobseeker ID
jobseeker_id = int(input("Enter the jobseeker ID: "))

print(f'The characteristics of Jobseeker {jobseeker_id} are:')
row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_j)

# Print the top 10 most similar vacancies
print(f"Top 10 job matches for jobseeker with ID {jobseeker_id}:")
recommended_vacancies = get_top_n_vacancies(jobseeker_id)
print(recommended_vacancies)
#display(recommendations)

Enter the jobseeker ID: 10466
The characteristics of Jobseeker 10466 are:


Unnamed: 0,JobseekerID,Industry,Language,Experience,Education Level
2,10466,Administration,German,10+,PhD


Top 10 job matches for jobseeker with ID 10466:
   VacancyID             Job Title        Industry Language Experience  \
0      34808  Administrative Clerk  Administration   German        0-1   
1      28764  Administrative Clerk  Administration   German        0-1   
2      35983  Administrative Clerk  Administration   German       5-10   
3      36153  Administrative Clerk  Administration   German        2-4   
4      38808  Administrative Clerk  Administration   German        0-1   
5      29753  Administrative Clerk  Administration   German        10+   
6      46131  Administrative Clerk  Administration   German        10+   
7      25656  Administrative Clerk  Administration   French        0-1   
8      31808  Administrative Clerk  Administration  English        0-1   
9      30108  Administrative Clerk  Administration   French        10+   

  Education Level  cosine_similarity_score  
0     High school                 0.934027  
1     High school                 0.934027  
2 

###Evaluation

Kendall_corr and spearman_corr

In [None]:
#one particular jobseeker
jobseeker_id = 10466
from scipy.stats import kendalltau, spearmanr

# Get the actual vacancies that the jobseeker applied for
actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
actual_vacancies = actual_vacancies[indices]  
print("actual_vacancies", actual_vacancies)

# Get the recommended vacancies
recommended_vacancies = recommendations['VacancyID'].values.tolist()
print("recommended_vacancies", recommended_vacancies)

# Find the common vacancies in actual and recommended vacancies
common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
print("common", common_vacancies)

# Keep only the common vacancies in actual and recommended vacancies
actual_vacancies2 = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
print("actual_vacancies2", actual_vacancies2)
recommended_vacancies2 = [v for v in recommended_vacancies if v in common_vacancies]
print("recommended_vacancies2", recommended_vacancies2)

# Calculate Kendall rank correlation
kendall_corr, kendall_pvalue = kendalltau(actual_vacancies2, recommended_vacancies2)
kendall_corr = round(kendall_corr, 4)
kendall_pvalue = round(kendall_pvalue, 4)
print(f"Kendall rank correlation: {kendall_corr}, p-value: {kendall_pvalue}")

# Calculate Spearman rank correlation
spearman_corr, spearman_pvalue = spearmanr(actual_vacancies2, recommended_vacancies2)
spearman_corr = round(spearman_corr, 4)
spearman_pvalue = round(spearman_pvalue, 4)
print(f"Spearman rank correlation: {spearman_corr}, p-value: {spearman_pvalue}")

actual_vacancies [46131 34808 33811 25023 34743 38808 36153 45516 38183 31567 35983 28764]
recommended_vacancies [29753, 38808, 31106, 38375, 46996, 30961, 41829, 46802, 43115, 40537]
common [38808]
actual_vacancies2 [38808]
recommended_vacancies2 [38808]
Kendall rank correlation: nan, p-value: nan
Spearman rank correlation: nan, p-value: nan


In [None]:
#whole model
#Correct
# Create empty lists to store the correlations and p-values
kendall_corrs = []
kendall_pvalues = []
spearman_corrs = []
spearman_pvalues = []

# Iterate over each unique jobseeker ID
for jobseeker_id in matches_df['JobseekerID'].unique():
    
    # Get the actual vacancies that the jobseeker applied for
    actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
    indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
    actual_vacancies = actual_vacancies[indices][1:]  # remove the highest rated vacancyID

    # Get the recommended vacancies
    recommended_vacancies = get_top_n_vacancies(jobseeker_id)['VacancyID'].values.tolist() #list of the vacancyID of the recommended vacancies for a particular jobseeker

    # Find the common vacancies in actual and recommended vacancies
    common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
    
     # Only calculate correlations if there are at least 2 common vacancies //explanation on this in doc notes inside model
    if len(common_vacancies) >= 2:

        # Keep only the common vacancies in actual and recommended vacancies
        actual_vacancies = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
        recommended_vacancies = [v for v in recommended_vacancies if v in common_vacancies]

        # Calculate Kendall rank correlation
        kendall_corr, kendall_pvalue = kendalltau(actual_vacancies, recommended_vacancies)
        kendall_corrs.append(kendall_corr)
        kendall_pvalues.append(kendall_pvalue)

        # Calculate Spearman rank correlation
        spearman_corr, spearman_pvalue = spearmanr(actual_vacancies, recommended_vacancies)
        spearman_corrs.append(spearman_corr)
        spearman_pvalues.append(spearman_pvalue)


# Print the average correlations and p-values
print(f"Average Kendall rank correlation: {np.mean(kendall_corrs)}, p-value: {np.mean(kendall_pvalues)}")
print(f"Average Spearman rank correlation: {np.mean(spearman_corrs)}, p-value: {np.mean(spearman_pvalues)}")

Average Kendall rank correlation: 0.023955705074292122, p-value: 0.7147908503766893
Average Spearman rank correlation: 0.02583890387717132, p-value: nan


##NOISE 30%

In [None]:


job_title = ['Truck Driver', 'Janitor', 'Cleaner', 'Administrative Clerk', 'Cashier', 'Data Analyst', 'Finance Manager', 'Professor']
education_levels = ['High School', 'Bachelor', 'Master', 'PhD']
industry = ['Transport', 'Logistics', 'Cleaning', 'Hotels and restaurants', 'Construction', 'Manufacturing', 'Financial intermediation', 'Pharmaceutical', 'Health and social services', 'Wholesale and retail', 'Technology', 'Research', 'Administration', 'Education', 'Energy', 'Agriculture']

# Define a function to preprocess the text data
random.seed(42)
def preprocess_textnoise(text, noise_level=0.3):
    # Remove "High School" to "high_school"
    # Replace integer ranges like "5-10" with "five_to_ten"
    text = re.sub(r'\b(\d+)-(\d+)\b', lambda match: f"{num2words(int(match.group(1)))}_to_{num2words(int(match.group(2)))}", text)
    # Replace "10+" with "ten_plus"
    text = re.sub(r'10\+', 'ten_plus', text)
    # Split the text into words
    words = text.split()
    # Replace some words with synonyms 
    for i, word in enumerate(words):
        # Randomly replace some job titles, education levels, or industries with a synonym
        if random.random() < noise_level and word in job_title + education_levels + industry:
            synsets = wordnet.synsets(word)
            if synsets:
                synonyms = synsets[0].lemma_names()
                if synonyms:
                    words[i] = random.choice(synonyms)
    # Join the words back into a string
    text_with_noise = ' '.join(words)
    return text_with_noise

In [None]:

for text in vacancies_df['VacancyDescription']:
    preprocessed_textnoise = preprocess_textnoise(text)
    print(preprocessed_textnoise[:150])

We are looking for an additional Cashier to strengthen our team in the Wholesale and retail industry with knowledge of French and five_to_ten years of
We are looking for an additional Data Analyst to strengthen our team in the disposal industry with knowledge of Dutch and zero_to_one years of experie
We are looking for an additional Janitor to strengthen our team in the Cleaning industry with knowledge of French and ten_plus years of experience, pr
We are looking for an additional Finance Manager to strengthen our team in the cleaning industry with knowledge of Dutch and five_to_ten years of expe
We are looking for an additional Data Analyst to strengthen our team in the Wholesale and retail industry with knowledge of German and zero_to_one yea
We are looking for an additional Janitor to strengthen our team in the cleaning industry with knowledge of German and two_to_four years of experience,
We are looking for an additional Cashier to strengthen our team in the Wholesale and retail in

###TF-IDF

####Top-matched vacancy 


Before recommending, the similarity matrix has to be computed.

In [None]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(vacancies_tfidf, vacancies_tfidf)

# Since you have used the TF-IDF vectorizer, calculating the dot product between each vector will directly give you the cosine similarity score. 
# Therefore, you will use sklearn's linear_kernel() instead of cosine_similarities() since it is faster.
# Each vacancy will be a 1x500 column vector where each column will be a similarity score with each vacancy.

In [None]:
cosine_sim.shape

(500, 500)

In [None]:
cosine_sim[1]

# cosine_sim[1] gives you an array containing the cosine similarity scores between the first vacancy and all other vacancies in the vacancies_df. 
# The length of the array will be the same as the number of rows in vacancies_df.

array([0.        , 1.        , 0.        , 0.13597015, 0.40291001,
       0.        , 0.        , 0.39013514, 0.        , 0.        ,
       0.        , 0.13418497, 0.13153121, 0.37893587, 0.        ,
       0.42504628, 0.        , 0.        , 0.        , 0.        ,
       0.62846264, 0.13645389, 0.        , 0.        , 0.        ,
       0.        , 0.36665369, 0.2509991 , 0.        , 0.12767155,
       0.        , 0.13984538, 0.        , 0.11540319, 0.40108048,
       0.1215685 , 0.        , 0.        , 0.        , 0.        ,
       0.12144852, 0.17860185, 0.        , 0.        , 0.        ,
       0.4005423 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.28774436, 0.        , 0.12722107, 0.25008387,
       0.        , 0.29842834, 0.        , 0.        , 0.        ,
       0.10698556, 0.        , 0.        , 0.        , 0.27017502,
       0.        , 0.13226226, 0.        , 0.14822572, 0.13984538,
       0.        , 0.        , 0.24072798, 0.        , 0.26192

Recommends the top 10 most similar vacancies to the top-rated vacancy that the jobseeker has already rated

In [None]:
#Prompt the user to input the jobseeker ID
jobseeker_id = int(input("Enter the jobseeker ID: "))
       
# Retrieve the VacancyID of the job vacancies matched with the given JobseekerID
matches_for_jobseeker = matches_df[matches_df['JobseekerID'] == jobseeker_id]

# Sort the matches by rating in descending order
sorted_matches = matches_for_jobseeker.sort_values(by='Rating', ascending=False)

# Select the VacancyID of the top match for the jobseeker
top_match_vacancy_id = sorted_matches.iloc[0]['VacancyID']
print(f"The top match for jobseeker {jobseeker_id} is Vacancy {top_match_vacancy_id}")

# Find the row with Jobseeker and print the JobTitle and Industry columns
print(f'The characteristics of Jobseeker {jobseeker_id} are:')
row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_j)

# Find the row with the highest rated VacancyID and print the JobTitle and Industry columns
print(f'The characteristics of the highest rated Vacancy {top_match_vacancy_id} are:')
row_v = vacancies_df.loc[vacancies_df['VacancyID'] == top_match_vacancy_id, ['VacancyID','Job Title', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_v)

# Check if there are any matches for the jobseeker
if not sorted_matches.empty:
    # Get the VacancyIDs that the jobseeker has already rated
    rated_vacancy_ids = sorted_matches['VacancyID'].values

    # Define the modified get_recommendations function
    def get_recommendations(VacancyID, cosine_sim=cosine_sim, min_similarity=0.4): #ADDED: min_similarity=...

        # Construct a reverse map of indices and VacancyIDs
        indices = pd.Series(vacancies_df.index, index=vacancies_df['VacancyID']).drop_duplicates()
            
        # Get the index of the movie that matches the VacancyID (the highest rated one)
        idx = indices[VacancyID]

        # Get the pairwise similarity scores of all vacancies with that vacancy
        sim_scores = list(enumerate(cosine_sim[idx]))
        # ADDED: Filter the similarity scores by the minimum threshold
        sim_scores = [score for score in sim_scores if score[1] >= min_similarity]
       
        # Sort the vacancies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores and indices of the 10 most similar vacancies
        sim_scores = sim_scores[1:11]
        vacancy_indices = [i[0] for i in sim_scores] #NEW
        cosine_sim_scores = [i[1] for i in sim_scores] #NE
      
        # Get the details of the most similar vacancies, including the cosine similarity scores
        vacancies = pd.DataFrame(list(zip(vacancy_indices, cosine_sim_scores)), columns=['index', 'cosine_similarity_score'])
        vacancies = vacancies.merge(vacancies_df[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level']], how='left', left_on='index', right_index=True)

        # Return the top 10 most similar vacancies
        return vacancies[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level', 'cosine_similarity_score']]

    # Get the recommendations for the jobseeker
    recommendations = get_recommendations(top_match_vacancy_id, cosine_sim=cosine_sim, min_similarity=0.4)
  

    print(f"Recommended vacancies based on Vacancy {top_match_vacancy_id} are:")
    display(recommendations)
else:
    print(f"No matches found for jobseeker {jobseeker_id}")

Enter the jobseeker ID: 10466
The top match for jobseeker 10466 is Vacancy 46131
The characteristics of Jobseeker 10466 are:


Unnamed: 0,JobseekerID,Industry,Language,Experience,Education Level
2,10466,Administration,German,10+,PhD


The characteristics of the highest rated Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level
225,46131,Administrative Clerk,Administration,German,10+,Bachelor


Recommended vacancies based on Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level,cosine_similarity_score
0,29753,Administrative Clerk,Administration,German,10+,Bachelor,1.0
1,38808,Administrative Clerk,Administration,German,0-1,Bachelor,0.88491
2,31106,Administrative Clerk,Administration,French,10+,Bachelor,0.859394
3,38375,Administrative Clerk,Administration,English,10+,Bachelor,0.858525
4,46996,Administrative Clerk,Administration,English,10+,Bachelor,0.858525
5,30961,Administrative Clerk,Administration,Spanish,10+,Bachelor,0.857635
6,41829,Administrative Clerk,Administration,Dutch,10+,Bachelor,0.852848
7,46802,Administrative Clerk,Administration,English,5-10,Bachelor,0.757459
8,43115,Administrative Clerk,Administration,English,5-10,Bachelor,0.757459
9,40537,Administrative Clerk,Administration,English,5-10,Bachelor,0.757459


In [None]:

# Create a TfidfVectorizer object with your preferred settings
tfidf = TfidfVectorizer(max_df=0.7, min_df=2, stop_words='english', preprocessor=preprocess_textnoise)

# Use the vectorizer to transform your text data
vacancies_tfidf = tfidf.fit_transform(vacancies_df['VacancyDescription'])

In [None]:
# Extract the feature names from the fitted vectorizer object
feature_names = list(tfidf.vocabulary_.keys())

# Print the feature names that have been extracted from the text data
print(feature_names)

['teller', 'Wholesale', 'retail', 'French', 'five_to_ten', 'High', 'School', 'Data', 'Analyst', 'Administration', 'Dutch', 'zero_to_one', 'Bachelor', 'janitor', 'cleaning', 'ten_plus', 'Finance', 'Manager', 'cleanup', 'master', 'German', 'Master', 'Janitor', 'Cleaning', 'two_to_four', 'Cashier', 'Manufacturing', 'Spanish', 'Research', 'bachelor', 'Cleaner', 'cleansing', 'Administrative', 'Clerk', 'Truck', 'Driver', 'Transport', 'cleaner', 'research', 'Professor', 'PhD', 'Education', 'Pharmaceutical', 'transport', 'English', 'professor', 'Agriculture', 'cleansing_agent', 'engineering', 'school', 'cleanser', 'Hotels', 'restaurants', 'Construction', 'administration', 'unmarried_man', 'Financial', 'intermediation', 'factory_farm', 'Technology', 'Health', 'social', 'services', 'pedagogy', 'cashier', 'conveyance', 'instruction', 'construction', 'maestro', 'Energy', 'bank_clerk', 'disposal', 'agriculture', 'pharmaceutic', 'Logistics', 'Ph', 'prof', 'manufacture']


In [None]:
vacancies_tfidf.shape

(500, 78)

###Evaluation

> 






Kendall_corr and spearman_corr

In [None]:
#For particular jobseeker

jobseeker_id = 10466
from scipy.stats import kendalltau, spearmanr

# Get the actual vacancies that the jobseeker applied for
actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
actual_vacancies = actual_vacancies[indices]  
print("actual_vacancies", actual_vacancies)

# Get the recommended vacancies
recommended_vacancies = recommendations['VacancyID'].values
print("recommended_vacancies", recommended_vacancies)

# Find the common vacancies in actual and recommended vacancies
common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
print("common", common_vacancies)

# Keep only the common vacancies in actual and recommended vacancies
actual_vacancies2 = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
recommended_vacancies2 = recommended_vacancies[np.isin(recommended_vacancies, common_vacancies)]
print("actual_vacancies2", actual_vacancies2)
print("recommended_vacancies2", recommended_vacancies2)

# Calculate Kendall rank correlation
kendall_corr, kendall_pvalue = kendalltau(actual_vacancies2, recommended_vacancies2)
kendall_corr = round(kendall_corr, 4)
kendall_pvalue = round(kendall_pvalue, 4)
print(f"Kendall rank correlation: {kendall_corr}, p-value: {kendall_pvalue}")

# Calculate Spearman rank correlation
spearman_corr, spearman_pvalue = spearmanr(actual_vacancies2, recommended_vacancies2)
spearman_corr = round(spearman_corr, 4)
spearman_pvalue = round(spearman_pvalue, 4)
print(f"Spearman rank correlation: {spearman_corr}, p-value: {spearman_pvalue}")

actual_vacancies [46131 34808 33811 25023 34743 38808 36153 45516 38183 31567 35983 28764]
recommended_vacancies [29753 38808 31106 38375 46996 30961 41829 46802 43115 40537]
common [38808]
actual_vacancies2 [38808]
recommended_vacancies2 [38808]
Kendall rank correlation: nan, p-value: nan
Spearman rank correlation: nan, p-value: nan


In [None]:
#For whole model
# Create empty lists to store the correlations and p-values
kendall_corrs = []
kendall_pvalues = []
spearman_corrs = []
spearman_pvalues = []

# Iterate over each unique jobseeker ID
for jobseeker_id in matches_df['JobseekerID'].unique():
    
    # Get the actual vacancies that the jobseeker applied for
    actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
    indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])

    # Select the VacancyID of the top match for the jobseeker
    top_match_vacancy_id= matches_df.loc[matches_df['JobseekerID'] == jobseeker_id, 'VacancyID'].max()
    
    # Get the recommended vacancies based on the top-vacancy for each jobseeker
    recommended_vacancies = get_recommendations(top_match_vacancy_id, cosine_sim=cosine_sim, min_similarity=0.4)['VacancyID'].values

    # Find the common vacancies in actual and recommended vacancies
    common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
    
     # Only calculate correlations if there are at least 2 common vacancies //explanation on this in doc notes inside model
    if len(common_vacancies) >= 2:

        # Keep only the common vacancies in actual and recommended vacancies
        actual_vacancies = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
        recommended_vacancies = recommended_vacancies[np.isin(recommended_vacancies, common_vacancies)]

        # Calculate Kendall rank correlation
        kendall_corr, kendall_pvalue = kendalltau(actual_vacancies, recommended_vacancies)
        kendall_corrs.append(kendall_corr)
        kendall_pvalues.append(kendall_pvalue)

        # Calculate Spearman rank correlation
        spearman_corr, spearman_pvalue = spearmanr(actual_vacancies, recommended_vacancies)
        spearman_corrs.append(spearman_corr)
        spearman_pvalues.append(spearman_pvalue)


# Print the average correlations and p-values
print(f"Average Kendall rank correlation: {np.mean(kendall_corrs)}, p-value: {np.mean(kendall_pvalues)}")
print(f"Average Spearman rank correlation: {np.mean(spearman_corrs)}, p-value: {np.mean(spearman_pvalues)}")

Average Kendall rank correlation: -0.05391746031746033, p-value: 0.7875762169312169
Average Spearman rank correlation: -0.07666920634920635, p-value: nan


###User Profile




In [None]:
def get_top_n_vacancies(jobseeker_id, n=10):
    # Retrieve the VacancyID of the job vacancies matched with the given JobseekerID
    matches_for_jobseeker = matches_df[matches_df['JobseekerID'] == jobseeker_id]
    matched_vacancies = matches_for_jobseeker['VacancyID'].tolist()

    # Find the indices of the matched vacancies in the vacancies_df
    matched_indices = [vacancies_df[vacancies_df['VacancyID'] == vid].index[0] for vid in matched_vacancies]

    # Extract the vectorized data for the matched vacancies
    matched_vectors = vacancies_tfidf[matched_indices]

    # Compute the average values in the vectorized data for each feature.
    user_profile = np.asarray(np.mean(matched_vectors, axis=0))

    # Compute the cosine similarity between the user profile and the vacancies
    similarity_scores = cosine_similarity(user_profile.reshape(1, -1), vacancies_tfidf)

    # Sort the similarity scores from high to low
    sorted_scores_indices = np.argsort(similarity_scores)[0][::-1]

    # Find the row with Jobseeker and print the JobTitle and Industry columns
    row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]

    # Get the top N recommended vacancies
    top_matches = sorted_scores_indices[:n]

    # Create a dataframe of the recommended vacancies
    recommendations = pd.DataFrame({'VacancyID': [vacancies_df.iloc[i]['VacancyID'] for i in top_matches],
                                     'Job Title': [vacancies_df.iloc[i]['Job Title'] for i in top_matches],
                                     'Industry': [vacancies_df.iloc[i]['Industry'] for i in top_matches],
                                     'Language': [vacancies_df.iloc[i]['Language'] for i in top_matches],
                                     'Experience': [vacancies_df.iloc[i]['Experience'] for i in top_matches],
                                     'Education Level': [vacancies_df.iloc[i]['Education Level'] for i in top_matches],
                                     'cosine_similarity_score': [similarity_scores[0][i] for i in top_matches]})
    recommendations = recommendations[['VacancyID', 'Job Title', 'Industry', 'Language', 'Experience', 'Education Level', 'cosine_similarity_score']]

    return recommendations

# Prompt the user to input the jobseeker ID
jobseeker_id = int(input("Enter the jobseeker ID: "))

print(f'The characteristics of Jobseeker {jobseeker_id} are:')
row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_j)

# Print the top 10 most similar vacancies
print(f"Top 10 job matches for jobseeker with ID {jobseeker_id}:")
recommended_vacancies = get_top_n_vacancies(jobseeker_id)
print(recommended_vacancies)
#display(recommendations)

Enter the jobseeker ID: 10466
The characteristics of Jobseeker 10466 are:


Unnamed: 0,JobseekerID,Industry,Language,Experience,Education Level
2,10466,Administration,German,10+,PhD


Top 10 job matches for jobseeker with ID 10466:
   VacancyID             Job Title        Industry Language Experience  \
0      26181  Administrative Clerk  Administration   German        0-1   
1      34808  Administrative Clerk  Administration   German        0-1   
2      45516  Administrative Clerk  Administration   German        0-1   
3      28764  Administrative Clerk  Administration   German        0-1   
4      25023  Administrative Clerk  Administration   German        10+   
5      34743  Administrative Clerk  Administration   German        10+   
6      35983  Administrative Clerk  Administration   German       5-10   
7      46131  Administrative Clerk  Administration   German        10+   
8      25656  Administrative Clerk  Administration   French        0-1   
9      30108  Administrative Clerk  Administration   French        10+   

  Education Level  cosine_similarity_score  
0     High school                 0.934362  
1     High school                 0.934362  
2 

###Evaluation

Kendall_corr and spearman_corr

In [None]:
#one particular jobseeker
jobseeker_id = 10466
from scipy.stats import kendalltau, spearmanr

# Get the actual vacancies that the jobseeker applied for
actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
actual_vacancies = actual_vacancies[indices]  
print("actual_vacancies", actual_vacancies)

# Get the recommended vacancies
recommended_vacancies = recommendations['VacancyID'].values.tolist()
print("recommended_vacancies", recommended_vacancies)

# Find the common vacancies in actual and recommended vacancies
common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
print("common", common_vacancies)

# Keep only the common vacancies in actual and recommended vacancies
actual_vacancies2 = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
print("actual_vacancies2", actual_vacancies2)
recommended_vacancies2 = [v for v in recommended_vacancies if v in common_vacancies]
print("recommended_vacancies2", recommended_vacancies2)

# Calculate Kendall rank correlation
kendall_corr, kendall_pvalue = kendalltau(actual_vacancies2, recommended_vacancies2)
kendall_corr = round(kendall_corr, 4)
kendall_pvalue = round(kendall_pvalue, 4)
print(f"Kendall rank correlation: {kendall_corr}, p-value: {kendall_pvalue}")

# Calculate Spearman rank correlation
spearman_corr, spearman_pvalue = spearmanr(actual_vacancies2, recommended_vacancies2)
spearman_corr = round(spearman_corr, 4)
spearman_pvalue = round(spearman_pvalue, 4)
print(f"Spearman rank correlation: {spearman_corr}, p-value: {spearman_pvalue}")

actual_vacancies [46131 34808 33811 25023 34743 38808 36153 45516 38183 31567 35983 28764]
recommended_vacancies [29753, 38808, 31106, 38375, 46996, 30961, 41829, 46802, 43115, 40537]
common [38808]
actual_vacancies2 [38808]
recommended_vacancies2 [38808]
Kendall rank correlation: nan, p-value: nan
Spearman rank correlation: nan, p-value: nan


In [None]:
#whole model
#Correct
# Create empty lists to store the correlations and p-values
kendall_corrs = []
kendall_pvalues = []
spearman_corrs = []
spearman_pvalues = []

# Iterate over each unique jobseeker ID
for jobseeker_id in matches_df['JobseekerID'].unique():
    
    # Get the actual vacancies that the jobseeker applied for
    actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
    indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
    actual_vacancies = actual_vacancies[indices][1:]  # remove the highest rated vacancyID

    # Get the recommended vacancies
    recommended_vacancies = get_top_n_vacancies(jobseeker_id)['VacancyID'].values.tolist() #list of the vacancyID of the recommended vacancies for a particular jobseeker

    # Find the common vacancies in actual and recommended vacancies
    common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
    
     # Only calculate correlations if there are at least 2 common vacancies //explanation on this in doc notes inside model
    if len(common_vacancies) >= 2:

        # Keep only the common vacancies in actual and recommended vacancies
        actual_vacancies = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
        recommended_vacancies = [v for v in recommended_vacancies if v in common_vacancies]

        # Calculate Kendall rank correlation
        kendall_corr, kendall_pvalue = kendalltau(actual_vacancies, recommended_vacancies)
        kendall_corrs.append(kendall_corr)
        kendall_pvalues.append(kendall_pvalue)

        # Calculate Spearman rank correlation
        spearman_corr, spearman_pvalue = spearmanr(actual_vacancies, recommended_vacancies)
        spearman_corrs.append(spearman_corr)
        spearman_pvalues.append(spearman_pvalue)


# Print the average correlations and p-values
print(f"Average Kendall rank correlation: {np.mean(kendall_corrs)}, p-value: {np.mean(kendall_pvalues)}")
print(f"Average Spearman rank correlation: {np.mean(spearman_corrs)}, p-value: {np.mean(spearman_pvalues)}")

Average Kendall rank correlation: 0.0313698986743562, p-value: 0.722389578123169
Average Spearman rank correlation: 0.036850421795754006, p-value: nan


##NOISE 40%

###TF-IDF

In [None]:
# Create a TfidfVectorizer object with your preferred settings
tfidf = TfidfVectorizer(max_df=0.7, min_df=2, stop_words='english', preprocessor=preprocess_textnoise)

# Use the vectorizer to transform your text data
vacancies_tfidf = tfidf.fit_transform(vacancies_df['VacancyDescription'])

In [None]:
# Extract the feature names from the fitted vectorizer object
feature_names = list(tfidf.vocabulary_.keys())

# Print the feature names that have been extracted from the text data
print(feature_names)

['teller', 'Wholesale', 'retail', 'French', 'five_to_ten', 'High', 'School', 'Data', 'Analyst', 'Administration', 'Dutch', 'zero_to_one', 'unmarried_man', 'janitor', 'cleaning', 'ten_plus', 'Finance', 'Manager', 'cleansing', 'master', 'German', 'Master', 'Cleaning', 'two_to_four', 'Manufacturing', 'Spanish', 'maestro', 'fabrication', 'Bachelor', 'Cashier', 'Research', 'Logistics', 'Cleaner', 'Administrative', 'Clerk', 'administration', 'Truck', 'Driver', 'transport', 'Transport', 'Janitor', 'cleanup', 'cleanser', 'cashier', 'research', 'professor', 'PhD', 'Education', 'Pharmaceutical', 'English', 'Professor', 'prof', 'conveyance', 'disposal', 'factory_farm', 'bank_clerk', 'Ph', 'bachelor', 'Technology', 'school', 'Hotels', 'restaurants', 'Construction', 'cleaner', 'Financial', 'intermediation', 'Agriculture', 'Health', 'social', 'services', 'building', 'construction', 'Energy', 'cleansing_agent', 'didactics', 'manufacture', 'engineering', 'teaching']


In [None]:
vacancies_tfidf.shape

(500, 78)

####Top-matched vacancy 


Before recommending, the similarity matrix has to be computed.

In [None]:
# Compute the cosine similarity matrix
cosine_sim = linear_kernel(vacancies_tfidf, vacancies_tfidf)

# Since you have used the TF-IDF vectorizer, calculating the dot product between each vector will directly give you the cosine similarity score. 
# Therefore, you will use sklearn's linear_kernel() instead of cosine_similarities() since it is faster.
# Each vacancy will be a 1x500 column vector where each column will be a similarity score with each vacancy.

Recommends the top 10 most similar vacancies to the top-rated vacancy that the jobseeker has already rated

In [None]:
#Prompt the user to input the jobseeker ID
jobseeker_id = int(input("Enter the jobseeker ID: "))
       
# Retrieve the VacancyID of the job vacancies matched with the given JobseekerID
matches_for_jobseeker = matches_df[matches_df['JobseekerID'] == jobseeker_id]

# Sort the matches by rating in descending order
sorted_matches = matches_for_jobseeker.sort_values(by='Rating', ascending=False)

# Select the VacancyID of the top match for the jobseeker
top_match_vacancy_id = sorted_matches.iloc[0]['VacancyID']
print(f"The top match for jobseeker {jobseeker_id} is Vacancy {top_match_vacancy_id}")

# Find the row with Jobseeker and print the JobTitle and Industry columns
print(f'The characteristics of Jobseeker {jobseeker_id} are:')
row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_j)

# Find the row with the highest rated VacancyID and print the JobTitle and Industry columns
print(f'The characteristics of the highest rated Vacancy {top_match_vacancy_id} are:')
row_v = vacancies_df.loc[vacancies_df['VacancyID'] == top_match_vacancy_id, ['VacancyID','Job Title', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_v)

# Check if there are any matches for the jobseeker
if not sorted_matches.empty:
    # Get the VacancyIDs that the jobseeker has already rated
    rated_vacancy_ids = sorted_matches['VacancyID'].values

    # Define the modified get_recommendations function
    def get_recommendations(VacancyID, cosine_sim=cosine_sim, min_similarity=0.4): #ADDED: min_similarity=...

        # Construct a reverse map of indices and VacancyIDs
        indices = pd.Series(vacancies_df.index, index=vacancies_df['VacancyID']).drop_duplicates()
            
        # Get the index of the movie that matches the VacancyID (the highest rated one)
        idx = indices[VacancyID]

        # Get the pairwise similarity scores of all vacancies with that vacancy
        sim_scores = list(enumerate(cosine_sim[idx]))
        # ADDED: Filter the similarity scores by the minimum threshold
        sim_scores = [score for score in sim_scores if score[1] >= min_similarity]
       
        # Sort the vacancies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores and indices of the 10 most similar vacancies
        sim_scores = sim_scores[1:11]
        vacancy_indices = [i[0] for i in sim_scores] #NEW
        cosine_sim_scores = [i[1] for i in sim_scores] #NE
      
        # Get the details of the most similar vacancies, including the cosine similarity scores
        vacancies = pd.DataFrame(list(zip(vacancy_indices, cosine_sim_scores)), columns=['index', 'cosine_similarity_score'])
        vacancies = vacancies.merge(vacancies_df[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level']], how='left', left_on='index', right_index=True)

        # Return the top 10 most similar vacancies
        return vacancies[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level', 'cosine_similarity_score']]

    # Get the recommendations for the jobseeker
    recommendations = get_recommendations(top_match_vacancy_id, cosine_sim=cosine_sim, min_similarity=0.4)
  

    print(f"Recommended vacancies based on Vacancy {top_match_vacancy_id} are:")
    display(recommendations)
else:
    print(f"No matches found for jobseeker {jobseeker_id}")

Enter the jobseeker ID: 10466
The top match for jobseeker 10466 is Vacancy 46131
The characteristics of Jobseeker 10466 are:


Unnamed: 0,JobseekerID,Industry,Language,Experience,Education Level
2,10466,Administration,German,10+,PhD


The characteristics of the highest rated Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level
225,46131,Administrative Clerk,Administration,German,10+,Bachelor


Recommended vacancies based on Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level,cosine_similarity_score
0,29753,Administrative Clerk,Administration,German,10+,Bachelor,1.0
1,38808,Administrative Clerk,Administration,German,0-1,Bachelor,0.90929
2,43115,Administrative Clerk,Administration,English,5-10,Bachelor,0.809706
3,33811,Administrative Clerk,Administration,German,10+,Bachelor,0.764119
4,34743,Administrative Clerk,Administration,German,10+,High school,0.705532
5,38375,Administrative Clerk,Administration,English,10+,Bachelor,0.646823
6,46996,Administrative Clerk,Administration,English,10+,Bachelor,0.646823
7,30961,Administrative Clerk,Administration,Spanish,10+,Bachelor,0.641679
8,41829,Administrative Clerk,Administration,Dutch,10+,Bachelor,0.640132
9,49041,Administrative Clerk,Administration,Dutch,10+,Bachelor,0.640132


###Evaluation

> 






Kendall_corr and spearman_corr

In [None]:
#For particular jobseeker

jobseeker_id = 10466
from scipy.stats import kendalltau, spearmanr

# Get the actual vacancies that the jobseeker applied for
actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
actual_vacancies = actual_vacancies[indices]  
print("actual_vacancies", actual_vacancies)

# Get the recommended vacancies
recommended_vacancies = recommendations['VacancyID'].values
print("recommended_vacancies", recommended_vacancies)

# Find the common vacancies in actual and recommended vacancies
common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
print("common", common_vacancies)

# Keep only the common vacancies in actual and recommended vacancies
actual_vacancies2 = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
recommended_vacancies2 = recommended_vacancies[np.isin(recommended_vacancies, common_vacancies)]
print("actual_vacancies2", actual_vacancies2)
print("recommended_vacancies2", recommended_vacancies2)

# Calculate Kendall rank correlation
kendall_corr, kendall_pvalue = kendalltau(actual_vacancies2, recommended_vacancies2)
kendall_corr = round(kendall_corr, 4)
kendall_pvalue = round(kendall_pvalue, 4)
print(f"Kendall rank correlation: {kendall_corr}, p-value: {kendall_pvalue}")

# Calculate Spearman rank correlation
spearman_corr, spearman_pvalue = spearmanr(actual_vacancies2, recommended_vacancies2)
spearman_corr = round(spearman_corr, 4)
spearman_pvalue = round(spearman_pvalue, 4)
print(f"Spearman rank correlation: {spearman_corr}, p-value: {spearman_pvalue}")

actual_vacancies [46131 34808 33811 25023 34743 38808 36153 45516 38183 31567 35983 28764]
recommended_vacancies [29753 38808 43115 33811 34743 38375 46996 30961 41829 49041]
common [33811 34743 38808]
actual_vacancies2 [33811 34743 38808]
recommended_vacancies2 [38808 33811 34743]
Kendall rank correlation: -0.3333, p-value: 1.0
Spearman rank correlation: -0.5, p-value: 0.6667


In [None]:
#For whole model
# Create empty lists to store the correlations and p-values
kendall_corrs = []
kendall_pvalues = []
spearman_corrs = []
spearman_pvalues = []

# Iterate over each unique jobseeker ID
for jobseeker_id in matches_df['JobseekerID'].unique():
    
    # Get the actual vacancies that the jobseeker applied for
    actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
    indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])

    # Select the VacancyID of the top match for the jobseeker
    top_match_vacancy_id= matches_df.loc[matches_df['JobseekerID'] == jobseeker_id, 'VacancyID'].max()
    
    # Get the recommended vacancies based on the top-vacancy for each jobseeker
    recommended_vacancies = get_recommendations(top_match_vacancy_id, cosine_sim=cosine_sim, min_similarity=0.4)['VacancyID'].values

    # Find the common vacancies in actual and recommended vacancies
    common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
    
     # Only calculate correlations if there are at least 2 common vacancies //explanation on this in doc notes inside model
    if len(common_vacancies) >= 2:

        # Keep only the common vacancies in actual and recommended vacancies
        actual_vacancies = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
        recommended_vacancies = recommended_vacancies[np.isin(recommended_vacancies, common_vacancies)]

        # Calculate Kendall rank correlation
        kendall_corr, kendall_pvalue = kendalltau(actual_vacancies, recommended_vacancies)
        kendall_corrs.append(kendall_corr)
        kendall_pvalues.append(kendall_pvalue)

        # Calculate Spearman rank correlation
        spearman_corr, spearman_pvalue = spearmanr(actual_vacancies, recommended_vacancies)
        spearman_corrs.append(spearman_corr)
        spearman_pvalues.append(spearman_pvalue)


# Print the average correlations and p-values
print(f"Average Kendall rank correlation: {np.mean(kendall_corrs)}, p-value: {np.mean(kendall_pvalues)}")
print(f"Average Spearman rank correlation: {np.mean(spearman_corrs)}, p-value: {np.mean(spearman_pvalues)}")

Average Kendall rank correlation: -0.03589218696690549, p-value: 0.7338517381250238
Average Spearman rank correlation: -0.03229029585222011, p-value: nan


###User Profile




In [None]:
def get_top_n_vacancies(jobseeker_id, n=10):
    # Retrieve the VacancyID of the job vacancies matched with the given JobseekerID
    matches_for_jobseeker = matches_df[matches_df['JobseekerID'] == jobseeker_id]
    matched_vacancies = matches_for_jobseeker['VacancyID'].tolist()

    # Find the indices of the matched vacancies in the vacancies_df
    matched_indices = [vacancies_df[vacancies_df['VacancyID'] == vid].index[0] for vid in matched_vacancies]

    # Extract the vectorized data for the matched vacancies
    matched_vectors = vacancies_tfidf[matched_indices]

    # Compute the average values in the vectorized data for each feature.
    user_profile = np.asarray(np.mean(matched_vectors, axis=0))

    # Compute the cosine similarity between the user profile and the vacancies
    similarity_scores = cosine_similarity(user_profile.reshape(1, -1), vacancies_tfidf)

    # Sort the similarity scores from high to low
    sorted_scores_indices = np.argsort(similarity_scores)[0][::-1]

    # Find the row with Jobseeker and print the JobTitle and Industry columns
    row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]

    # Get the top N recommended vacancies
    top_matches = sorted_scores_indices[:n]

    # Create a dataframe of the recommended vacancies
    recommendations = pd.DataFrame({'VacancyID': [vacancies_df.iloc[i]['VacancyID'] for i in top_matches],
                                     'Job Title': [vacancies_df.iloc[i]['Job Title'] for i in top_matches],
                                     'Industry': [vacancies_df.iloc[i]['Industry'] for i in top_matches],
                                     'Language': [vacancies_df.iloc[i]['Language'] for i in top_matches],
                                     'Experience': [vacancies_df.iloc[i]['Experience'] for i in top_matches],
                                     'Education Level': [vacancies_df.iloc[i]['Education Level'] for i in top_matches],
                                     'cosine_similarity_score': [similarity_scores[0][i] for i in top_matches]})
    recommendations = recommendations[['VacancyID', 'Job Title', 'Industry', 'Language', 'Experience', 'Education Level', 'cosine_similarity_score']]

    return recommendations

# Prompt the user to input the jobseeker ID
jobseeker_id = int(input("Enter the jobseeker ID: "))

print(f'The characteristics of Jobseeker {jobseeker_id} are:')
row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_j)

# Print the top 10 most similar vacancies
print(f"Top 10 job matches for jobseeker with ID {jobseeker_id}:")
recommended_vacancies = get_top_n_vacancies(jobseeker_id)
print(recommended_vacancies)
#display(recommendations)

Enter the jobseeker ID: 10466
The characteristics of Jobseeker 10466 are:


Unnamed: 0,JobseekerID,Industry,Language,Experience,Education Level
2,10466,Administration,German,10+,PhD


Top 10 job matches for jobseeker with ID 10466:
   VacancyID             Job Title        Industry Language Experience  \
0      26181  Administrative Clerk  Administration   German        0-1   
1      45516  Administrative Clerk  Administration   German        0-1   
2      34743  Administrative Clerk  Administration   German        10+   
3      36153  Administrative Clerk  Administration   German        2-4   
4      34808  Administrative Clerk  Administration   German        0-1   
5      28764  Administrative Clerk  Administration   German        0-1   
6      25023  Administrative Clerk  Administration   German        10+   
7      25656  Administrative Clerk  Administration   French        0-1   
8      38808  Administrative Clerk  Administration   German        0-1   
9      30108  Administrative Clerk  Administration   French        10+   

  Education Level  cosine_similarity_score  
0     High school                 0.920883  
1     High school                 0.920883  
2 

###Evaluation

Kendall_corr and spearman_corr

In [None]:
#one particular jobseeker
jobseeker_id = 10466
from scipy.stats import kendalltau, spearmanr

# Get the actual vacancies that the jobseeker applied for
actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
actual_vacancies = actual_vacancies[indices]  
print("actual_vacancies", actual_vacancies)

# Get the recommended vacancies
recommended_vacancies = recommendations['VacancyID'].values.tolist()
print("recommended_vacancies", recommended_vacancies)

# Find the common vacancies in actual and recommended vacancies
common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
print("common", common_vacancies)

# Keep only the common vacancies in actual and recommended vacancies
actual_vacancies2 = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
print("actual_vacancies2", actual_vacancies2)
recommended_vacancies2 = [v for v in recommended_vacancies if v in common_vacancies]
print("recommended_vacancies2", recommended_vacancies2)

# Calculate Kendall rank correlation
kendall_corr, kendall_pvalue = kendalltau(actual_vacancies2, recommended_vacancies2)
kendall_corr = round(kendall_corr, 4)
kendall_pvalue = round(kendall_pvalue, 4)
print(f"Kendall rank correlation: {kendall_corr}, p-value: {kendall_pvalue}")

# Calculate Spearman rank correlation
spearman_corr, spearman_pvalue = spearmanr(actual_vacancies2, recommended_vacancies2)
spearman_corr = round(spearman_corr, 4)
spearman_pvalue = round(spearman_pvalue, 4)
print(f"Spearman rank correlation: {spearman_corr}, p-value: {spearman_pvalue}")

actual_vacancies [46131 34808 33811 25023 34743 38808 36153 45516 38183 31567 35983 28764]
recommended_vacancies [29753, 38808, 43115, 33811, 34743, 38375, 46996, 30961, 41829, 49041]
common [33811 34743 38808]
actual_vacancies2 [33811 34743 38808]
recommended_vacancies2 [38808, 33811, 34743]
Kendall rank correlation: -0.3333, p-value: 1.0
Spearman rank correlation: -0.5, p-value: 0.6667


In [None]:
#whole model
#Correct
# Create empty lists to store the correlations and p-values
kendall_corrs = []
kendall_pvalues = []
spearman_corrs = []
spearman_pvalues = []

# Iterate over each unique jobseeker ID
for jobseeker_id in matches_df['JobseekerID'].unique():
    
    # Get the actual vacancies that the jobseeker applied for
    actual_vacancies = matches_df[matches_df['JobseekerID'] == jobseeker_id]['VacancyID'].values
    indices = np.argsort(-matches_df.loc[(matches_df['JobseekerID'] == jobseeker_id) & (matches_df['VacancyID'].isin(actual_vacancies)), 'Rating'])
    actual_vacancies = actual_vacancies[indices][1:]  # remove the highest rated vacancyID

    # Get the recommended vacancies
    recommended_vacancies = get_top_n_vacancies(jobseeker_id)['VacancyID'].values.tolist() #list of the vacancyID of the recommended vacancies for a particular jobseeker

    # Find the common vacancies in actual and recommended vacancies
    common_vacancies = np.intersect1d(actual_vacancies, recommended_vacancies, assume_unique=True)
    
     # Only calculate correlations if there are at least 2 common vacancies //explanation on this in doc notes inside model
    if len(common_vacancies) >= 2:

        # Keep only the common vacancies in actual and recommended vacancies
        actual_vacancies = actual_vacancies[np.isin(actual_vacancies, common_vacancies)]
        recommended_vacancies = [v for v in recommended_vacancies if v in common_vacancies]

        # Calculate Kendall rank correlation
        kendall_corr, kendall_pvalue = kendalltau(actual_vacancies, recommended_vacancies)
        kendall_corrs.append(kendall_corr)
        kendall_pvalues.append(kendall_pvalue)

        # Calculate Spearman rank correlation
        spearman_corr, spearman_pvalue = spearmanr(actual_vacancies, recommended_vacancies)
        spearman_corrs.append(spearman_corr)
        spearman_pvalues.append(spearman_pvalue)


# Print the average correlations and p-values
print(f"Average Kendall rank correlation: {np.mean(kendall_corrs)}, p-value: {np.mean(kendall_pvalues)}")
print(f"Average Spearman rank correlation: {np.mean(spearman_corrs)}, p-value: {np.mean(spearman_pvalues)}")

Average Kendall rank correlation: -0.021039422216882268, p-value: 0.7202637602803402
Average Spearman rank correlation: -0.02283126531444446, p-value: nan


##Recommending vacancies

In [None]:
jobseeker_id = 10466

# Retrieve the VacancyID of the job vacancies matched with the given JobseekerID
matches_for_jobseeker = matches_df[matches_df['JobseekerID'] == jobseeker_id]

# Sort the matches by rating in descending order
sorted_matches = matches_for_jobseeker.sort_values(by='Rating', ascending=False)

# Select the VacancyID of the top match for the jobseeker
top_match_vacancy_id = sorted_matches.iloc[0]['VacancyID']
print(f"The top match for jobseeker {jobseeker_id} is Vacancy {top_match_vacancy_id}")

# Find the row with Jobseeker and print the JobTitle and Industry columns
print(f'The characteristics of Jobseeker {jobseeker_id} are:')
row_j = jobseekers_df.loc[jobseekers_df['JobseekerID'] == jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_j)

# Find the row with the highest rated VacancyID and print the JobTitle and Industry columns
print(f'The characteristics of the highest rated Vacancy {top_match_vacancy_id} are:')
row_v = vacancies_df.loc[vacancies_df['VacancyID'] == top_match_vacancy_id, ['VacancyID','Job Title', 'Industry', 'Language', 'Experience', 'Education Level']]
display(row_v)


# Check if there are any matches for the jobseeker
if not sorted_matches.empty:
    # Get the VacancyIDs that the jobseeker has already rated
    rated_vacancy_ids = sorted_matches['VacancyID'].values

    # Define the modified get_recommendations function
    def get_recommendations(VacancyID, cosine_sim=cosine_sim, min_similarity=0.4): #ADDED: min_similarity=...
        # Get the index of the movie that matches the VacancyID
        idx = indices[VacancyID]

        # Get the pairwise similarity scores of all movies with that vacancy
        sim_scores = list(enumerate(cosine_sim[idx]))
        # ADDED: Filter the similarity scores by the minimum threshold
        sim_scores = [score for score in sim_scores if score[1] >= min_similarity]
       
        # Sort the vacancies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the scores and indices of the 10 most similar vacancies
        sim_scores = sim_scores[1:11]
        vacancy_indices = [i[0] for i in sim_scores] 
        cosine_sim_scores = [i[1] for i in sim_scores] 

        # Remove the VacancyIDs that the jobseeker has already rated 
        vacancy_indices = [i for i in vacancy_indices if vacancies_df.iloc[i]['VacancyID'] not in rated_vacancy_ids]

       
        # Get the details of the most similar vacancies, including the cosine similarity scores
        vacancies = pd.DataFrame(list(zip(vacancy_indices, cosine_sim_scores)), columns=['index', 'cosine_similarity_score'])
        vacancies = vacancies.merge(vacancies_df[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level']], how='left', left_on='index', right_index=True)
        

        # Return the top 10 most similar vacancies
        return vacancies[['VacancyID', 'Job Title','Industry','Language','Experience','Education Level', 'cosine_similarity_score']]

    # Get the recommendations for the jobseeker
    recommendations = get_recommendations(top_match_vacancy_id, cosine_sim=cosine_sim, min_similarity=0.4)
  

    print(f"Recommended vacancies based on Vacancy {top_match_vacancy_id} are:")
    display(recommendations)
else:
    print(f"No matches found for jobseeker {jobseeker_id}")

The top match for jobseeker 10466 is Vacancy 46131
The characteristics of Jobseeker 10466 are:


Unnamed: 0,JobseekerID,Industry,Language,Experience,Education Level
2,10466,Administration,German,10+,PhD


The characteristics of the highest rated Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level
225,46131,Administrative Clerk,Administration,German,10+,Bachelor


Recommended vacancies based on Vacancy 46131 are:


Unnamed: 0,VacancyID,Job Title,Industry,Language,Experience,Education Level,cosine_similarity_score
0,29753,Administrative Clerk,Administration,German,10+,Bachelor,1.0
1,48966,Administrative Clerk,Administration,French,10+,Bachelor,1.0
2,31106,Administrative Clerk,Administration,French,10+,Bachelor,0.895055
3,38375,Administrative Clerk,Administration,English,10+,Bachelor,0.895055
4,44216,Administrative Clerk,Administration,English,10+,Bachelor,0.878606


##Recommending jobseekers


**Recommending jobseekers to vacancies** 


*Note: Only the last code box is different*

Preprocessing

In [None]:
# Define a function to preprocess the 'Experience' column
def preprocess_experience(exp_str):
    # Split the experience string into a list of integers or a string '10+'
    # Split the experience string into a list of integers or a string '10+'
    # If the first integer is 0 and the second integer is 1, then the experience falls under the 'Exp_0_1' category, 
    # and the corresponding value in exp_dict is set to 1
    exp_list = [int(s.strip()) if s != '10+' else s.strip() for s in exp_str.split('-')]

    # Create a dictionary mapping each experience category to a binary value
    exp_dict = {'Exp_0_1': 0, 'Exp_2_4': 0, 'Exp_5_10': 0, 'Exp_10_plus': 0}
    if exp_list[0] == 0 and exp_list[1] == 1:
        exp_dict['Exp_0_1'] = 1
    elif exp_list[0] == 2 and exp_list[1] == 4:
        exp_dict['Exp_2_4'] = 1
    elif exp_list[0] == 5 and exp_list[1] == 10:
        exp_dict['Exp_5_10'] = 1
    elif len(exp_list) == 1 and exp_list[0] == '10+':
        exp_dict['Exp_10_plus'] = 1
    
    # Return the dictionary
    return exp_dict

In [None]:
# Apply the preprocess_experience function to the 'Experience' column of both the jobseekers_df and vacancies_df
jobseekers_df = pd.concat([jobseekers_df.drop('Experience', axis=1), jobseekers_df['Experience'].apply(preprocess_experience).apply(pd.Series)], axis=1)
vacancies_df = pd.concat([vacancies_df.drop('Experience', axis=1), vacancies_df['Experience'].apply(preprocess_experience).apply(pd.Series)], axis=1)

In [None]:
print(jobseekers_df)

      JobseekerID                    Name Language Education Level  \
0           20473    Joren Van Campenhout    Dutch             PhD   
1           22762            Romy Derycke    Dutch             PhD   
2           10466         Gabriel Lammens   German             PhD   
3           10978          Nicole Bertels   French     High School   
4           15303  Kathleen Vansteenkiste   French     High School   
...           ...                     ...      ...             ...   
8995        18299        Denise Van Herck  Spanish     High school   
8996        17686     Valentina Verhaegen  English     High School   
8997        18212        Yvette De Backer    Dutch             PhD   
8998        23689             Maria Bonte    Dutch     High School   
8999        21446        Sylvia Blommaert   French     High school   

                        Industry  Exp_0_1  Exp_2_4  Exp_5_10  Exp_10_plus  
0                     Technology        0        1         0            0  
1      

In [None]:
# Convert numeric experience columns to text
jobseekers_df['Exp_0_1'] = jobseekers_df['Exp_0_1'].apply(lambda x: 'zero_one' if x == 1 else 'none')
jobseekers_df['Exp_2_4'] = jobseekers_df['Exp_2_4'].apply(lambda x: 'two_four' if x == 1 else 'none')
jobseekers_df['Exp_5_10'] = jobseekers_df['Exp_5_10'].apply(lambda x: 'five_ten' if x == 1 else 'none')
jobseekers_df['Exp_10_plus'] = jobseekers_df['Exp_10_plus'].apply(lambda x: 'ten_plus' if x == 1 else 'none')

# Combine columns into 'content', excluding the "none" values
jobseekers_df['content'] = jobseekers_df[['Language', 'Education Level', 'Industry', 'Exp_0_1', 'Exp_2_4', 'Exp_5_10', 'Exp_10_plus']].astype(str).apply(lambda x: ' '.join([val for val in x if val != 'none']), axis=1)

In [None]:
# Define a function to preprocess the text data
def preprocess_text(text):
    # Convert "High School" to "high_school"
    text = re.sub(r'\bHigh School\b', 'high_school', text) # #added lowercase so that school is not extracted twice, once as upper, once lower case
    # Convert "High school" to "high_school"
    text = re.sub(r'\bHigh\s+school\b', 'high_school', text)
    # Replace integer ranges like "5-10" with "five_to_ten"

    # Return the dictionary
    return text

In [None]:
display(jobseekers_df)

Unnamed: 0,JobseekerID,Name,Language,Education Level,Industry,Exp_0_1,Exp_2_4,Exp_5_10,Exp_10_plus,content
0,20473,Joren Van Campenhout,Dutch,PhD,Technology,none,two_four,none,none,Dutch PhD Technology two_four
1,22762,Romy Derycke,Dutch,PhD,Hotels and restaurants,zero_one,none,none,none,Dutch PhD Hotels and restaurants zero_one
2,10466,Gabriel Lammens,German,PhD,Administration,none,none,none,ten_plus,German PhD Administration ten_plus
3,10978,Nicole Bertels,French,High School,Construction,none,none,none,ten_plus,French High School Construction ten_plus
4,15303,Kathleen Vansteenkiste,French,High School,Manufacturing,zero_one,none,none,none,French High School Manufacturing zero_one
...,...,...,...,...,...,...,...,...,...,...
8995,18299,Denise Van Herck,Spanish,High school,Cleaning,zero_one,none,none,none,Spanish High school Cleaning zero_one
8996,17686,Valentina Verhaegen,English,High School,Health and social services,none,two_four,none,none,English High School Health and social services...
8997,18212,Yvette De Backer,Dutch,PhD,Logistics,none,none,five_ten,none,Dutch PhD Logistics five_ten
8998,23689,Maria Bonte,Dutch,High School,Research,none,none,none,ten_plus,Dutch High School Research ten_plus


In [None]:
# Step 2: Define a similarity metric to measure the similarity between jobseekers and vacancies
# Create a TfidfVectorizer object to convert the 'content' column into a tf-idf matrix
tfidf = TfidfVectorizer(max_df=0.7, min_df=2, stop_words='english')

In [None]:
jobseekers_tfidf = tfidf.fit_transform(jobseekers_df['content'])

In [None]:
jobseekers_tfidf.shape

(9000, 35)

In [None]:
# Extract the feature names from the fitted vectorizer object
feature_names = list(tfidf.vocabulary_.keys())

# Print the feature namesthat have been extracted from the text data
print(feature_names)

['dutch', 'phd', 'technology', 'two_four', 'hotels', 'restaurants', 'zero_one', 'german', 'administration', 'ten_plus', 'french', 'high', 'school', 'construction', 'manufacturing', 'research', 'five_ten', 'pharmaceutical', 'spanish', 'bachelor', 'education', 'master', 'health', 'social', 'services', 'english', 'transport', 'logistics', 'agriculture', 'energy', 'cleaning', 'wholesale', 'retail', 'financial', 'intermediation']


=> 5 (languages) + 4(education levels) + 16 (industries) + 4 (years of experience) = 29

1.  Hotels and restaurants, +1
2.  High school, +1
3.  Health, social, services + 2
4.  Wholesale and retail +1
5.  Financial intermediation, +1



In [None]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(jobseekers_tfidf, jobseekers_tfidf)
# Since you have used the TF-IDF vectorizer, calculating the dot product between each vector will directly give you the cosine similarity score. 
# Therefore, you will use sklearn's linear_kernel() instead of cosine_similarities() since it is faster.
# Each vacancy will be a 1x500 column vector where each column will be a similarity score with each vacancy.

In [None]:
cosine_sim[1]

# cosine_sim[1] gives you an array containing the cosine similarity scores between the first vacancy and all other vacancies in the vacancies_df. 
# The length of the array will be the same as the number of rows in vacancies_df.

array([0.32204352, 1.        , 0.15078848, ..., 0.32368253, 0.16226562,
       0.        ])

In [None]:
#Construct a reverse map of indices and VacancyIDs
indices = pd.Series(jobseekers_df.index, index=jobseekers_df['JobseekerID']).drop_duplicates()

In [None]:
indices[:10]

JobseekerID
20473    0
22762    1
10466    2
10978    3
15303    4
19653    5
10201    6
15237    7
14696    8
15745    9
dtype: int64

In [None]:
vacancy_id = 30356

# Select the JobseekerID of the top match for the Vacancy
top_match_jobseeker_id = sorted_matches.iloc[0]['JobseekerID']
print(f"The top match for Vacancy {vacancy_id } is Jobseeker {top_match_jobseeker_id}")

# Find the row with Vacancy and print the JobTitle and Industry columns
print(f'The characteristics of Vacancy {vacancy_id} are:')
row_j = vacancies_df.loc[vacancies_df['VacancyID'] == vacancy_id, ['VacancyID', 'Industry', 'Language', 'Education Level']]
display(row_j)

# Retrieve the JobseekerID of the jobseekers matched with the given VacancyID
matches_for_vacancy = matches_df[matches_df['VacancyID'] == vacancy_id]

# Sort the matches by rating in descending order
sorted_matches = matches_for_vacancy.sort_values(by='Rating', ascending=False)

# Find the row where the Jobseeker gave the highest rating for the given VacancyID and print the Language, Industry etc. columns
print(f'The characteristics of the Jobseeker who gave the highest rating are {top_match_jobseeker_id} are:')
row_v = jobseekers_df.loc[jobseekers_df['JobseekerID'] == top_match_jobseeker_id, ['JobseekerID', 'Industry', 'Language', 'Education Level']]
display(row_v)

# Check if there are any matches for the vacancy
if not sorted_matches.empty:
    # Get the JobseekerIDs that have already been matched
    matched_jobseeker_ids = sorted_matches['JobseekerID'].values

    # Define the modified get_recommendations function
    def get_recommendations(JobseekerID, cosine_sim=cosine_sim, min_similarity=0.3): #ADDED: min_similarity=...
        # Get the index of the Jobseeker that matches the JobseekerID
        idx = indices[JobseekerID]

        # Get the pairwise similarity scores of all movies with that vacancy
        sim_scores = list(enumerate(cosine_sim[idx]))

        # ADDED: Filter the similarity scores by the minimum threshold
        sim_scores = [score for score in sim_scores if score[1] >= min_similarity]

        # Sort the vacancies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        # Get the scores of the 10 most similar vacancies
        sim_scores = sim_scores[1:11]
        jobseeker_indices = [i[0] for i in sim_scores] #NEW
        cosine_sim_scores = [i[1] for i in sim_scores] #NEW

        # Get the jobseeker indices
        #vacancy_indices = [i[0] for i in sim_scores] REMOVED

        # Remove the JobseekerIDs that have already been matched with the VacancyID
        jobseeker_indices = [i for i in jobseeker_indices if jobseekers_df.iloc[i]['JobseekerID'] not in matched_jobseeker_ids]

        # Get the details of the most similar vacancies, including the cosine similarity scores
        jobseekers = pd.DataFrame(list(zip(jobseeker_indices, cosine_sim_scores)), columns=['index', 'cosine_similarity_score'])
        jobseekers = jobseekers.merge(jobseekers_df[['JobseekerID','Industry','Language', 'Education Level']], how='left', left_on='index', right_index=True)
    
        # Return the top 10 most similar vacancies
        return jobseekers[['JobseekerID', 'Industry','Language','Education Level', 'cosine_similarity_score']]

    # Get the recommendations for the vacancy
    recommendations = get_recommendations(top_match_jobseeker_id, cosine_sim=cosine_sim, min_similarity=0.3)
  
    print(f"Recommended jobseekers for Vacancy {vacancy_id} based on Jobseeker {top_match_jobseeker_id} are:")
    display(recommendations)
else:
    print(f"No matches found for vacancy {vacancy_id}")

The top match for Vacancy 30356 is Jobseeker 10978
The characteristics of Vacancy 30356 are:


Unnamed: 0,VacancyID,Industry,Language,Education Level
296,30356,Technology,Dutch,Master


The characteristics of the Jobseeker who gave the highest rating are 10978 are:


Unnamed: 0,JobseekerID,Industry,Language,Education Level
3,10978,Construction,French,High School


Recommended jobseekers for Vacancy 30356 based on Jobseeker 10978 are:


Unnamed: 0,JobseekerID,Industry,Language,Education Level,cosine_similarity_score
0,23319,Construction,French,High School,1.0
1,17835,Construction,French,High School,1.0
2,18146,Construction,French,High School,1.0
3,19301,Construction,French,High School,1.0
4,23973,Construction,French,High School,1.0
5,10569,Construction,French,High School,1.0
6,13718,Construction,French,High School,1.0
7,13004,Construction,French,High School,0.846446
8,23192,Construction,French,High School,0.846446
9,12606,Construction,French,High School,0.846446
