<a href="https://colab.research.google.com/github/mimomaina/Career-Path-Recommendation-System/blob/main/Career_path_recommendation_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cosine Similarity

In [1]:
import pandas as pd

# Load the preprocessed data
preprocessed_file_path = "preprocessed_data.csv"
df = pd.read_csv(preprocessed_file_path)

print("Preprocessed data loaded successfully!")
print(df.head())


Preprocessed data loaded successfully!
      Experience Qualifications Salary Range    location       Country  \
0  2 to 12 Years            BCA   $56K-$116K    Ashgabat  Turkmenistan   
1  4 to 11 Years            phd    $65K-$91K  Porto-Novo         Benin   
2  4 to 12 Years            MCA    $59K-$93K    Brussels       Belgium   
3  3 to 10 Years            BCA   $57K-$104K      Manama       Bahrain   
4   1 to 8 Years            NaN    $56K-$86K      Banjul        Gambia   

   latitude  longitude  Work Type  Company Size Job Posting Date  ...  \
0   38.9697    59.5563     Intern        100340       2022-12-19  ...   
1    9.3077     2.3158  Full-Time        129896       2023-02-25  ...   
2   50.5039     4.4699  Full-Time         23196       2023-07-25  ...   
3   26.0667    50.5577   Contract        130338       2023-07-01  ...   
4   13.4432   -15.3101  Temporary        127900       2022-05-24  ...   

                                     Company Profile Job Automation Type  \
0

In [2]:
import joblib

# Load the TF-IDF vectorizer and matrix
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
tfidf_matrix = joblib.load("tfidf_matrix.pkl")

print("TF-IDF model loaded successfully!")


TF-IDF model loaded successfully!


In [4]:
# !pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [5]:
import joblib
import numpy as np
import faiss
from sklearn.preprocessing import normalize

In [6]:
# Convert sparse TF-IDF matrix to dense NumPy array (FAISS requires dense format)
tfidf_matrix_dense = tfidf_matrix.toarray().astype(np.float32)

In [7]:
#Normalize the vectors (cosine similarity requires normalized vectors)
tfidf_matrix_dense = normalize(tfidf_matrix_dense, norm='l2')

In [8]:
# Create FAISS index for Cosine Similarity
index = faiss.IndexFlatIP(tfidf_matrix_dense.shape[1])  # IP (Inner Product) for cosine similarity
index.add(tfidf_matrix_dense)

print(f"FAISS index created with {index.ntotal} job descriptions.")

FAISS index created with 292167 job descriptions.


In [19]:
def search_similar_jobs(job_desc, top_k=5):
    """Search for top_k similar job descriptions using FAISS (cosine similarity)."""
    # Transform input job description into TF-IDF vector
    job_vector = tfidf_vectorizer.transform([job_desc]).toarray().astype(np.float32)

    # Normalize the job vector (for cosine similarity)
    job_vector = normalize(job_vector, norm='l2')

    # Search FAISS index
    distances, indices = index.search(job_vector, top_k)

    return indices[0], distances[0]  # Returns indices and similarity scores


In [21]:
# Example search (replace with an actual job description)
query_job = "Data Scientist with experience in Python and Machine Learning"
top_k = 5  # Define top K similar jobs to fetch
similar_jobs, similarities = search_similar_jobs(query_job, top_k)


In [22]:
# Display results
print("\n**Similar Job Listings:**")
for i, (job_id, similarity) in enumerate(zip(similar_jobs, similarities)):
    print(f"Rank {i+1}: Job ID {job_id} | Similarity Score: {similarity:.4f}")


**Similar Job Listings:**
Rank 1: Job ID 803 | Similarity Score: 0.5493
Rank 2: Job ID 678 | Similarity Score: 0.5493
Rank 3: Job ID 656 | Similarity Score: 0.5493
Rank 4: Job ID 481 | Similarity Score: 0.5493
Rank 5: Job ID 437 | Similarity Score: 0.5493


## Euclidean Distance

In [24]:
import faiss
import numpy as np
import joblib
from sklearn.preprocessing import normalize

# Load the TF-IDF vectorizer and matrix
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
tfidf_matrix = joblib.load("tfidf_matrix.pkl")

# Convert to dense format and normalize
tfidf_matrix_dense = tfidf_matrix.toarray().astype(np.float32)
tfidf_matrix_dense = normalize(tfidf_matrix_dense, norm='l2')  # Normalize the vectors

# Create a FAISS index with L2 distance
index = faiss.IndexFlatL2(tfidf_matrix_dense.shape[1])
index.add(tfidf_matrix_dense)  # Add normalized vectors

# Confirm FAISS index size
print(f"FAISS index contains {index.ntotal} job descriptions.")

# Function to find similar jobs using L2 distance
def find_similar_jobs(job_description, top_k=5):
    job_vector = tfidf_vectorizer.transform([job_description]).toarray().astype(np.float32)
    job_vector = normalize(job_vector, norm='l2')  # Normalize the query vector

    distances, indices = index.search(job_vector, top_k)

    return indices[0], distances[0]

# Test with a sample job description
job_description = "Data Scientist with experience in machine learning and NLP."
similar_jobs, distances = find_similar_jobs(job_description, top_k=5)

# Print results
print("\n**Improved Similar Job Listings:**")
for rank, (job_id, distance) in enumerate(zip(similar_jobs, distances), start=1):
    print(f"Rank {rank}: Job ID {job_id} | Euclidean Distance: {distance:.4f}")


FAISS index contains 292167 job descriptions.

**Improved Similar Job Listings:**
Rank 1: Job ID 437 | Euclidean Distance: 0.9580
Rank 2: Job ID 481 | Euclidean Distance: 0.9580
Rank 3: Job ID 656 | Euclidean Distance: 0.9580
Rank 4: Job ID 678 | Euclidean Distance: 0.9580
Rank 5: Job ID 803 | Euclidean Distance: 0.9580


In [25]:
import faiss
import numpy as np
import joblib
from sklearn.preprocessing import normalize

# Load the TF-IDF vectorizer and matrix
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
tfidf_matrix = joblib.load("tfidf_matrix.pkl")

# Convert sparse to dense and normalize
tfidf_matrix_dense = tfidf_matrix.toarray().astype(np.float32)
tfidf_matrix_dense = normalize(tfidf_matrix_dense, norm='l2')  # Normalize the vectors

# Use FAISS IndexHNSWFlat (better structure for L2 search)
d = tfidf_matrix_dense.shape[1]  # Vector dimension
index = faiss.IndexHNSWFlat(d, 32)  # 32 is the number of neighbors stored in memory
index.hnsw.efConstruction = 64  # Tuning parameter for better search accuracy
index.add(tfidf_matrix_dense)

# Confirm FAISS index size
print(f"FAISS index contains {index.ntotal} job descriptions.")

# Function to find similar jobs using L2 distance
def find_similar_jobs(job_description, top_k=5):
    job_vector = tfidf_vectorizer.transform([job_description]).toarray().astype(np.float32)
    job_vector = normalize(job_vector, norm='l2')  # Normalize the query vector

    distances, indices = index.search(job_vector, top_k)

    return indices[0], distances[0]

# Test with a sample job description
job_description = "Data Scientist with experience in machine learning and NLP."
similar_jobs, distances = find_similar_jobs(job_description, top_k=5)

# Print results
print("\n**Improved Similar Job Listings:**")
for rank, (job_id, distance) in enumerate(zip(similar_jobs, distances), start=1):
    print(f"Rank {rank}: Job ID {job_id} | Euclidean Distance: {distance:.4f}")


FAISS index contains 292167 job descriptions.

**Improved Similar Job Listings:**
Rank 1: Job ID 98920 | Euclidean Distance: 0.9580
Rank 2: Job ID 179978 | Euclidean Distance: 0.9580
Rank 3: Job ID 187393 | Euclidean Distance: 0.9580
Rank 4: Job ID 204360 | Euclidean Distance: 0.9580
Rank 5: Job ID 206060 | Euclidean Distance: 0.9580
