In [None]:
!pip install pandas scikit-learn nltk transformers sentence-transformers matplotlib shap


^C


Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.0.1-py3-none-any.whl.metadata (13 kB)
Collecting shap
  Downloading shap-0.47.1.tar.gz (2.5 MB)
     ---------------------------------------- 0.0/2.5 MB ? eta -:--:--
     ---------------- ----------------------- 1.0/2.5 MB 6.9 MB/s eta 0:00:01
     ------------------------- -------------- 1.6/2.5 MB 3.7 MB/s eta 0:00:01
     ---------------------------------------- 2.5/2.5 MB 4.1 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting huggingface-hub<1

DEPRECATION: Loading egg at c:\python313\lib\site-packages\vboxapi-1.0-py3.13.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330
  You can safely remove it manually.
  You can safely remove it manually.


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import re
import unicodedata
import joblib
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sentence_transformers import SentenceTransformer
import shap

# Step 1: Data Preprocessing & Cleaning
def clean_text(text):
    """Normalize free text to lowercase ASCII letters, digits, '.', ',' and single spaces.

    Args:
        text: Raw text. ``None`` and float NaN (what pandas uses for a missing
            cell) are treated as empty; any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values would otherwise make unicodedata.normalize raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = unicodedata.normalize("NFKD", str(text))  # Fix encoding issues
    # NFKD splits accented letters into base letter + combining mark; drop the
    # marks so that e.g. "é" collapses to "e" instead of leaving a gap.
    text = "".join(ch for ch in text if not unicodedata.combining(ch))
    # Replace every other disallowed run (newlines, tabs, slashes, ...) with a
    # space so neighbouring words are not fused together.
    text = re.sub(r'[^a-zA-Z0-9., ]+', ' ', text)
    text = re.sub(r' {2,}', ' ', text)
    return text.lower().strip()

# Load dataset
resume_df = pd.read_csv("UpdatedResumeDataSet.csv")  # Replace with your file path
job_df = pd.read_csv("job_descriptions.csv")  # Replace with your file path

# Combine relevant text fields.
# A missing cell is NaN and `str + NaN` yields NaN for the whole combined value,
# which the cleaning step cannot process, so treat missing pieces as empty text.
# NOTE(review): 'Category' is folded into the text that drives job matching and
# is also the classification target further down — confirm this is intended.
resume_parts = resume_df[['Category', 'Resume']].fillna("").astype(str)
job_parts = job_df[['Job Title', 'Job Description', 'skills']].fillna("").astype(str)

resume_df['Combined_Text'] = resume_parts['Category'] + " " + resume_parts['Resume']
job_df['Combined_Text'] = (
    job_parts['Job Title'] + " " + job_parts['Job Description'] + " " + job_parts['skills']
)

# Clean text fields
resume_df['Combined_Text'] = resume_df['Combined_Text'].apply(clean_text)
job_df['Combined_Text'] = job_df['Combined_Text'].apply(clean_text)

# Step 2: BERT-based Embedding for Semantic Matching
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the cleaned resume texts first, then the cleaned job texts
resume_embeddings, job_embeddings = (
    model.encode(frame['Combined_Text'].tolist()) for frame in (resume_df, job_df)
)

# Pairwise cosine similarity between every resume embedding and every job embedding
similarity_matrix = cosine_similarity(resume_embeddings, job_embeddings)

# For each resume (row), the column index of its highest-scoring job
matches = np.argmax(similarity_matrix, axis=1)

# Step 3: Create Unified Dataset
# Fetch the matched job row for every resume with a single positional take
# rather than one .iloc lookup per field per row.
matched_jobs = job_df.iloc[matches]

# .to_numpy() discards the source indexes so the columns line up by position.
unified_df = pd.DataFrame({
    "Resume_Category": resume_df['Category'].to_numpy(),
    "Resume_Text": resume_df['Resume'].to_numpy(),
    "Matched_Job_Title": matched_jobs['Job Title'].to_numpy(),
    "Matched_Job_Description": matched_jobs['Job Description'].to_numpy(),
    "Matched_Job_Skills": matched_jobs['skills'].to_numpy(),
})

# Kept as a list of per-row dicts for any code that still reads `unified_data`.
unified_data = unified_df.to_dict("records")

# Save the unified dataset
unified_df.to_csv("unified_dataset.csv", index=False)
print("Unified dataset created successfully!")

# Step 4: Apply TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# One document per row: resume text + matched job description + matched job skills.
# A missing cell is NaN and `str + NaN` turns the whole document into NaN, which
# TfidfVectorizer rejects, so blank out missing pieces before concatenating.
# NOTE(review): the vectorizer is fit on every row, before the train/test split
# done later in this notebook, so held-out rows influence the vocabulary and IDF.
tfidf_parts = unified_df[
    ['Resume_Text', 'Matched_Job_Description', 'Matched_Job_Skills']
].fillna("").astype(str)
tfidf_corpus = (
    tfidf_parts['Resume_Text']
    + " " + tfidf_parts['Matched_Job_Description']
    + " " + tfidf_parts['Matched_Job_Skills']
)
tfidf_matrix = tfidf.fit_transform(tfidf_corpus)

# Convert to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# Concatenate TF-IDF with the unified dataset
df_final = pd.concat([unified_df, tfidf_df], axis=1)

# Save the processed dataset
df_final.to_csv("processed_dataset.csv", index=False)

# Step 5: Dimensionality Reduction using PCA
scaler = StandardScaler()
tfidf_scaled = scaler.fit_transform(tfidf_df)

# Keep up to 100 components. PCA cannot produce more components than
# min(n_samples, n_features), so cap the request instead of failing on small data.
n_pca_components = min(100, *tfidf_scaled.shape)
# Seeded so the projection is reproducible when the randomized solver is chosen.
pca = PCA(n_components=n_pca_components, random_state=42)
tfidf_pca = pca.fit_transform(tfidf_scaled)

# Plot cumulative explained variance to decide optimal components.
# The x axis starts at 1 so each point reads as "variance kept with k components".
component_counts = range(1, pca.n_components_ + 1)
plt.figure(figsize=(10, 5))
plt.plot(component_counts, np.cumsum(pca.explained_variance_ratio_), marker='o', linestyle='--')
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")
plt.title("PCA - Explained Variance vs Number of Components")
plt.show()

# Convert PCA results to DataFrame (one PCk column per fitted component)
pca_df = pd.DataFrame(tfidf_pca, columns=[f"PC{k}" for k in component_counts])

# Concatenate with original dataset (keeping Resume_Category for modeling)
df_pca = pd.concat([df_final[['Resume_Category']], pca_df], axis=1)

# Save the reduced dataset
df_pca.to_csv("reduced_dataset.csv", index=False)

# Step 6: Train Random Forest Model
# Features are every column of df_final except the label and the raw text
# columns, i.e. the TF-IDF term columns.
# NOTE(review): the PCA-reduced features computed earlier are not used here, and
# the TF-IDF columns were fit on all rows before this split — confirm intended.
X = df_final.drop(columns=["Resume_Category", "Resume_Text", "Matched_Job_Title", "Matched_Job_Description", "Matched_Job_Skills"])
y = df_final["Resume_Category"]

# Split into training and test sets.
# Stratify so every category keeps its share in both splits. Stratification
# needs at least 2 rows per class and at least one row per class on each side;
# fall back to a plain random split when the data is too small for that.
class_counts = y.value_counts()
n_test_rows = int(np.ceil(0.2 * len(y)))
can_stratify = (
    class_counts.min() >= 2
    and n_test_rows >= len(class_counts)
    and len(y) - n_test_rows >= len(class_counts)
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y if can_stratify else None
)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate the model
accuracy = rf_model.score(X_test, y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Step 7: Hyperparameter Tuning with GridSearchCV
# Candidate values for each Random Forest hyperparameter
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 3-fold cross-validated search over the grid, with n_jobs=-1 for parallel fits
grid_search = GridSearchCV(rf_model, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination
print("Best parameters found: ", grid_search.best_params_)

# Keep the best estimator found by the search
best_rf_model = grid_search.best_estimator_

# Score the tuned model on the held-out test rows
y_pred_best = best_rf_model.predict(X_test)
print(classification_report(y_test, y_pred_best))

# Step 8: Model Interpretation with SHAP
explainer = shap.TreeExplainer(best_rf_model)
shap_values = explainer.shap_values(X_train)

# For a multi-class model, older shap releases return a list with one
# (n_samples, n_features) array per class, while shap >= 0.45 returns a single
# (n_samples, n_features, n_classes) array. summary_plot only treats the list
# form as multi-class, so split a 3-D result into per-class arrays first.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values_per_class = [shap_values[:, :, k] for k in range(shap_values.shape[2])]
else:
    shap_values_per_class = shap_values

# Plot SHAP summary plot (labelled with the real category names when multi-class)
if isinstance(shap_values_per_class, list):
    shap.summary_plot(
        shap_values_per_class,
        X_train,
        class_names=[str(label) for label in best_rf_model.classes_],
    )
else:
    shap.summary_plot(shap_values_per_class, X_train)

# Step 9: Save the Model and Vectorizer
# Destination file -> fitted object, written in this order.
# The PCA and scaler entries are optional extras kept for future use.
artifacts = {
    "best_rf_model.pkl": best_rf_model,
    "tfidf_vectorizer.pkl": tfidf,
    "pca_model.pkl": pca,
    "scaler.pkl": scaler,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("Model and vectorizer saved successfully!")


KeyboardInterrupt: 