In [246]:
!pip install PyMuPDF



In [247]:
import os
import fitz
import joblib
import pandas as pd
from PyPDF2 import PdfReader
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix






In [248]:
# Local absolute paths
resume_folder = r"C:\Users\panit\resume_screening_system\data\resumes"
output_csv_path = r"C:\Users\panit\resume_screening_system\data\parsed_resumes.csv"

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page in doc:
            text += page.get_text() or ""
    except Exception as e:
        print(f"⚠️ Error reading {pdf_path}: {e}")
    return text.strip()  # clean whitespace

# Parse all resumes in the folder
parsed_data = []
for filename in os.listdir(resume_folder):
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(resume_folder, filename)
        text = extract_text_from_pdf(file_path)
        parsed_data.append({"filename": filename, "content": text})

# Save as DataFrame
df = pd.DataFrame(parsed_data)
df.to_csv(output_csv_path, index=False)

print(f"✅ Done! Parsed {len(df)} resumes → saved to {output_csv_path}")


✅ Done! Parsed 21 resumes → saved to C:\Users\panit\resume_screening_system\data\parsed_resumes.csv


In [249]:
# Load parsed resumes
resume_df = pd.read_csv(r"C:\Users\panit\resume_screening_system\data\parsed_resumes.csv")

# Load selected job description
with open(r"C:\Users\panit\resume_screening_system\data\jds\data_analyst_jd.txt", "r", encoding="utf-8") as file:
    jd_text = file.read()

# Prepare corpus: JD + all resumes
corpus = [jd_text] + resume_df["content"].fillna("").tolist()

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)

# Cosine Similarity: JD (index 0) vs all resumes
similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

# Add similarity score (as %)
resume_df["similarity_score"] = similarities
resume_df["similarity_score (%)"] = (resume_df["similarity_score"] * 100).round(2)

# Sort and display top 5 matches
resume_df_sorted = resume_df.sort_values(by="similarity_score", ascending=False)
top_matches = resume_df_sorted[["filename", "similarity_score (%)"]].head(5)

print("🎯 Top matching resumes for Data Analyst JD:\n")
print(top_matches)


🎯 Top matching resumes for Data Analyst JD:

       filename  similarity_score (%)
19  Resume8.pdf                 20.84
12  Resume2.pdf                 18.86
1   Resume1.pdf                 17.75
18  Resume7.pdf                 17.54
20  Resume9.pdf                 17.00


In [250]:
# Step: Skill-Based Scoring

# Define required skills
required_skills = ["SQL", "Python", "Power BI", "Tableau", "Excel", "Statistics", "ETL", "KPI"]

# Convert resume content to lowercase for case-insensitive matching
resume_df["content_lower"] = resume_df["content"].fillna("").str.lower()

# Count number of matched skills in each resume
resume_df["skill_match_score"] = resume_df["content_lower"].apply(
    lambda text: sum(skill.lower() in text for skill in required_skills)
)

# Optional: Drop the helper column (not needed for further steps)
resume_df.drop(columns=["content_lower"], inplace=True)

# Preview the result
print("✅ Skill-based scoring completed:")
print(resume_df[["filename", "skill_match_score"]].head())


✅ Skill-based scoring completed:
         filename  skill_match_score
0  new_resume.pdf                  1
1     Resume1.pdf                  5
2    Resume10.pdf                  2
3    Resume11.pdf                  2
4    Resume12.pdf                  2


In [251]:
# Step: Combine Similarity + Skill Match into Final Score

# Define skill-matching function (if not already defined)
def count_matching_skills(text, skills):
    text = text.lower() if isinstance(text, str) else ""
    return sum(skill.lower() in text for skill in skills)

# Apply skill match count
resume_df["skill_match_score"] = resume_df["content"].apply(
    lambda x: count_matching_skills(x, required_skills)
)

# Normalize skill score
max_skill_score = resume_df["skill_match_score"].max()
resume_df["skill_match_score_norm"] = resume_df["skill_match_score"] / max_skill_score

# Compute final score (weighted combination)
resume_df["final_score"] = (
    0.7 * resume_df["similarity_score"] +
    0.3 * resume_df["skill_match_score_norm"]
)

# Also store percentage versions for display
resume_df["similarity_score (%)"] = (resume_df["similarity_score"] * 100).round(2)
resume_df["final_score (%)"] = (resume_df["final_score"] * 100).round(2)

# Display top 5
top5 = resume_df.sort_values(by="final_score", ascending=False)[[
    "filename", "similarity_score (%)", "skill_match_score", "final_score (%)"
]].head(5)

print("🎯 Top 5 Resumes by Final Score:")
print(top5)


🎯 Top 5 Resumes by Final Score:
       filename  similarity_score (%)  skill_match_score  final_score (%)
12  Resume2.pdf                 18.86                  6            43.20
19  Resume8.pdf                 20.84                  5            39.59
14  Resume3.pdf                 12.89                  6            39.02
1   Resume1.pdf                 17.75                  5            37.42
16  Resume5.pdf                 13.98                  5            34.79


In [252]:
# Load parsed resumes
resume_df = pd.read_csv("C:/Users/panit/resume_screening_system/data/parsed_resumes.csv")

# Define roles and corresponding JD paths
job_roles = {
    "Data Analyst Score (%)": "C:/Users/panit/resume_screening_system/data/jds/data_analyst_jd.txt",
    "BI Analyst Score (%)": "C:/Users/panit/resume_screening_system/data/jds/bi_analyst_jd.txt",
    "Developer Score (%)": "C:/Users/panit/resume_screening_system/data/jds/software_developer_jd.txt",
    "Project Manager Score (%)": "C:/Users/panit/resume_screening_system/data/jds/project_manager_jd.txt"
}

# Initialize results DataFrame
similarity_table = pd.DataFrame()
similarity_table["filename"] = resume_df["filename"]

# Loop over roles to calculate TF-IDF similarity
for role, jd_path in job_roles.items():
    with open(jd_path, "r", encoding="utf-8") as file:
        jd_text = file.read()

    corpus = [jd_text] + resume_df["content"].fillna("").tolist()
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(corpus)
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    
    # Convert to percentage
    similarity_table[role] = (similarities * 100).round(2)

# Identify best match role
similarity_table["Best Match Role"] = similarity_table[job_roles.keys()].idxmax(axis=1)
similarity_table["Best Match Score (%)"] = similarity_table[job_roles.keys()].max(axis=1)

# View sample
similarity_table.sort_values(by="Best Match Score (%)", ascending=False).head(10)


Unnamed: 0,filename,Data Analyst Score (%),BI Analyst Score (%),Developer Score (%),Project Manager Score (%),Best Match Role,Best Match Score (%)
19,Resume8.pdf,20.84,26.64,4.08,6.07,BI Analyst Score (%),26.64
20,Resume9.pdf,17.0,22.94,3.94,2.13,BI Analyst Score (%),22.94
1,Resume1.pdf,17.75,20.8,2.93,3.79,BI Analyst Score (%),20.8
9,Resume17.pdf,1.63,4.73,7.85,20.2,Project Manager Score (%),20.2
18,Resume7.pdf,17.54,19.71,1.98,3.92,BI Analyst Score (%),19.71
2,Resume10.pdf,12.82,19.39,2.05,2.52,BI Analyst Score (%),19.39
12,Resume2.pdf,18.86,17.84,5.04,5.36,Data Analyst Score (%),18.86
17,Resume6.pdf,13.19,17.93,3.09,5.86,BI Analyst Score (%),17.93
13,Resume20.pdf,3.31,3.8,1.25,17.17,Project Manager Score (%),17.17
6,Resume14.pdf,8.28,7.65,15.49,5.68,Developer Score (%),15.49


In [253]:
# Make sure the file is not open in Excel before running this
similarity_table.to_csv("C:/Users/panit/resume_screening_system/data/role_match_results.csv", index=False)
print("✅ Saved to: role_match_results.csv")


✅ Saved to: role_match_results.csv


In [254]:
# Load role match results
similarity_df = pd.read_csv("C:/Users/panit/resume_screening_system/data/role_match_results.csv")

# Load parsed resumes to get 'content' field
parsed_df = pd.read_csv("C:/Users/panit/resume_screening_system/data/parsed_resumes.csv")

# Merge on filename
resume_df = pd.merge(similarity_df, parsed_df, on="filename", how="left")

# Required skills
required_skills = ["SQL", "Python", "Power BI", "Tableau", "Excel", "Statistics", "ETL", "KPI"]

# Get similarity from best match column
resume_df["similarity_score (%)"] = resume_df.apply(
    lambda row: row[row["Best Match Role"]], axis=1
)
resume_df["similarity_score"] = resume_df["similarity_score (%)"] / 100

# Count matching skills
def count_matching_skills(text, skill_list):
    text_lower = str(text).lower()
    return sum(skill.lower() in text_lower for skill in skill_list)

resume_df["skill_match_score"] = resume_df["content"].apply(
    lambda x: count_matching_skills(x, required_skills)
)

# Normalize skill score
max_skill_score = resume_df["skill_match_score"].max()
resume_df["skill_match_score_norm"] = resume_df["skill_match_score"] / max_skill_score

# Final score
resume_df["final_score"] = (
    0.7 * resume_df["similarity_score"] +
    0.3 * resume_df["skill_match_score_norm"]
)
resume_df["final_score (%)"] = (resume_df["final_score"] * 100).round(2)

# Save final prediction
resume_df.to_csv("C:/Users/panit/resume_screening_system/data/final_predictions.csv", index=False)

# Show top 5
resume_df[[
    "filename", "Best Match Role", "similarity_score (%)", "skill_match_score", "final_score (%)"
]].sort_values(by="final_score (%)", ascending=False).head(5)


Unnamed: 0,filename,Best Match Role,similarity_score (%),skill_match_score,final_score (%)
19,Resume8.pdf,BI Analyst Score (%),26.64,5,43.65
12,Resume2.pdf,Data Analyst Score (%),18.86,6,43.2
1,Resume1.pdf,BI Analyst Score (%),20.8,5,39.56
14,Resume3.pdf,Data Analyst Score (%),12.89,6,39.02
16,Resume5.pdf,Data Analyst Score (%),13.98,5,34.79


In [255]:
# Load parsed resume data (which includes 'filename' and 'content')
df = pd.read_csv(r"C:/Users/panit/resume_screening_system/data/parsed_resumes.csv")
print("✅ Columns available:", df.columns.tolist())

# Extract corpus from correct column
corpus = df["content"].fillna("").astype(str)

# Create and fit TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Save the vectorizer model
joblib.dump(tfidf_vectorizer, r"C:/Users/panit/resume_screening_system/models/tfidf_vectorizer.pkl")

print("✅ TF-IDF vectorizer trained and saved successfully.")


✅ Columns available: ['filename', 'content']
✅ TF-IDF vectorizer trained and saved successfully.


In [256]:
# Save the trained Random Forest model
model_path = "C:/Users/panit/resume_screening_system/models/resume_classifier.pkl"
joblib.dump(rf_model, model_path)

print(f"✅ Random Forest model saved successfully to: {model_path}")


✅ Random Forest model saved successfully to: C:/Users/panit/resume_screening_system/models/resume_classifier.pkl


In [257]:
# ✅ Generate Shortlisting Labels Based on Rule
# Use percentage-based thresholds for consistency
resume_df["shortlisted"] = (
    (resume_df["final_score (%)"] >= 20) & 
    (resume_df["skill_match_score"] >= 3)
).astype(int)

# ✅ Check class balance
print("📊 Shortlisted Class Distribution:")
print(resume_df["shortlisted"].value_counts())

# ✅ Select features and target
features = resume_df[["similarity_score (%)", "skill_match_score", "final_score (%)"]]
target = resume_df["shortlisted"]

# ✅ Train-Test Split (stratify ensures class balance in train/test sets)
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

# ✅ Train models only if at least 2 classes are present
if len(y_train.unique()) > 1:
    print("\n✅ Training Logistic Regression and Random Forest Models...")

    # Logistic Regression
    lr_model = LogisticRegression()
    lr_model.fit(X_train, y_train)

    # Random Forest
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train, y_train)

    # ✅ Step 6: Evaluate Performance
    print("\n📈 Logistic Regression Results:")
    print(confusion_matrix(y_test, lr_model.predict(X_test)))
    print(classification_report(y_test, lr_model.predict(X_test)))

    print("\n🌲 Random Forest Results:")
    print(confusion_matrix(y_test, rf_model.predict(X_test)))
    print(classification_report(y_test, rf_model.predict(X_test)))

else:
    print("\n⚠️ Cannot train models: Only one class present in y_train.")


📊 Shortlisted Class Distribution:
shortlisted
0    12
1     9
Name: count, dtype: int64

✅ Training Logistic Regression and Random Forest Models...

📈 Logistic Regression Results:
[[3 0]
 [0 2]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         2

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5


🌲 Random Forest Results:
[[3 0]
 [0 2]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         2

    accuracy                           1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5



In [258]:
# ✅ Add predictions to the DataFrame
resume_df["predicted_shortlist"] = rf_model.predict(features)

# ✅ Map binary values to labels for clarity
resume_df["shortlisted_label"] = resume_df["shortlisted"].map({0: "No", 1: "Yes"})
resume_df["predicted_label"] = resume_df["predicted_shortlist"].map({0: "No", 1: "Yes"})

# ✅ Display preview
print(resume_df[[
    "filename", "Best Match Role", "similarity_score (%)", 
    "skill_match_score", "final_score (%)", 
    "shortlisted_label", "predicted_label"
]].sort_values(by="final_score (%)", ascending=False).head(10))

        filename            Best Match Role  similarity_score (%)  \
19   Resume8.pdf       BI Analyst Score (%)                 26.64   
12   Resume2.pdf     Data Analyst Score (%)                 18.86   
1    Resume1.pdf       BI Analyst Score (%)                 20.80   
14   Resume3.pdf     Data Analyst Score (%)                 12.89   
16   Resume5.pdf     Data Analyst Score (%)                 13.98   
17   Resume6.pdf       BI Analyst Score (%)                 17.93   
8   Resume16.pdf  Project Manager Score (%)                  7.68   
9   Resume17.pdf  Project Manager Score (%)                 20.20   
18   Resume7.pdf       BI Analyst Score (%)                 19.71   
2   Resume10.pdf       BI Analyst Score (%)                 19.39   

    skill_match_score  final_score (%) shortlisted_label predicted_label  
19                  5            43.65               Yes             Yes  
12                  6            43.20               Yes             Yes  
1              

In [259]:
# Map 0 → "No", 1 → "Yes" for better readability
resume_df["shortlisted_label"] = resume_df["shortlisted"].map({0: "No", 1: "Yes"})
resume_df["predicted_label"] = resume_df["predicted_shortlist"].map({0: "No", 1: "Yes"})

# Display results (use percentage columns)
print(resume_df[[
    "filename",
    "similarity_score (%)",
    "skill_match_score",
    "final_score (%)",
    "shortlisted_label",
    "predicted_label"
]])


          filename  similarity_score (%)  skill_match_score  final_score (%)  \
0   new_resume.pdf                  9.53                  1            11.67   
1      Resume1.pdf                 20.80                  5            39.56   
2     Resume10.pdf                 19.39                  2            23.57   
3     Resume11.pdf                 12.72                  2            18.90   
4     Resume12.pdf                 12.85                  2            19.00   
5     Resume13.pdf                  9.49                  2            16.64   
6     Resume14.pdf                 15.49                  2            20.84   
7     Resume15.pdf                  9.99                  0             6.99   
8     Resume16.pdf                  7.68                  5            30.38   
9     Resume17.pdf                 20.20                  3            29.14   
10    Resume18.pdf                 10.59                  0             7.41   
11    Resume19.pdf                 13.58

In [260]:
resume_df.to_csv("C:/Users/panit/resume_screening_system/data/final_predictions.csv", index=False)

In [261]:
# Compute accuracy using numeric columns (not string labels)
accuracy = accuracy_score(resume_df["shortlisted"], resume_df["predicted_shortlist"])
print(f"✅ Overall Accuracy: {accuracy:.2f}")


✅ Overall Accuracy: 1.00


In [262]:
# Find mismatched predictions
mismatches = resume_df[resume_df["shortlisted_label"] != resume_df["predicted_label"]]

# Display mismatches
print("🔍 Mismatched Predictions:")
print(mismatches[["filename", "shortlisted_label", "predicted_label"]])

🔍 Mismatched Predictions:
Empty DataFrame
Columns: [filename, shortlisted_label, predicted_label]
Index: []


In [263]:
# Load saved TF-IDF vectorizer ===
tfidf = joblib.load("C:/Users/panit/resume_screening_system/models/tfidf_vectorizer.pkl")

# Define skill list ===
required_skills = ["SQL", "Python", "Power BI", "Excel", "ETL", "KPI"]

# Function to extract text from a PDF ===
def extract_text(path):
    text = ""
    reader = PdfReader(path)
    for page in reader.pages:
        text += page.extract_text() or ""
    return text

# Directory containing resumes ===
DATA_DIR = "C:/Users/panit/resume_screening_system/data/resumes"

# Loop through all PDFs and compute scores ===
print("📂 Scanning Resumes...\n")
results = []

for filename in os.listdir(DATA_DIR):
    if filename.endswith(".pdf"):
        full_path = os.path.join(DATA_DIR, filename)
        text = extract_text(full_path).lower()

        # Vectorize and calculate scores
        vector = tfidf.transform([text])
        similarity_score = np.mean(vector.toarray())
        skill_score = sum(skill.lower() in text for skill in required_skills)
        skill_norm = skill_score / len(required_skills)
        final_score = 0.7 * similarity_score + 0.3 * skill_norm

        # ✅ RULE-BASED SHORTLISTING LOGIC
        shortlisted = "✅ Yes" if (final_score * 100 >= 20 and skill_score >= 3) else "❌ No"

        # Store results
        results.append({
            "filename": filename,
            "similarity_score (%)": round(similarity_score * 100, 2),
            "skill_match_score": skill_score,
            "final_score (%)": round(final_score * 100, 2),
            "shortlisted": shortlisted
        })

# Create DataFrame and display ===
results_df = pd.DataFrame(results)

print("{:<25} {:<20} {:<18} {:<18} {:<12}".format(
    "Filename", "Similarity (%)", "Skill Match", "Final Score (%)", "Shortlisted?"
))
print("-" * 100)
for row in results:
    print("{:<25} {:<20} {:<18} {:<18} {}".format(
        row["filename"], 
        row["similarity_score (%)"],
        row["skill_match_score"],
        row["final_score (%)"],
        row["shortlisted"]
    ))

# Save to CSV ===
output_path = "C:/Users/panit/resume_screening_system/data/new_resume_predictions.csv"
results_df.to_csv(output_path, index=False)
print(f"\n✅ Saved predictions to: {output_path}")


📂 Scanning Resumes...

Filename                  Similarity (%)       Skill Match        Final Score (%)    Shortlisted?
----------------------------------------------------------------------------------------------------
new_resume.pdf            0.84                 1                  5.59               ❌ No
Resume1.pdf               0.94                 4                  20.66              ✅ Yes
Resume10.pdf              0.9                  2                  10.63              ❌ No
Resume11.pdf              1.03                 2                  10.72              ❌ No
Resume12.pdf              1.06                 2                  10.74              ❌ No
Resume13.pdf              0.98                 2                  10.69              ❌ No
Resume14.pdf              1.09                 2                  10.76              ❌ No
Resume15.pdf              1.01                 0                  0.71               ❌ No
Resume16.pdf              1.03                 3         