In [1]:
import numpy as np
import re
import spacy
import torch
from pdfminer.high_level import extract_text
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import gradio as gr
import pandas as pd
import tempfile
import matplotlib.pyplot as plt

In [5]:
# Load NLP models
# These module-level objects are shared by the helper functions defined in the
# following cells, so this cell must run before any of them is called.
# spaCy English pipeline: clean_resume uses it for tokenisation, lemmas and
# its default stop-word list.
nlp = spacy.load('en_core_web_sm')
# BERT tokenizer and encoder: used together by get_bert_embedding.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [6]:
# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text content of the PDF at ``pdf_path``.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The extracted text (guaranteed to contain non-whitespace characters).

    Raises
    ------
    ValueError
        If the file cannot be parsed or yields no text. The message keeps the
        "Error reading PDF: ..." wording; the underlying exception, if any, is
        chained as ``__cause__``.
    """
    try:
        text = extract_text(pdf_path)
    except Exception as e:
        # Surface the failure instead of returning the message as if it were
        # resume text, which downstream code would clean, embed and rank.
        raise ValueError(f"Error reading PDF: {str(e)}") from e
    if not text.strip():
        raise ValueError("Error reading PDF: Empty or unreadable PDF.")
    return text

In [7]:
# Clean Resume
def clean_resume(text):
    """Normalise raw resume text into a space-separated string of lemmas.

    Runs of non-word characters are collapsed to a single space, the result is
    lower-cased and parsed with the module-level spaCy pipeline ``nlp``, and
    every token whose text is in the pipeline's default stop-word set is
    dropped. The lemmas of the remaining tokens are joined with spaces.
    """
    stop_words = nlp.Defaults.stop_words
    normalized = re.sub(r'\W+', ' ', text).lower()

    lemmas = []
    for token in nlp(normalized):
        if token.text in stop_words:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [8]:
# Get BERT Embeddings (Batch Processing)
def get_bert_embedding(texts):
    """Embed a list of texts with the module-level BERT ``tokenizer``/``model``.

    All texts are tokenised together as one padded batch, truncated to at most
    512 tokens each, and run through the model with gradient tracking disabled.

    Returns a NumPy array with one row per input text, taken from the last
    hidden layer at sequence position 0 (the leading special token).
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    first_token_vectors = hidden_states[:, 0, :]
    return first_token_vectors.numpy()

In [9]:
# Rank Resumes using multiple models
def _report_classifier_accuracy(final_scores, threshold=0.5):
    """Diagnostic only: print hold-out accuracy of several classifiers.

    Labels are derived from the scores themselves (1 when ``score >= threshold``),
    so this never influences the ranking. It silently does nothing when the
    data cannot support a train/test evaluation (a single class overall, or a
    training split that ends up with a single class), and a classifier that
    fails to fit is reported and skipped rather than aborting the ranking.
    """
    X = final_scores.reshape(-1, 1)
    y = [1 if score >= threshold else 0 for score in final_scores]
    if len(set(y)) < 2:  # need both classes to train anything
        return

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    if len(set(y_train)) < 2:
        # With only a handful of resumes the split can leave one class in the
        # training part; fitting would raise, so skip the diagnostics.
        return

    candidates = {
        "Logistic Regression": LogisticRegression(),
        "SVM": SVC(probability=True),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
        "Neural Network": MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
    }
    for name, classifier in candidates.items():
        try:
            classifier.fit(X_train, y_train)
            acc = accuracy_score(y_test, classifier.predict(X_test))
        except ValueError as exc:
            print(f"{name} skipped: {exc}")
            continue
        print(f"{name} Accuracy: {acc:.2f}")


def rank_resumes(job_desc_embedding, resume_embeddings, tfidf_matrix, tfidf_vectorizer, job_desc_text):
    """Score every resume against the job description and rank them.

    The final score is ``0.7 * cosine(BERT embeddings) + 0.3 * normalised TF-IDF
    overlap``, where the TF-IDF overlap is the dot product between each resume
    row of ``tfidf_matrix`` and the job description transformed by the same
    fitted ``tfidf_vectorizer``, divided by the largest overlap.

    Parameters
    ----------
    job_desc_embedding : array of shape (1, dim)
    resume_embeddings : array of shape (n_resumes, dim)
    tfidf_matrix : sparse matrix with one row per resume
    tfidf_vectorizer : the fitted vectorizer that produced ``tfidf_matrix``
    job_desc_text : str

    Returns
    -------
    (ranked_indices, ranked_scores)
        ``ranked_indices`` are resume indices from best to worst;
        ``ranked_scores[k]`` is the score of resume ``ranked_indices[k]``
        (i.e. the scores are already in rank order).
    """
    similarity_scores = cosine_similarity(job_desc_embedding, resume_embeddings).flatten()

    job_desc_tfidf = tfidf_vectorizer.transform([job_desc_text])
    tfidf_scores = (tfidf_matrix @ job_desc_tfidf.T).toarray().flatten()

    # Normalise by the best overlap. If the job description shares no term with
    # any resume the maximum is 0 (TF-IDF weights are non-negative, so every
    # entry is already 0); dividing would turn all scores into NaN.
    peak = tfidf_scores.max() if tfidf_scores.size else 0.0
    if peak > 0:
        tfidf_scores = tfidf_scores / peak

    final_scores = (0.7 * similarity_scores) + (0.3 * tfidf_scores)

    _report_classifier_accuracy(final_scores)

    ranked_indices = np.argsort(final_scores)[::-1]
    return ranked_indices, final_scores[ranked_indices]

In [10]:
# Save results to CSV
def save_results_to_csv(ranked_results):
    """Write ``ranked_results`` to a new temporary ``.csv`` file.

    The file is created with ``delete=False`` so it outlives this call; the
    DataFrame is written without its index. Returns the path of the file.
    """
    handle = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
    try:
        csv_path = handle.name
        ranked_results.to_csv(csv_path, index=False)
    finally:
        handle.close()
    return csv_path

In [11]:
# Graphical Visualization
def plot_results(ranked_results):
    """Draw a horizontal bar chart of resume scores, best resume on top.

    Parameters
    ----------
    ranked_results : pandas.DataFrame
        Must contain a ``Resume`` column (bar labels) and a ``Score`` column.
        ``Score`` may hold numbers or numeric strings such as ``"0.8123"``.
    """
    # The caller stores scores as formatted strings; convert them so the bars
    # have numeric lengths instead of being plotted as categories.
    numeric_scores = pd.to_numeric(ranked_results['Score'])

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(ranked_results['Resume'], numeric_scores, color='skyblue')
    ax.set_xlabel("Score")
    ax.set_ylabel("Resumes")
    ax.set_title("Resume Ranking Scores")
    ax.invert_yaxis()  # first (best-ranked) row at the top
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


In [12]:
# Process Resumes
def process_resumes(resume_files, job_desc_text):
    """Rank uploaded resumes against a job description.

    Each resume is read from PDF, cleaned, embedded with BERT and vectorised
    with TF-IDF, then ranked by ``rank_resumes``. The ranking is saved to a
    temporary CSV file and plotted.

    Parameters
    ----------
    resume_files : sequence or None
        Uploaded resumes. Each item is either an object exposing the file path
        as ``.name`` or the path itself.
    job_desc_text : str
        The job description to rank against.

    Returns
    -------
    (str, str or None)
        On success: the ranking table rendered as text and the CSV file path.
        On failure: an error message and ``None``, so both outcomes have the
        same two-value shape.
    """
    if not resume_files:  # nothing uploaded (None or empty list)
        return "No valid resumes found.", None

    cleaned_resumes = []
    resume_names = []

    for resume_file in resume_files:
        resume_path = str(getattr(resume_file, "name", resume_file))
        try:
            pdf_text = extract_text_from_pdf(resume_path)
            clean_text = clean_resume(pdf_text)
        except Exception as e:
            return f"Error processing {resume_path}: {str(e)}", None
        cleaned_resumes.append(clean_text)
        resume_names.append(resume_path)

    resume_embeddings = get_bert_embedding(cleaned_resumes)
    job_desc_embedding = get_bert_embedding([job_desc_text])

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_resumes)

    ranked_indices, scores = rank_resumes(job_desc_embedding, resume_embeddings, tfidf_matrix, tfidf_vectorizer, job_desc_text)

    ranked_results = pd.DataFrame({
        "Rank": range(1, len(ranked_indices) + 1),
        # Keep only the file name; split on either separator so Windows paths
        # are shortened too.
        "Resume": [re.split(r'[\\/]', resume_names[i])[-1] for i in ranked_indices],
        # `scores` is already in rank order (scores[k] belongs to
        # ranked_indices[k]), so it must not be indexed by resume index again.
        "Score": [f"{score:.4f}" for score in scores]
    })

    csv_file_path = save_results_to_csv(ranked_results)
    plot_results(ranked_results)
    return ranked_results.to_string(index=False), csv_file_path

In [13]:
# Interface for Gradio
def interface(job_desc_text, resume_files):
    """Callback adapter for the Gradio UI.

    Takes the arguments as (job description, uploaded resumes) and forwards
    them to ``process_resumes``, which expects them in the opposite order.
    Returns whatever ``process_resumes`` returns.
    """
    ranking_output = process_resumes(resume_files, job_desc_text)
    return ranking_output

# Gradio UI
# Wrap `interface` in a web form and start the server. The two inputs are
# passed to `interface` in the order listed (job description text first, then
# the uploaded files), and the two values it returns fill the text and file
# outputs in the same order.
gr.Interface(
    fn=interface,
    inputs=[
        gr.Textbox(label="Job Description", placeholder="Enter the job description..."),
        # file_count="multiple" lets the user upload several resumes at once.
        gr.File(label="Upload Resumes", file_count="multiple")
    ],
    outputs=["text", "file"],
    title="AI-Powered Resume Ranking with ML Models",
    description="Upload resumes and enter a job description to rank the resumes based on relevance using ML models.",
    allow_flagging="never"
).launch()




* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


