## Resume Screening Algorithm Using logistic regression

#### 1. Data Loading and Preprocessing

In [1]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

#  load data
data = pd.read_csv('Resume.csv')

# preprocess the text function
def preprocess_text(text):
  stop_words = set(stopwords.words('english'))
  lemmatizer = WordNetLemmatizer()
  text = text.lower()
  text = text.translate(str.maketrans('', '', string.punctuation))
  words = text.split()
  word =  [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
  return ' '.join(words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mistr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\mistr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Apply preprocessing to the resume column
data['Resume'] = data['Resume'].apply(preprocess_text)

#### 2. Feature Extraction and Model Training

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

# use TF-IDF to convert text data to numerical features
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['Resume'])
y = data['Category']

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate the Model
y_pred = model.predict(X_test)
accuracy  = accuracy_score(y_test,y_pred)
conf_matrix = confusion_matrix(y_test,y_pred)
class_report = classification_report(y_test,y_pred)

print(f'Accuracy : {accuracy}')
print('Confusion Matrix : ')
print(conf_matrix)
print('classification_report : ')
print(class_report)

Accuracy : 0.9948186528497409
Confusion Matrix : 
[[ 3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  0  5  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  0  0  7  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  0  0  0  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  0  0  0  0  9  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  0  0  0  0  0  5  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  0  0  0  0  0  0  8  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  0  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0
   0]
 [ 0  0  0  0  0  0  0  0  0  5  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  0  0  0  0  0  0  0  0  0  7  0  0  0  0  0  0  0  0  0  0  0  0  0
   0]
 [ 0  0  0  0  0  0  0  0  0  0  0  6  0  0  0  0  0  0  0  0  0  0  0

#### 3. Predicting Matching Percentage

In [4]:
# pip install PyPDF2

In [5]:
from PyPDF2 import PdfReader
import numpy as np

# Extract text from PDF
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ''
    for page in reader.pages:
        text += page.extract_text()
    return preprocess_text(text)

# Prompt for job description and resume file
def get_user_input():
    job_description = input("Please enter the job description: ").strip()
    if not job_description:
        print("Job description cannot be empty!")
        return None, None

    resume_pdf_path = input("Please enter the path to the resume PDF file: ").strip()
    return job_description, resume_pdf_path

# Calculate cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

def match_resume(job_description, resume_pdf_path):
    job_description = preprocess_text(job_description)
    resume_text = extract_text_from_pdf(resume_pdf_path)
    job_features = tfidf.transform([job_description])
    resume_features = tfidf.transform([resume_text])

    # Calculate cosine similarity
    similarity = cosine_similarity(job_features, resume_features)

    # Return as a percentage
    return similarity[0][0] * 100

# Example usage
job_description, resume_pdf_path = get_user_input()
if job_description and resume_pdf_path:
    match_percentage = match_resume(job_description, resume_pdf_path)
    print(f'Match Percentage: {match_percentage}%')


Match Percentage: 31.989833141031582%
