#### Updated Algorithm

#### Resume Screening Algorithm Using Logistic Regression

## 1. Preprocess the Text

In [None]:
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK corpora used for stop-word removal and lemmatization
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Load dataset (path is resolved relative to the current working directory)
resume_csv_path = 'Resume.csv'
data = pd.read_csv(resume_csv_path)

# Preprocess text function
from functools import lru_cache


@lru_cache(maxsize=1)
def _text_resources():
    """Build the English stop-word set and the lemmatizer once and reuse them on later calls."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized, non-stop-word tokens.

    Steps: lowercase, delete ASCII punctuation, split on whitespace, drop
    English stop words, lemmatize each remaining token.

    Parameters
    ----------
    text : str or None
        Raw text. Non-string values (e.g. ``None`` or a float ``NaN`` from a
        missing cell) and blank strings are treated as empty.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``''`` when there is nothing to clean.
    """
    # Non-strings have no .lower(); return early instead of raising AttributeError.
    if not isinstance(text, str) or not text.strip():
        return ''
    stop_words, lemmatizer = _text_resources()
    # Punctuation is deleted (not replaced by a space), so "state-of-the-art"
    # collapses into the single token "stateoftheart".
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words)

# Apply preprocessing to the resume column.
# Missing resumes are loaded by pandas as NaN (a float), which has no string
# methods, so they are replaced with '' before the text cleaning runs.
data['Resume'] = data['Resume'].fillna('').apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2. TF-IDF Vectorization


In [None]:
# Turn the cleaned resume text into TF-IDF features.
# `tfidf` stays fitted on this corpus and is reused later to vectorize new text;
# `X` holds the feature matrix for the resumes themselves.
resume_corpus = data['Resume']
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(resume_corpus)


## 3. Calculate Similarity Scores

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate match percentage
def calculate_similarity(job_description, resume_text):
    """Return the cosine similarity of two texts, scaled by 100.

    Each text is vectorized on its own with the already fitted global
    ``tfidf`` vectorizer, so both are expected to be preprocessed the same
    way as the corpus it was fitted on.

    Parameters
    ----------
    job_description : str
        Text of the job description.
    resume_text : str
        Text of the resume.

    Returns
    -------
    float
        Cosine similarity of the two TF-IDF vectors multiplied by 100.
    """
    job_vec, resume_vec = (tfidf.transform([doc]) for doc in (job_description, resume_text))
    # One row against one row gives a 1x1 matrix; pull out its only entry.
    score_matrix = cosine_similarity(job_vec, resume_vec)
    return score_matrix[0][0] * 100


## 4. Create Labeled Data for Training

In [None]:
# Limit processing to the first MAX_RESUMES resumes. The labeling step scores
# every resume in the subset against every resume in the subset, so its cost
# grows with the square of this number.
MAX_RESUMES = 100
data_subset = data.head(MAX_RESUMES)

In [None]:
# Create labeled dataset for training suitability model.
# NOTE(review): no real job descriptions or human labels are involved here.
# Every resume in the subset also plays the role of a "job description", and
# the label is simply the similarity score compared against `threshold`.
rows = []

# Set a threshold for labeling: a pair is "suitable" at or above this match percentage
threshold = 30

# Generate labels based on similarity scores
if len(data_subset) > 0:
    # Vectorize each resume once and score all pairs in a single call rather
    # than transforming two texts again for every (resume, job) pair.
    subset_features = tfidf.transform(data_subset['Resume'])
    match_matrix = cosine_similarity(subset_features) * 100

    # Row-major order: all pairs for resume 0 first, then resume 1, and so on.
    # Self-pairs (a resume against itself) are included.
    rows = [
        {'match_percentage': match_percentage,
         'suitable': 1 if match_percentage >= threshold else 0}
        for match_percentage in match_matrix.ravel()
    ]

## 5. Train the Logistic Regression Model

In [None]:
# Imported in this cell so that training does not depend on an import made
# elsewhere in the session (the name is otherwise undefined on a fresh kernel).
from sklearn.linear_model import LogisticRegression

# Convert list of rows to DataFrame
labeled_data = pd.DataFrame(rows)

# Check if labeled_data is created successfully
if labeled_data.empty:
    print("Labeled data is empty. Please check the similarity calculation.")
else:
    print("Labeled data created successfully.")
    print(labeled_data.head(30))

    # Train logistic regression model for suitability prediction.
    # Single feature: the match percentage; target: the 0/1 `suitable` label.
    X_suitability = labeled_data[['match_percentage']]
    y_suitability = labeled_data['suitable']

    suitability_model = LogisticRegression()
    suitability_model.fit(X_suitability, y_suitability)

    print("Model trained successfully.")


Labeled data created successfully.
    match_percentage  suitable
0         100.000000         1
1           8.832553         0
2          11.997773         0
3          20.150485         0
4          10.528153         0
5           9.561241         0
6          21.994874         0
7          21.267344         0
8          16.767357         0
9          25.742033         0
10        100.000000         1
11          8.832553         0
12         11.997773         0
13         20.150485         0
14         10.528153         0
15          9.561241         0
16         21.994874         0
17         21.267344         0
18         16.767357         0
19         25.742033         0
20        100.000000         1
21          8.832553         0
22         11.997773         0
23         20.150485         0
24         10.528153         0
25          9.561241         0
26         21.994874         0
27         21.267344         0
28         16.767357         0
29         25.742033         0
Mode

## 6. Predict Suitability

In [None]:
# Function to extract text from PDF
from PyPDF2 import PdfReader

def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF and return it preprocessed.

    Parameters
    ----------
    file_path : str
        Location of the PDF; passed unchanged to ``PdfReader``.

    Returns
    -------
    str
        Result of ``preprocess_text`` applied to the combined page texts.
    """
    reader = PdfReader(file_path)
    # Defensive `or ''`: a page that yields no text must not break the join.
    page_texts = [page.extract_text() or '' for page in reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return preprocess_text('\n'.join(page_texts))

# Prompt for job description and resume file
def get_user_input():
    """Prompt on standard input for a job description and a resume PDF path.

    Returns
    -------
    tuple
        ``(job_description, resume_pdf_path)`` with surrounding whitespace
        stripped, or ``(None, None)`` if either answer is blank.
    """
    job_description = input("Please enter the job description: ").strip()
    if not job_description:
        print("Job description cannot be empty!")
        return None, None

    resume_pdf_path = input("Please enter the path to the resume PDF file: ").strip()
    # Validate the path the same way as the description instead of returning ''.
    if not resume_pdf_path:
        print("Resume PDF path cannot be empty!")
        return None, None

    return job_description, resume_pdf_path

# Example usage: score one resume PDF against one job description typed by the user
job_description, resume_pdf_path = get_user_input()
if job_description and resume_pdf_path:
    # Clean the job description the same way the resume text is cleaned
    job_description = preprocess_text(job_description)
    resume_text = extract_text_from_pdf(resume_pdf_path)
    match_percentage = calculate_similarity(job_description, resume_text)
    print(f'Match Percentage: {match_percentage}%')

    # Predict suitability based on match percentage.
    # The model was fitted on a DataFrame with a 'match_percentage' column, so
    # the input is given the same column name; a bare list would make
    # scikit-learn warn about missing feature names.
    suitability_features = pd.DataFrame({'match_percentage': [match_percentage]})
    suitability_prediction = suitability_model.predict(suitability_features)
    suitability = 'Yes' if suitability_prediction[0] == 1 else 'No'

    print(f'Is the candidate suitable for the job? {suitability}')


Please enter the job description: We are looking for passionate engineers and researchers that want to contribute in this exciting and fast moving field of Deep Learning and Research.  Our client is a highly awarded AI and Machine Learning lab, which is disrupting the multi billion dollar Agriculture and commodities business globally. They are recognized as a de-facto business for expert AI capability in solutions that satisfy real world challenges in near real time.  As the Lead Engineer - Deep Learning, you will be responsible for leading research, software implementation for new concept prototypes in the areas of computer vision and deep learning.  What you will do: • Focusing on developing new concepts and user experiences through rapid prototyping and collaboration with the best-in-class research and development team. • Reading research papers and implementing state-of-the-art techniques for computer vision • Building and managing datasets. • Providing Rapid experimentation, analy

