Aim:
To create a resume-checking tool that extracts, preprocesses, and standardizes text from resumes in multiple formats (.docx, .pdf, .txt), enabling the identification of key information such as skills, experience, and qualifications for recruitment purposes.

Inference:
The project aims to automate the extraction and normalization of information from resumes to enable key insights, possibly for skills identification, experience analysis, or matching candidates to job roles based on text features in resumes.

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import docx
import PyPDF2
import zipfile
import re
from docx import Document
import os

In [2]:
# Installs the python-docx package, which provides the `docx` module imported in the first cell.
# The leading "!" hands the command to the system shell, so this line only works inside a notebook.
!pip install python-docx



In [3]:
# Installs PyPDF2, which is imported in the first cell.
# NOTE(review): there is no leading "!" here — presumably this relies on the notebook's
# pip magic; confirm before running it outside a notebook.
pip install PyPDF2

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Download essential NLTK resources for text preprocessing

nltk.download('punkt')      # tokenizer models — presumably what word_tokenize relies on; TODO confirm for the installed NLTK version
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('wordnet')    # WordNet data — presumably backs WordNetLemmatizer; TODO confirm
nltk.download('omw-1.4')    # NOTE(review): Open Multilingual Wordnet — verify it is still required

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nidhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nidhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nidhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Nidhi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The raw string to tokenize.

    Returns:
        The result of ``word_tokenize(text)``, unchanged.
    """
    tokens = word_tokenize(text)
    return tokens

def to_lowercase(tokens):
    """Lower-case every token.

    Args:
        tokens: An iterable of strings.

    Returns:
        A new list holding ``token.lower()`` for each input token, in order.
    """
    lowered = [word.lower() for word in tokens]
    return lowered

#Removes common English words that don't add much meaning (like 'the', 'is').
def remove_stopwords(tokens):
    """Filter out tokens that appear in NLTK's English stop-word list.

    The comparison is an exact, case-sensitive membership test, so callers
    should normalise case first if they want case-insensitive filtering.

    Args:
        tokens: An iterable of string tokens.

    Returns:
        A new list with the stop-word tokens removed, order preserved.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept = [word for word in tokens if word not in english_stopwords]
    return kept

def remove_punctuation(tokens):
    """Drop tokens made up entirely of ASCII punctuation characters.

    A token is removed when every one of its characters is in
    ``string.punctuation``. This covers single marks (``","``) as well as
    multi-character runs such as ``"..."`` or ``"--"``. Tokens that merely
    contain punctuation (``"a.b"``) are kept, and empty tokens are dropped.

    Args:
        tokens: An iterable of string tokens.

    Returns:
        A new list with the punctuation-only tokens removed, order preserved.
    """
    # Test character by character: ``token in string.punctuation`` would be a
    # substring check against one long string and would miss runs like "...".
    punctuation_chars = set(string.punctuation)
    return [
        token
        for token in tokens
        if not all(char in punctuation_chars for char in token)
    ]

#Reduces words to their root form to improve matching (e.g., 'running' to 'run').
def apply_lemmatization(tokens):
    """Lemmatize each token with a ``WordNetLemmatizer``.

    ``lemmatize`` is called with the token only (no part-of-speech argument),
    so the lemmatizer's default part of speech applies.

    Args:
        tokens: An iterable of string tokens.

    Returns:
        A new list of lemmatized tokens, in the original order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(word) for word in tokens]

def simple_tokenize(text):
    """Extract word tokens from ``text`` using a regular expression.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore); everything else acts as a separator and is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the matched tokens in order of appearance.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text)

In [None]:
# Applies all preprocessing steps to text: tokenizing, converting to lowercase, removing stopwords/punctuation, and lemmatizing.

def preprocess(text):
    """Run the full text-normalisation pipeline on ``text``.

    The text is tokenized with ``simple_tokenize`` and the tokens are then
    passed, in order, through lower-casing, stop-word removal, punctuation
    removal and lemmatization.

    Args:
        text: The raw string to preprocess.

    Returns:
        The list of tokens produced by the last pipeline step.
    """
    tokens = simple_tokenize(text)
    pipeline = (
        to_lowercase,
        remove_stopwords,
        remove_punctuation,
        apply_lemmatization,
    )
    for step in pipeline:
        tokens = step(tokens)
    return tokens

In [7]:
def read_docx(file_path):
    """Return the text of a .docx file, one paragraph per line.

    Only the document's ``paragraphs`` collection is read; their ``text``
    values are joined with newline characters.

    Args:
        file_path: Location of the .docx file, passed straight to ``Document``.

    Returns:
        A single string containing all paragraph texts separated by ``"\\n"``.
    """
    document = Document(file_path)
    return "\n".join(para.text for para in document.paragraphs)

In [None]:
# Function to handle different file types

def read_file(file_path):
    """Read a resume file and return its text, picking the reader by extension.

    The extension check is case-insensitive, so ``Resume.DOCX`` or ``cv.Pdf``
    are dispatched exactly like their lower-case counterparts.

    Args:
        file_path: Path to the file, as a string or ``os.PathLike`` object.

    Returns:
        Whatever the matching reader (``read_docx``, ``read_txt`` or
        ``read_pdf``) returns for ``file_path``.

    Raises:
        ValueError: If the path does not end in .docx, .txt or .pdf.
    """
    # NOTE(review): read_txt and read_pdf are not defined in the part of the
    # file visible here — confirm they exist, otherwise the .txt and .pdf
    # branches fail with NameError. The if-chain is kept (rather than a
    # dispatch table) so a missing reader only affects its own branch.
    lowered_path = os.fspath(file_path).lower()

    if lowered_path.endswith('.docx'):
        return read_docx(file_path)
    if lowered_path.endswith('.txt'):
        return read_txt(file_path)
    if lowered_path.endswith('.pdf'):
        return read_pdf(file_path)
    raise ValueError("Unsupported file format. Please provide a .docx, .pdf, or .txt file.")


In [None]:
#Extracts a ZIP file to the specified directory.

def extract_zip(zip_path, extract_to):
    """Extract every member of a ZIP archive into ``extract_to``.

    The target directory (including missing parents) is created when needed;
    an already existing directory is reused as-is and is not emptied first.

    Args:
        zip_path: Path to the ZIP archive to read.
        extract_to: Directory that receives the extracted files.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        zipfile.BadZipFile: If ``zip_path`` is not a valid ZIP archive.
    """
    # exist_ok avoids the check-then-create race of testing for the directory
    # first and creating it in a separate step.
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_to)
    print(f"Extracted files to: {extract_to}")

In [None]:
#Extracts name and experience from the text.

def extract_name_experience(text):
    """Pull the candidate's name and years of experience out of resume text.

    The name is the first two alphabetic words following ``"My name is"`` or
    ``"Name:"``. Experience is the first number followed by
    ``year(s) of experience``; the match ignores case, accepts the singular
    ``year`` and tolerates a trailing ``+`` on the number (``"5+ years"``).

    Args:
        text: The full resume text.

    Returns:
        A ``(name, experience_years)`` tuple. ``name`` is
        ``"Name not found"`` and ``experience_years`` is ``0`` when the
        respective pattern does not occur.
    """
    name_match = re.search(r"(?:My name is|Name:)\s+([A-Za-z]+\s[A-Za-z]+)", text)
    name = name_match.group(1) if name_match else "Name not found"

    # Accept "1 year", "5+ years" and capitalised headings such as
    # "6 Years of Experience"; otherwise those resumes silently score as 0.
    experience_match = re.search(
        r"(\d+)\s*\+?\s+years?\s+of\s+experience",
        text,
        flags=re.IGNORECASE,
    )
    experience_years = int(experience_match.group(1)) if experience_match else 0

    return name, experience_years

In [None]:
# Counts how many of the target keywords appear in the text (each keyword counts at most once).

def count_keywords(text):
    """Count how many of the target keywords occur in ``text``.

    Each keyword contributes at most 1 to the result, however often it
    appears. Matching is case-insensitive and restricted to whole words, so
    ``"remodeling"`` does not count as ``"modeling"`` and ``"pythonic"`` does
    not count as ``"python"``. The words of a multi-word keyword may be
    separated by any run of whitespace (including a line break).

    Args:
        text: The resume text to scan.

    Returns:
        The number of distinct keywords found, between 0 and the size of the
        keyword list.
    """
    keywords = ["machine learning", "data science", "python", "deep learning", "NLP", "artificial intelligence", "modeling"]

    # Lower-case once instead of once per keyword.
    lowered_text = text.lower()

    matched = 0
    for keyword in keywords:
        words = (re.escape(word) for word in keyword.lower().split())
        pattern = r"\b" + r"\s+".join(words) + r"\b"
        if re.search(pattern, lowered_text):
            matched += 1
    return matched

In [None]:
#Calculates the score based on experience and keyword count.

def calculate_score(experience_years, keyword_count):
    """Combine experience and keyword matches into a single resume score.

    Experience contributes 70 points for 5 or more years, 50 for at least 3,
    30 for at least 1 and 10 otherwise. Every matched keyword adds 5 points,
    with the keyword part capped at 30.

    Args:
        experience_years: Years of experience extracted from the resume.
        keyword_count: Number of keywords matched in the resume.

    Returns:
        The sum of the experience points and the keyword points.
    """
    # (minimum years, points) pairs, checked from the highest tier down.
    experience_tiers = ((5, 70), (3, 50), (1, 30))
    experience_points = next(
        (points for minimum, points in experience_tiers if experience_years >= minimum),
        10,  # fallback for less than 1 year
    )

    keyword_points = min(keyword_count * 5, 30)

    return experience_points + keyword_points

In [None]:
#Extracts details and scores the resume based on experience and keywords.

def screen_resume(text):
    """Extract the key details from a resume and score it.

    Args:
        text: The full resume text.

    Returns:
        A dict with the keys ``"Name"``, ``"Experience (years)"``,
        ``"Keywords Matched"`` and ``"Score"``, in that order.
    """
    candidate_name, years = extract_name_experience(text)
    matched_keywords = count_keywords(text)

    summary = {
        "Name": candidate_name,
        "Experience (years)": years,
        "Keywords Matched": matched_keywords,
    }
    summary["Score"] = calculate_score(years, matched_keywords)
    return summary

In [None]:
#Extracts and processes resumes from a ZIP file, specifically focusing on .docx files.
#Each resume is read, processed, and scored based on experience and keyword matches.
    

def process_resumes_from_zip(zip_path, extract_to='/content/resumes'):
    """Extract a ZIP of resumes and screen every .docx file found in it.

    The archive is unpacked into ``extract_to`` and that directory is walked
    recursively. Each file whose name ends in ``.docx`` (case-insensitive) is
    read with ``read_docx`` and scored with ``screen_resume``; every other
    file is reported as skipped.

    Note that the whole of ``extract_to`` is walked, so .docx files already
    present there from an earlier run are screened again.

    Args:
        zip_path: Path to the ZIP archive containing the resumes.
        extract_to: Directory to unpack the archive into. Defaults to
            ``'/content/resumes'``, the location this function always used.

    Returns:
        A list of result dicts as returned by ``screen_resume``, each with an
        extra ``"File"`` key holding the resume's file name. The list is
        empty when no .docx file was found.
    """
    extract_zip(zip_path, extract_to)

    results = []
    # Walk the extraction directory, including sub-directories.
    for root, _dirs, files in os.walk(extract_to):
        for file_name in files:
            # Case-insensitive so that e.g. "Resume.DOCX" is not skipped.
            if not file_name.lower().endswith('.docx'):
                print(f"Skipped non-docx file: {file_name}")
                print()
                continue

            file_path = os.path.join(root, file_name)
            result = screen_resume(read_docx(file_path))
            result["File"] = file_name  # keep the file name for reference
            results.append(result)
            print(f"Processed file: {file_path}")
            print()

    if not results:
        print("No valid resumes found in the ZIP file.")
        print()
    return results

In [16]:
# Path to the ZIP archive of resumes to screen. It is a relative path, so it is
# resolved against the current working directory of the notebook.
zip_path = 'Resumes.zip'

In [17]:
# Process resumes and output the results
# Screen every resume in the archive, then print one result dict per line.
results = process_resumes_from_zip(zip_path)
if not results:
    print("No results to display.")
else:
    print("Final Results:")
    for result in results:
        print(result)

Extracted files to: /content/resumes
Processed file: /content/resumes\Resumes\Resume1.docx

Processed file: /content/resumes\Resumes\Resume2.docx

Processed file: /content/resumes\Resumes\Resume3.docx

Final Results:
{'Name': 'Arjun Patel', 'Experience (years)': 6, 'Keywords Matched': 1, 'Score': 75, 'File': 'Resume1.docx'}
{'Name': 'Leena Rao', 'Experience (years)': 4, 'Keywords Matched': 1, 'Score': 55, 'File': 'Resume2.docx'}
{'Name': 'Nikhil Mehta', 'Experience (years)': 3, 'Keywords Matched': 0, 'Score': 50, 'File': 'Resume3.docx'}
