In [20]:
import fitz  # PyMuPDF
import re

In [21]:

# Function to remove phone numbers using regular expressions
def remove_phone_numbers(text):
    """Strip North-American-style phone numbers from ``text``.

    Recognised shapes are ``123-456-7890``, ``123.456.7890``, ``123 456 7890``
    and ``(123) 456-7890``; the separator before the last four digits is
    optional. Each match is replaced by the empty string, so whitespace around
    a removed number is left as it was.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` with every matched phone number removed.
    """
    # The leading \b is scoped to the bare-digit form only. In front of the
    # parenthesised form it would demand a word character directly before
    # "(", so "(123) 456-7890" following a space or starting the text would
    # never be matched.
    phone_pattern = r'(?:\b\d{3}[-.\s]|\(\d{3}\)\s*)\d{3}[-.\s]?\d{4}\b'
    return re.sub(phone_pattern, '', text)


In [4]:

# Function to extract dates using regular expressions
def extract_dates(text):
    """Collect every date-like substring found in ``text``.

    Each regular expression below is run over the whole text in turn and the
    hits are concatenated: all matches of the first pattern, then all matches
    of the second, and so on. The patterns are applied independently of one
    another, so one stretch of text can be reported more than once (for
    example "Jun 2021" also yields a separate "2021").

    Args:
        text: The text to scan.

    Returns:
        A list of the matched strings, grouped in pattern order.
    """
    patterns = (
        r'\b\d{4}-\d{2}-\d{2}\b',               # 2024-01-25
        r'\b\d{2}-\d{2}-\d{4}\b',               # 25-01-2024
        r'\b\d{2}/\d{2}/\d{4}\b',               # 01/25/2024
        r'\b(?:\w{3,9}) \d{4}\b',               # word + year, e.g. January 2024
        r'\b\d{1,2}/\d{4}\b',                   # 01/2024
        r'\b\d{4}\b',                           # any standalone 4-digit number, e.g. 2024
        r'\b(?:\w{3,9}) \d{1,2}, \d{4}\b',      # word + day + year, e.g. January 25, 2024
        r'\b\d{1,2}/\d{1,2}/\d{2}\b',           # 1/25/24
        r'\b\d{1,2}-\d{1,2}-\d{2}\b',           # 25-1-24
        r'\b\d{1,2}\.\d{1,2}\.\d{2}\b',         # 25.1.24
        r'\b(?:\w{3,9}), \d{4}\b',              # word + comma + year, e.g. January, 2024
        # Further formats can be appended here.
    )

    return [hit for regex in patterns for hit in re.findall(regex, text)]


In [5]:

# Read PDF and extract text
def read_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The document is opened with ``fitz.open`` inside a ``with`` block, pages
    are loaded by index in order, and their ``get_text()`` results are
    concatenated with no separator between pages.

    Args:
        pdf_path: Path handed to ``fitz.open``.

    Returns:
        The concatenated page text ('' when the document has no pages).
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [pdf.load_page(index).get_text() for index in range(len(pdf))]
    return ''.join(page_texts)


In [6]:

# Example usage
def main(pdf_path=r'D:\HR-Analytics-Final\src\uploads\Komal_Lamba_Resume.pdf'):
    """Print the date-like strings found in a resume PDF.

    The PDF is read with ``read_pdf``, phone numbers are stripped with
    ``remove_phone_numbers`` before the search, and the remaining text is
    scanned with ``extract_dates``.

    Args:
        pdf_path: PDF to analyse. Defaults to the sample resume this example
            was written against, so calling ``main()`` with no argument
            behaves as before; pass another path to analyse a different file.

    Returns:
        None. The result is written to standard output.
    """
    # Read PDF and extract text
    resume_text = read_pdf(pdf_path)

    # Preprocessing: remove phone numbers before the date patterns are applied
    cleaned_text = remove_phone_numbers(resume_text)

    # Extract dates
    dates = extract_dates(cleaned_text)
    print("Extracted Dates:", dates)


In [7]:

# Run the example only when this code is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Extracted Dates: ['Jun 2021', 'Jun 2021', 'Jun 2021', 'Jul 2023', 'Sep 2023', '2021', '2019', '2021', '2014', '2018', '2012', '2014', '2023', '2021', '2021', '2021', '2023', '2023']


In [22]:
import fitz  # PyMuPDF
import spacy
from dateutil.parser import parse
from datetime import datetime,  date


In [23]:
# Load spaCy's "en_core_web_sm" English pipeline once and keep it in the
# module-level ``nlp``; extract_date_entities() calls it to find DATE entities.
nlp = spacy.load("en_core_web_sm")

In [24]:
# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of ``pdf_file`` joined into one string.

    The document is opened with ``fitz.open`` inside a ``with`` block and
    iterated page by page; each page's ``get_text()`` result is appended in
    order with no separator.

    Args:
        pdf_file: Path handed to ``fitz.open``.

    Returns:
        The concatenated page text ("" when the document has no pages).
    """
    with fitz.open(pdf_file) as pdf_document:
        return "".join(page.get_text() for page in pdf_document)

In [25]:
# Function to extract date entities from text using spaCy NER
def extract_date_entities(text):
    """Return the multi-word DATE entities that the ``nlp`` pipeline finds in ``text``.

    The text is run through the module-level ``nlp`` callable. An entity is
    kept when its label is "DATE" and its text splits into more than one
    whitespace-separated token, so single-token mentions such as a bare
    "2021" are dropped.

    Args:
        text: The text to analyse.

    Returns:
        A list of entity strings in the order the pipeline reports them.
    """
    analysed = nlp(text)
    return [
        entity.text
        for entity in analysed.ents
        if entity.label_ == "DATE" and len(entity.text.split()) > 1
    ]

In [38]:
def parse_date(date_string):
    """Convert a date phrase into a ``datetime.date``.

    Phrases containing "current", "present", "now" or "ongoing" as a whole
    word (case-insensitive) resolve to today's date. Anything else is handed
    to ``dateutil.parser.parse``; fields missing from the phrase are taken
    from 1 January of the current year, so "Jun 2021" becomes 2021-06-01.

    Args:
        date_string: Text of a single date mention.

    Returns:
        A ``datetime.date``, or ``None`` when the phrase is empty or cannot be
        parsed as a date.
    """
    if not date_string or not date_string.strip():
        return None

    if re.search(r'\b(?:current|present|now|ongoing)\b', date_string, re.IGNORECASE):
        return date.today()

    # Imported here so this cell does not depend on the import cell having
    # been run first; it is the same parser the notebook imports at the top.
    from dateutil.parser import parse

    # A fixed default keeps results independent of the day the code runs:
    # without it, dateutil fills missing fields from the current date.
    fallback = datetime(date.today().year, 1, 1)
    try:
        parsed = parse(date_string, default=fallback)
    except (ValueError, OverflowError):
        # Not a recognisable date (e.g. "5 years"); callers skip None results.
        return None

    # Always hand back a plain date: mixing date and datetime objects makes
    # the sorting and subtraction done by callers raise TypeError.
    return parsed.date()

In [39]:
# Function to extract date ranges from date entities
def extract_date_ranges(date_entities):
    """Parse each date entity and return the usable results in ascending order.

    Every entity is passed through ``parse_date``. Falsy results (such as
    ``None``) are skipped. If handling an entity raises, the error is reported
    with ``print`` and processing continues with the next entity.

    Args:
        date_entities: Iterable of date phrases.

    Returns:
        A sorted list of the values ``parse_date`` produced.
    """
    parsed_dates = []
    for entity_text in date_entities:
        try:
            candidate = parse_date(entity_text)
            if not candidate:
                # Nothing usable came back for this entity.
                continue
            parsed_dates.append(candidate)
        except Exception as e:
            print(f"Error parsing date: {e}")

    # Earliest first
    parsed_dates.sort()
    return parsed_dates

In [40]:
# Location of the resume PDF analysed by the cells below.
# NOTE(review): this is an absolute, machine-specific Windows path - point it
# at your own file before running elsewhere.
pdf_file_path = r'D:\HR-Analytics-Final\src\uploads\Komal_Lamba_Resume.pdf'

In [41]:
# Step 1: pull the raw text out of the PDF at pdf_file_path.
resume_text = extract_text_from_pdf(pdf_file_path)

# Step 2: collect the multi-word DATE entities spaCy finds in that text.
date_entities = extract_date_entities(resume_text)

# Step 3: run each entity through parse_date and keep the results, sorted
# ascending; entities for which parse_date returns nothing are skipped.
date_ranges = extract_date_ranges(date_entities)

In [42]:
# Function to extract work experience date ranges
def extract_work_experience(date_ranges, min_days=365):
    """Pair neighbouring dates that are far enough apart to count as a job span.

    Each element is paired with the one that follows it, and a pair is kept
    when the gap between the two is at least ``min_days`` days. This is a
    heuristic: it treats consecutive dates as the start and end of one period.

    Args:
        date_ranges: List of dates, expected in ascending order. Subtracting
            two elements must give an object with a ``.days`` attribute (as
            ``datetime.date`` does).
        min_days: Minimum gap, in days, for a pair to be reported. Defaults to
            365 (roughly one year).

    Returns:
        A list of ``(start_date, end_date)`` tuples in input order; empty when
        fewer than two dates are supplied.
    """
    return [
        (start_date, end_date)
        for start_date, end_date in zip(date_ranges, date_ranges[1:])
        if (end_date - start_date).days >= min_days
    ]

In [43]:
# Pair up consecutive entries of date_ranges that are at least a year apart.
work_experience = extract_work_experience(date_ranges)

# Report one line per (start, end) pair.
for start_date, end_date in work_experience:
    print(f"Work Experience: {start_date} to {end_date}")