In [1]:
!pip install pymupdf



## Personal resume data

In [1]:
import os
import pandas as pd
import pymupdf  # This is the correct import for PyMuPDF (fitz is the old name)

# Define folder paths
RESUME_DIR = "data/resumes/"
JD_DIR = "data/job_descriptions/"
STATUS_DIR = "data/application_status/"

def extract_formatted_text(pdf_path):
    """Extract formatted text while preserving all font styles, sizes, and layout using PyMuPDF."""
    try:
        doc = pymupdf.open(pdf_path)
        formatted_text = ""
        
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]
            
            for block in blocks:
                if block["type"] == 0:  # Only process text blocks
                    for line in block["lines"]:
                        for span in line["spans"]:
                            # Extract the text and all related formatting
                            text = span["text"]
                            font_size = span["size"]
                            font_color = span["color"]
                            font_name = span["font"]
                            is_bold = "bold" in font_name.lower()
                            is_italic = "italic" in font_name.lower()

                            # Add formatted text with details
                            formatted_text += f"Text: {text}, Font: {font_name}, Size: {font_size}, Color: {font_color}, Bold: {is_bold}, Italic: {is_italic}\n"

        return formatted_text
    except Exception as e:
        print(f"Error extracting formatted text from {pdf_path}: {e}")
        return ""

def extract_plain_text(pdf_path):
    """Extract plain text (ATS-friendly) using PyMuPDF."""
    try:
        doc = pymupdf.open(pdf_path)
        plain_text = ""
        
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            plain_text += page.get_text("text")
        
        return plain_text
    except Exception as e:
        print(f"Error extracting plain text from {pdf_path}: {e}")
        return ""

def read_text_file(file_path):
    """Read text from JD or application status files."""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read().strip()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

def process_resumes():
    """Processes resumes, JDs, and application statuses into a Pandas DataFrame."""
    data = []

    # List all files in the directories
    resume_files = sorted([file for file in os.listdir(RESUME_DIR) if file.lower().endswith('.pdf')])
    jd_files = sorted([file for file in os.listdir(JD_DIR) if file.lower().endswith('.txt')])
    status_files = sorted([file for file in os.listdir(STATUS_DIR) if file.lower().endswith('.txt')])

    # Ensure that the lengths of resume_files, jd_files, and status_files are consistent
    num_files = min(len(resume_files), len(jd_files), len(status_files))

    for i in range(num_files):
        resume_file = resume_files[i]
        jd_file = jd_files[i]
        status_file = status_files[i]

        resume_path = os.path.join(RESUME_DIR, resume_file)
        jd_path = os.path.join(JD_DIR, jd_file)
        status_path = os.path.join(STATUS_DIR, status_file)

        formatted_text = extract_formatted_text(resume_path)
        plain_text = extract_plain_text(resume_path)
        job_description = read_text_file(jd_path)
        application_status = read_text_file(status_path)

        # Store everything in a structured list
        data.append({
            "Resume File": resume_file,
            "Plain Text Resume": plain_text,
            "Formatted Resume": formatted_text,
            "Job Description": job_description,
            "Application Status": application_status
        })

    # Convert to DataFrame
    df = pd.DataFrame(data)
    
    return df

# Run the script and get DataFrame
df = process_resumes()

# Display DataFrame
df

  from pandas.core import (


Unnamed: 0,Resume File,Plain Text Resume,Formatted Resume,Job Description,Application Status
0,Resume1.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",Understands and articulates the business case ...,Rejected
1,Resume10.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",About Us\nAlexandria Events by Invited is a pr...,Rejected
2,Resume2.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",Knowing the destination isn't enough: you need...,Rejected
3,Resume3.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",You’ll be joining an elite team of subject mat...,Rejected
4,Resume4.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",In this position you should have the following...,Rejected
5,Resume5.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",Copart is seeking a Data Science Analyst to he...,Rejected
6,Resume6.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...","What You'll Do:\nMinimum 1 year full-time, ana...",Rejected
7,Resume7.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",Dispatch simplifies last-mile deliveries for b...,Rejected
8,Resume8.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...","At Sun Life, we look for optimistic people who...",Accepted
9,Resume9.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",Federato is on a mission to defend the right t...,Rejected


In [2]:
!pip install pymongo
!pip install chromadb





## Storing data in MongoDB and Chroma DB

In [2]:
import pymongo
import chromadb
import pandas as pd
from pymongo import MongoClient

# MongoDB connection for storing formatted text
mongo_client = MongoClient("mongodb://localhost:27017/")  # Connect to MongoDB
db = mongo_client["resume_db"]
formatted_resume_collection = db["formatted_resumes"]

# ChromaDB setup for plain text
client = chromadb.PersistentClient(path="./chroma_db") 
collection = client.get_or_create_collection("plain_resumes")  # Use get_or_create_collection to avoid duplication

def store_to_chromadb_and_mongodb(df):
    """Store plain resume text in ChromaDB and formatted resume text in MongoDB, ensuring unique entries."""
    for index, row in df.iterrows():
        resume_file = row["Resume File"]
        plain_text = row["Plain Text Resume"]
        formatted_text = row["Formatted Resume"]
        job_description = row["Job Description"]
        application_status = row["Application Status"]

        # === Ensure Uniqueness in MongoDB ===
        existing_resume = formatted_resume_collection.find_one({"resume_file": resume_file})
        if existing_resume:
            # Update existing entry
            formatted_resume_collection.update_one(
                {"resume_file": resume_file},
                {"$set": {
                    "formatted_resume": formatted_text,
                    "job_description": job_description,
                    "application_status": application_status
                }}
            )
            print(f"Updated existing entry in MongoDB for: {resume_file}")
        else:
            # Insert new entry
            formatted_resume_collection.insert_one({
                "resume_file": resume_file,
                "formatted_resume": formatted_text,
                "job_description": job_description,
                "application_status": application_status
            })
            print(f"Inserted new entry in MongoDB for: {resume_file}")

        # === Ensure Uniqueness in ChromaDB ===
        existing_chroma_entry = collection.get(ids=[resume_file])  # Check if the ID already exists
        if existing_chroma_entry and existing_chroma_entry['documents']:
            # Update existing entry by first deleting and then re-adding (ChromaDB lacks direct update support)
            collection.delete(ids=[resume_file])
            print(f"Deleted existing entry in ChromaDB for: {resume_file}")

        # Add the new entry (whether new or after deletion)
        collection.add(
            documents=[plain_text],
            metadatas=[{
                "resume_file": resume_file,
                "job_description": job_description,
                "application_status": application_status
            }],
            ids=[resume_file]
        )
        print(f"Stored in ChromaDB: {resume_file}")

# Call the function to store the data in both databases
store_to_chromadb_and_mongodb(df)

# Print confirmation
print("Data has been successfully stored (ensuring uniqueness) in ChromaDB and MongoDB.")


Updated existing entry in MongoDB for: Resume1.pdf
Deleted existing entry in ChromaDB for: Resume1.pdf
Stored in ChromaDB: Resume1.pdf
Updated existing entry in MongoDB for: Resume10.pdf
Deleted existing entry in ChromaDB for: Resume10.pdf
Stored in ChromaDB: Resume10.pdf
Updated existing entry in MongoDB for: Resume2.pdf
Deleted existing entry in ChromaDB for: Resume2.pdf
Stored in ChromaDB: Resume2.pdf
Updated existing entry in MongoDB for: Resume3.pdf
Deleted existing entry in ChromaDB for: Resume3.pdf
Stored in ChromaDB: Resume3.pdf
Updated existing entry in MongoDB for: Resume4.pdf
Deleted existing entry in ChromaDB for: Resume4.pdf
Stored in ChromaDB: Resume4.pdf
Updated existing entry in MongoDB for: Resume5.pdf
Deleted existing entry in ChromaDB for: Resume5.pdf
Stored in ChromaDB: Resume5.pdf
Updated existing entry in MongoDB for: Resume6.pdf
Deleted existing entry in ChromaDB for: Resume6.pdf
Stored in ChromaDB: Resume6.pdf
Updated existing entry in MongoDB for: Resume7.pdf
D

### Retrieval of data

In [3]:
import pandas as pd
from pymongo import MongoClient

# MongoDB connection string (adjust it as needed)
MONGO_URI = "mongodb://localhost:27017"
DATABASE_NAME = "resume_db"
FORMATTED_COLLECTION = "formatted_resumes"

# Connect to MongoDB
client = MongoClient(MONGO_URI)
db = client[DATABASE_NAME]
formatted_resume_collection = db[FORMATTED_COLLECTION]

def retrieve_formatted_resumes_as_df():
    """Retrieve unique formatted resume data from MongoDB and return it as a DataFrame"""
    
    # Retrieve all documents from the collection
    formatted_resumes = formatted_resume_collection.find() 
    
    # Convert the results into a list of dictionaries while ensuring unique entries
    resume_list = []
    seen_files = set()  # To track unique resume files
    
    for resume in formatted_resumes:
        resume_file = resume['resume_file']
        
        # Only append if the resume_file hasn't been encountered before
        if resume_file not in seen_files:
            seen_files.add(resume_file)
            resume_list.append({
                "Resume File": resume['resume_file'],
                "Formatted Resume": resume['formatted_resume'],
                "Job Description": resume['job_description'],
                "Application Status": resume['application_status']
            })
    
    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(resume_list)
    return df

# Example usage: retrieve unique formatted resumes as a DataFrame
formatted_resume_df = retrieve_formatted_resumes_as_df()
formatted_resume_df


Unnamed: 0,Resume File,Formatted Resume,Job Description,Application Status
0,Resume1.pdf,"Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",Understands and articulates the business case ...,Rejected
1,Resume10.pdf,"Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",About Us\nAlexandria Events by Invited is a pr...,Rejected
2,Resume2.pdf,"Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",Knowing the destination isn't enough: you need...,Rejected
3,Resume3.pdf,"Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",You’ll be joining an elite team of subject mat...,Rejected
4,Resume4.pdf,"Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",In this position you should have the following...,Rejected
5,Resume5.pdf,"Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",Copart is seeking a Data Science Analyst to he...,Rejected
6,Resume6.pdf,"Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...","What You'll Do:\nMinimum 1 year full-time, ana...",Rejected
7,Resume7.pdf,"Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",Dispatch simplifies last-mile deliveries for b...,Rejected
8,Resume8.pdf,"Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...","At Sun Life, we look for optimistic people who...",Accepted
9,Resume9.pdf,"Text: Sparsh Marwah , Font: TimesNewRomanPS-Bo...",Federato is on a mission to defend the right t...,Rejected


In [7]:
import chromadb
import pandas as pd

# Initialize ChromaDB client and collection
client = chromadb.PersistentClient(path="./chroma_db")  # Use the correct path if needed
collection = client.get_or_create_collection(name="plain_resumes")

def retrieve_all_resumes():
    """Retrieve all resumes stored in ChromaDB and return as a DataFrame"""
    results = collection.get()  # Retrieves all stored documents
    
    # Convert results to a structured DataFrame
    resume_list = []
    for i in range(len(results['documents'])):
        resume_list.append({
            "Resume File": results['ids'][i],
            "Plain Text Resume": results['documents'][i],
            "Metadata": results['metadatas'][i]  # Contains metadata like resume_file, job_description, etc.
        })
    
    df = pd.DataFrame(resume_list)
    return df

# Retrieve all resumes
plain_resumes_df = retrieve_all_resumes()
plain_resumes_df

Unnamed: 0,Resume File,Plain Text Resume,Metadata
0,Resume1.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","{'application_status': 'Rejected', 'job_descri..."
1,Resume10.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","{'application_status': 'Rejected', 'job_descri..."
2,Resume2.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","{'application_status': 'Rejected', 'job_descri..."
3,Resume3.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","{'application_status': 'Rejected', 'job_descri..."
4,Resume4.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","{'application_status': 'Rejected', 'job_descri..."
5,Resume5.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","{'application_status': 'Rejected', 'job_descri..."
6,Resume6.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","{'application_status': 'Rejected', 'job_descri..."
7,Resume7.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","{'application_status': 'Rejected', 'job_descri..."
8,Resume8.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","{'application_status': 'Accepted', 'job_descri..."
9,Resume9.pdf,"Sparsh Marwah \nBoston, MA 02130 | marwah.sp@n...","{'application_status': 'Rejected', 'job_descri..."
