# Imports and Setup

In [28]:
import os
import json
import re
from pathlib import Path
from datetime import datetime
from typing import Dict, List
from collections import defaultdict

import fitz  # PyMuPDF
from docx import Document
from gliner import GLiNER
from openai import OpenAI





# Configuration - All parameters in one place
class Config:
    """Single home for every tunable of the resume-parsing benchmark."""

    # Checkpoint identifiers handed to GLiNER.from_pretrained
    MODEL_GLINER = "urchade/gliner_large-v2.1"
    MODEL_NUNER = "numind/NuNerZero"

    # Labels requested from the models, grouped by purpose
    ENTITY_LABELS = dict(
        basic=["person", "email"],
        skills=["skill", "technology", "tool", "programming language"],
    )

    # Minimum confidence per entity type
    THRESHOLDS = dict(person=0.4, email=0.3, skill=0.1)

    # Input / output locations
    DATA_DIR = "./data"
    OUTPUT_DIR = "./output"

    # OpenAI API key; leave empty to rely on the OPENAI_API_KEY environment variable
    API_KEY = ""

    # JSON schema the LLM judge must answer with
    EVALUATION_SCHEMA = dict(
        name="resume_evaluation",
        strict=True,
        schema=dict(
            type="object",
            properties=dict(
                name_status=dict(
                    type="string",
                    enum=["correct", "incorrect", "partial"],
                ),
                email_status=dict(
                    type="string",
                    enum=["correct", "incorrect", "partial"],
                ),
                skills_status=dict(
                    type="string",
                    enum=["correct", "incorrect", "partial"],
                ),
                extraction_quality=dict(
                    type="string",
                    enum=["excellent", "good", "fair", "poor"],
                    description="Overall quality assessment",
                ),
                overall_reasoning=dict(type="string"),
            ),
            required=[
                "name_status",
                "email_status",
                "skills_status",
                "extraction_quality",
                "overall_reasoning",
            ],
            additionalProperties=False,
        ),
    )

print("Configuration loaded")

# A key written into Config wins; otherwise fall back to the environment.
Config.API_KEY = Config.API_KEY or os.getenv("OPENAI_API_KEY", "")

print("Imports completed")
key_state = "Yes" if Config.API_KEY else "No (evaluation will be skipped)"
print(f"API Key available: {key_state}")



Configuration loaded
Imports completed
API Key available: Yes


# ResumeParser Class

In [29]:
class ResumeParser:
    """Resume parser using GLiNER and NuNER Zero models.

    Both checkpoints are loaded through ``GLiNER.from_pretrained``. Their
    predictions are pooled, mapped onto the keys ``name`` / ``email`` /
    ``skills`` and post-processed by :meth:`parse_resume`.
    """

    # Fallback e-mail pattern. The TLD class is ``[A-Za-z]``: a ``|`` inside a
    # character class is a literal pipe, not an alternation.
    EMAIL_REGEX = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Bounds applied to cleaned skill strings and to the returned skill list
    MIN_SKILL_LEN = 2
    MAX_SKILL_LEN = 25
    MAX_SKILLS = 15

    # Model label -> key used in the parsed result
    LABEL_MAP = {
        "person": "name", "email": "email", "skill": "skills",
        "technology": "skills", "tool": "skills", "programming language": "skills"
    }

    def __init__(self):
        print("Loading models...")
        self.gliner = GLiNER.from_pretrained(Config.MODEL_GLINER)
        self.nuner = GLiNER.from_pretrained(Config.MODEL_NUNER)
        # Kept as an instance attribute for callers that read it directly
        self.email_regex = self.EMAIL_REGEX
        print("Models loaded successfully")

    def extract_text(self, file_path: Path) -> str:
        """Extract text from PDF or DOCX files.

        Any other suffix is read as UTF-8 plain text. Pages (PDF) and
        paragraphs (DOCX) are joined with a single space.
        """
        suffix = file_path.suffix.lower()
        if suffix == '.pdf':
            # Context manager closes the document even if a page fails to render
            with fitz.open(file_path) as doc:
                return " ".join(page.get_text() for page in doc)
        if suffix == '.docx':
            doc = Document(file_path)
            return " ".join(para.text for para in doc.paragraphs)
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    def merge_adjacent_entities(self, entities: List[Dict], text: str) -> List[Dict]:
        """Merge adjacent entities for NuNER Zero (handles multi-word entities).

        Entities with the same label that touch, overlap, or are separated by
        at most one character are fused into one span whose text is re-sliced
        from ``text``. The input dicts are not modified.
        """
        if not entities:
            return []

        ordered = sorted(entities, key=lambda e: e.get('start', 0))
        merged = []
        current = ordered[0].copy()

        for candidate in ordered[1:]:
            same_label = candidate['label'] == current['label']
            if same_label and candidate['start'] <= current['end'] + 1:
                # max() keeps the span from shrinking when the candidate is
                # nested inside the span collected so far
                current['end'] = max(current['end'], candidate['end'])
                current['text'] = text[current['start']:current['end']].strip()
            else:
                merged.append(current)
                current = candidate.copy()
        merged.append(current)
        return merged

    def _predict(self, model, text: str, labels: List[str], fallback_key: str) -> List[Dict]:
        """Run one model and keep entities that clear their own label's threshold.

        The model is queried at the lowest threshold among ``labels`` and the
        result is filtered per label, so ``Config.THRESHOLDS["email"]`` is
        honoured instead of e-mails being cut at the person threshold.
        Labels without their own entry use ``Config.THRESHOLDS[fallback_key]``.
        """
        fallback = Config.THRESHOLDS[fallback_key]
        cutoffs = {label: Config.THRESHOLDS.get(label, fallback) for label in labels}
        found = model.predict_entities(text, labels, threshold=min(cutoffs.values()))
        # Entities without a score were already accepted by the model; keep them
        return [
            entity for entity in found
            if entity.get('score', 1.0) >= cutoffs.get(entity['label'], fallback)
        ]

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Extract entities using both models with different thresholds.

        Returns a dict mapping ``name`` / ``email`` / ``skills`` (or the
        lower-cased raw label when unmapped) to the stripped entity texts,
        GLiNER results first, then NuNER Zero results.
        """
        basic_labels = Config.ENTITY_LABELS["basic"]
        skill_labels = Config.ENTITY_LABELS["skills"]
        all_entities = []

        # GLiNER extraction
        all_entities.extend(self._predict(self.gliner, text, basic_labels, "person"))
        all_entities.extend(self._predict(self.gliner, text, skill_labels, "skill"))

        # NuNER Zero extraction (lowercase labels required)
        nuner_basic = self._predict(
            self.nuner, text, [label.lower() for label in basic_labels], "person"
        )
        nuner_skills = self._predict(
            self.nuner, text, [label.lower() for label in skill_labels], "skill"
        )

        # Merge adjacent entities for NuNER
        all_entities.extend(self.merge_adjacent_entities(nuner_basic, text))
        all_entities.extend(self.merge_adjacent_entities(nuner_skills, text))

        # Normalize entity labels
        normalized = defaultdict(list)
        for entity in all_entities:
            raw_label = entity['label'].lower()
            normalized[self.LABEL_MAP.get(raw_label, raw_label)].append(entity['text'].strip())

        return dict(normalized)

    @classmethod
    def _pick_email(cls, candidates: List[str]) -> str:
        """Return the first address found in ``candidates``, lower-cased, or ''.

        Each candidate is searched with ``EMAIL_REGEX`` so that text a model
        attached around the address (e.g. a leading "Email:") is dropped.
        """
        for candidate in candidates:
            match = cls.EMAIL_REGEX.search(candidate)
            if match:
                return match.group(0).lower()
        return ""

    @classmethod
    def _clean_skills(cls, raw_skills: List[str]) -> List[str]:
        """Sanitise skill strings and drop duplicates, preserving first-seen order.

        Characters other than word characters, whitespace and ``+ - . #`` are
        removed, whitespace runs are collapsed, and duplicates are detected
        case-insensitively because both models scan the same text.
        """
        seen = set()
        skills = []
        for skill in raw_skills:
            cleaned = " ".join(re.sub(r'[^\w\s\+\-\.\#]', '', skill).split())
            if not cls.MIN_SKILL_LEN <= len(cleaned) <= cls.MAX_SKILL_LEN:
                continue
            key = cleaned.casefold()
            if key not in seen:
                seen.add(key)
                skills.append(cleaned)
        return skills[:cls.MAX_SKILLS]

    def parse_resume(self, file_path: Path) -> Dict:
        """Parse resume and extract name, email, skills.

        Returns a dict with ``name`` (str, '' if none), ``email`` (str,
        lower-cased, '' if none) and ``skills`` (list of at most
        ``MAX_SKILLS`` strings).
        """
        text = self.extract_text(file_path)
        entities = self.extract_entities(text)

        # Extract name (first non-empty name)
        name = next((n for n in entities.get("name", []) if n), "")

        # Extract email: model candidates first, regex hits on the raw text as fallback
        email = self._pick_email(entities.get("email", []) + self.email_regex.findall(text))

        # Extract and clean skills
        skills = self._clean_skills(entities.get("skills", []))

        return {
            "name": name,
            "email": email,
            "skills": skills
        }

print("ResumeParser class defined")


ResumeParser class defined


# ResumeEvaluator Class

In [30]:
class ResumeEvaluator:
    """LLM-based evaluator for resume parsing quality.

    Sends the resume text together with the parser output to an OpenAI chat
    model and requests an answer shaped by ``Config.EVALUATION_SCHEMA``.
    """

    # How much of the resume is embedded in the prompt. The prompt header is
    # generated from this value so the two can no longer disagree.
    MAX_RESUME_CHARS = 5000

    def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
        """Create the OpenAI client.

        Args:
            api_key: OpenAI API key; must be non-empty.
            model: Chat model used as judge.

        Raises:
            ValueError: If ``api_key`` is empty.
        """
        if not api_key:
            raise ValueError("API key required for evaluation")
        self.client = OpenAI(api_key=api_key)
        self.model = model

    @classmethod
    def _build_prompt(cls, original_text: str, extracted_data: Dict) -> str:
        """Render the judge prompt from the resume text and the parsed fields."""
        limit = cls.MAX_RESUME_CHARS
        return f"""Evaluate this resume parsing extraction:

RESUME TEXT (first {limit} chars):
{original_text[:limit]}

EXTRACTED DATA:
{json.dumps(extracted_data, indent=2)}

For each field, determine if extraction is:
- "correct": Perfectly extracted and accurate
- "incorrect": Missing, wrong, or completely inaccurate  
- "partial": Extracted but incomplete or minor issues

Also provide overall extraction quality (excellent/good/fair/poor) and brief reasoning."""

    def evaluate(self, original_text: str, extracted_data: Dict) -> Dict:
        """Evaluate extraction quality using LLM judge.

        Returns the decoded JSON verdict, or ``{"error": "..."}`` when the
        request or the decoding fails. Failures are reported this way rather
        than raised so that one bad file does not abort a benchmark run.
        """
        prompt = self._build_prompt(original_text, extracted_data)

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are an expert resume parsing evaluator."},
                    {"role": "user", "content": prompt}
                ],
                response_format={
                    "type": "json_schema",
                    "json_schema": Config.EVALUATION_SCHEMA
                },
                temperature=0.1,
                max_tokens=1000
            )

            return json.loads(response.choices[0].message.content)
        except Exception as e:
            return {"error": f"Evaluation failed: {e}"}

print("ResumeEvaluator class defined")


ResumeEvaluator class defined


# Helper Functions

In [31]:

def calculate_metrics(results: List[Dict]) -> Dict:
    """Calculate simple performance metrics.

    Args:
        results: One dict per processed file. The keys read here are
            ``success``, ``parsed_data["skills"]`` and ``evaluation``.

    Returns:
        ``{}`` for empty input. Otherwise a dict with ``total_files``,
        ``successful_parses``, ``success_rate`` and ``avg_skills_per_resume``.
        When at least one evaluation without an ``error`` key exists, it also
        holds ``evaluation_count``, the three ``*_accuracy`` percentages and
        ``overall_quality_distribution``.
    """
    if not results:
        return {}

    successful = [r for r in results if r.get("success")]
    total = len(results)

    # Basic metrics (total is non-zero here because of the guard above)
    skill_total = sum(len(r.get("parsed_data", {}).get("skills", [])) for r in successful)
    metrics = {
        "total_files": total,
        "successful_parses": len(successful),
        "success_rate": len(successful) / total * 100,
        "avg_skills_per_resume": skill_total / len(successful) if successful else 0
    }

    # Evaluation metrics: only evaluations that did not fail
    evaluations = [
        r.get("evaluation") for r in results
        if r.get("evaluation") and not r.get("evaluation", {}).get("error")
    ]

    if evaluations:
        name_correct = sum(1 for e in evaluations if e.get("name_status") == "correct")
        email_correct = sum(1 for e in evaluations if e.get("email_status") == "correct")
        # Skills are scored leniently: "partial" counts as a hit
        skills_correct = sum(1 for e in evaluations if e.get("skills_status") in ("correct", "partial"))

        # Calculate overall quality distribution
        quality_distribution = {"excellent": 0, "good": 0, "fair": 0, "poor": 0}
        for evaluation in evaluations:
            raw_quality = evaluation.get("extraction_quality")
            quality = raw_quality.strip().lower() if isinstance(raw_quality, str) else "poor"
            # A missing or unrecognised label is counted as "poor" instead of
            # raising KeyError and losing every other metric
            if quality not in quality_distribution:
                quality = "poor"
            quality_distribution[quality] += 1

        metrics.update({
            "evaluation_count": len(evaluations),
            "name_accuracy": name_correct / len(evaluations) * 100,
            "email_accuracy": email_correct / len(evaluations) * 100,
            "skills_accuracy": skills_correct / len(evaluations) * 100,
            "overall_quality_distribution": quality_distribution
        })

    return metrics

def print_summary(metrics: Dict):
    """Write the benchmark summary to stdout.

    The three headline figures are always printed. The evaluation section
    and the quality breakdown follow only when ``evaluation_count`` is
    present and non-zero.
    """
    def one_decimal(key):
        # Missing metrics are reported as 0
        return f"{metrics.get(key, 0):.1f}"

    print("\nBENCHMARK SUMMARY")
    print("=" * 40)
    print(f"Files processed: {metrics.get('total_files', 0)}")
    print(f"Success rate: {one_decimal('success_rate')}%")
    print(f"Avg skills per resume: {one_decimal('avg_skills_per_resume')}")

    if not metrics.get('evaluation_count'):
        return

    print(f"\nEVALUATION RESULTS ({metrics['evaluation_count']} files):")
    for title, key in (
        ("Name", 'name_accuracy'),
        ("Email", 'email_accuracy'),
        ("Skills", 'skills_accuracy'),
    ):
        print(f"{title} accuracy: {one_decimal(key)}%")

    # Breakdown by overall quality, in fixed best-to-worst order
    distribution = metrics.get('overall_quality_distribution')
    if not distribution:
        return

    print("\nOVERALL EXTRACTION QUALITY:")
    for level in ('excellent', 'good', 'fair', 'poor'):
        hits = distribution.get(level, 0)
        evaluated = metrics['evaluation_count']
        share = hits / evaluated * 100 if evaluated > 0 else 0
        print(f"  {level.capitalize()}: {hits} ({share:.1f}%)")

print("Helper functions defined")



Helper functions defined


# Main analysis

In [32]:
# Initialize components
parser = ResumeParser()
# Evaluation is optional: without an API key only parsing metrics are produced
evaluator = ResumeEvaluator(Config.API_KEY) if Config.API_KEY else None

# Setup directories
data_dir = Path(Config.DATA_DIR)
output_dir = Path(Config.OUTPUT_DIR)
# parents=True so a nested OUTPUT_DIR does not fail on a fresh checkout
output_dir.mkdir(parents=True, exist_ok=True)

# is_dir() rather than exists(): a plain file at DATA_DIR would make iterdir() raise
if not data_dir.is_dir():
    print(f"Error: Data directory '{data_dir}' not found")
else:
    # Find resume files; skip directories that merely carry a matching suffix
    # and sort by name so runs are reproducible across filesystems
    resume_files = sorted(
        (f for f in data_dir.iterdir()
         if f.is_file() and f.suffix.lower() in ('.pdf', '.docx', '.txt')),
        key=lambda f: f.name.casefold(),
    )

    print(f"Found {len(resume_files)} resume files")

    # Process files
    results = []

    for file_path in resume_files:
        try:
            # Parse resume
            parsed_data = parser.parse_resume(file_path)

            result = {
                "file_name": file_path.name,
                "parsed_data": parsed_data,
                # A parse counts as successful when at least one field was found
                "success": bool(parsed_data["name"] or parsed_data["email"] or parsed_data["skills"])
            }

            # Evaluate if API key available
            if evaluator:
                # parse_resume() does not expose the text it read, so it is
                # extracted a second time for the judge
                original_text = parser.extract_text(file_path)
                evaluation = evaluator.evaluate(original_text, parsed_data)
                result["evaluation"] = evaluation

            results.append(result)

            print(f"Processed {file_path.name}: Name={bool(parsed_data['name'])}, Email={bool(parsed_data['email'])}, Skills={len(parsed_data['skills'])}")

        except Exception as e:
            # Keep the batch going; the failure is recorded in the results
            print(f"Error processing {file_path.name}: {e}")
            results.append({"file_name": file_path.name, "error": str(e), "success": False})

    # Calculate and display metrics
    metrics = calculate_metrics(results)
    print_summary(metrics)

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = output_dir / f"resume_parsing_results_{timestamp}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({"results": results, "metrics": metrics}, f, indent=2, ensure_ascii=False)

    print(f"\nResults saved to: {output_file}")


Loading models...


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

Models loaded successfully


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Found 10 resume files


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processed ATS basic HR resume.docx: Name=True, Email=True, Skills=1
Processed ATS Bold accounting resume.pdf: Name=True, Email=True, Skills=6
Processed ATS bold HR resume.docx: Name=True, Email=True, Skills=1
Processed ATS classic HR resume.docx: Name=True, Email=True, Skills=15
Processed ATS healthcare resume.pdf: Name=True, Email=True, Skills=5
Processed ATS office manager resume.pdf: Name=True, Email=True, Skills=7
Processed ATS simple classic resume.pdf: Name=True, Email=True, Skills=6
Processed ATS stylish accounting resume.docx: Name=True, Email=True, Skills=6
Processed Extended ATS healthcare resume.docx: Name=True, Email=False, Skills=1
Processed Simple ATS healthcare resume.pdf: Name=True, Email=True, Skills=12

BENCHMARK SUMMARY
Files processed: 10
Success rate: 100.0%
Avg skills per resume: 6.0

EVALUATION RESULTS (10 files):
Name accuracy: 100.0%
Email accuracy: 90.0%
Skills accuracy: 100.0%

OVERALL EXTRACTION QUALITY:
  Excellent: 1 (10.0%)
  Good: 6 (60.0%)
  Fair: 3 (30

# Detailed results

In [33]:
# Detailed Quality Analysis (Optional)
def display_detailed_quality_analysis(results: List[Dict], max_reasoning_chars: int = 2500):
    """Display detailed analysis of extraction quality by file.

    Args:
        results: Per-file result dicts. Only entries with an ``evaluation``
            that has no ``error`` key are shown.
        max_reasoning_chars: Longest reasoning text printed per file. An
            ellipsis is appended only when the text was actually cut.
    """
    evaluated = [
        r for r in results
        if r.get("evaluation") and not r.get("evaluation", {}).get("error")
    ]

    if not evaluated:
        print("No evaluation data available for detailed analysis")
        return

    print("\nDETAILED QUALITY ANALYSIS")
    print("=" * 50)

    # Group by quality level
    quality_groups = {"excellent": [], "good": [], "fair": [], "poor": []}

    for result in evaluated:
        evaluation = result["evaluation"]
        raw_quality = evaluation.get("extraction_quality")
        quality = raw_quality.strip().lower() if isinstance(raw_quality, str) else "poor"
        # Missing or unrecognised labels are listed under "poor" rather than
        # aborting the report with a KeyError
        if quality not in quality_groups:
            quality = "poor"
        quality_groups[quality].append({
            "file": result["file_name"],
            "name_status": evaluation.get("name_status", "unknown"),
            "email_status": evaluation.get("email_status", "unknown"),
            "skills_status": evaluation.get("skills_status", "unknown"),
            "reasoning": evaluation.get("overall_reasoning", "No reasoning provided")
        })

    # Display each quality group
    for quality in ['excellent', 'good', 'fair', 'poor']:
        files = quality_groups[quality]
        if files:
            print(f"\n{quality.upper()} QUALITY ({len(files)} files):")
            for file_data in files:
                reasoning = file_data['reasoning']
                # The same limit drives the cut and the ellipsis
                suffix = '...' if len(reasoning) > max_reasoning_chars else ''
                print(f"  File: {file_data['file']}")
                print(f"    Name: {file_data['name_status']}, Email: {file_data['email_status']}, Skills: {file_data['skills_status']}")
                print(f"    Reasoning: {reasoning[:max_reasoning_chars]}{suffix}")
                print()

# Run detailed analysis (optional): only when the main analysis cell produced results
if 'results' in locals():
    display_detailed_quality_analysis(results)



DETAILED QUALITY ANALYSIS

EXCELLENT QUALITY (1 files):
  File: ATS simple classic resume.pdf
    Name: correct, Email: correct, Skills: correct
    Reasoning: All fields were accurately extracted with no missing or incorrect information. The name, email, and skills are all correctly identified and formatted....


GOOD QUALITY (6 files):
  File: ATS Bold accounting resume.pdf
    Name: correct, Email: correct, Skills: partial
    Reasoning: The name and email were extracted accurately. The skills list is mostly correct but includes a minor formatting issue with 'State & federal tax codes' which should be 'State & federal tax codes' without the extra space. Additionally, 'GAAP' was mentioned in the resume but not included in the extracted skills list, which is a significant omission. Overall, the extraction is good but could be improved with complete skills representation....

  File: ATS classic HR resume.docx
    Name: correct, Email: correct, Skills: partial
    Reasoning: The name 