# 📘 Enhanced CV Processing System (Multi-Industry Support)

# 🔧 Improved Setup

In [2]:

import datetime
import re
import os
import json
from pathlib import Path
from typing import List, Dict, Optional
from langchain_community.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain_core.output_parsers import JsonOutputParser
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
import getpass

# Enhanced configuration
# Root folder that holds one sub-folder per user (uploads + parsed profile).
STORAGE_DIR = "cv_storage"

# Anchored pattern: local part, "@", domain, then a TLD of two or more letters.
EMAIL_REGEX = r"^[\w\.-]+@[a-zA-Z\d\.-]+\.[a-zA-Z]{2,}$"

# Create the storage root at import time; a no-op when it already exists.
Path(STORAGE_DIR).mkdir(parents=True, exist_ok=True)




# ================== Core Functions ==================

In [3]:




def get_user_email() -> tuple[str, str]:
    """Prompt for an email until it is well-formed, then resolve its storage folder.

    The entered value is stripped and lower-cased before being checked against
    ``EMAIL_REGEX``. The folder name is the email with every character outside
    ``[\\w.-]`` replaced by ``_`` so it is safe to use on the filesystem.

    Returns:
        ``(email, user_dir)`` where ``user_dir`` lives under ``STORAGE_DIR``.
        The folder is created when it does not exist yet.
    """
    email = input("📧 Enter your email address: ").strip().lower()
    while not re.match(EMAIL_REGEX, email):
        print("❌ Invalid email format. Use name@domain.com")
        email = input("📧 Enter your email address: ").strip().lower()

    # Replace anything that is not a word character, dot or dash (e.g. "@").
    folder_name = re.sub(r"[^\w\.-]", "_", email)
    user_dir = os.path.join(STORAGE_DIR, folder_name)

    if os.path.exists(user_dir):
        print("✅ Existing profile found")
    else:
        os.makedirs(user_dir, exist_ok=True)

    return email, user_dir

def upload_cv(user_dir: str) -> str:
    """Ask for a CV file, extract its text, and keep a copy in ``user_dir``.

    The prompt repeats until a file is processed successfully: a missing path,
    an unsupported extension, or any error raised while loading/copying the
    file prints a message and asks again.

    Args:
        user_dir: Folder that receives a copy of the uploaded file. The stored
            file name is the original base name with characters outside
            ``[\\w.-]`` replaced by ``_``.

    Returns:
        The page contents of the loaded document joined with newlines.
    """
    while True:
        file_path = input("📄 Enter CV path (.pdf/.docx): ").strip()

        if not os.path.isfile(file_path):
            print("❌ File not found")
            continue

        try:
            # Pick the loader class from the (case-insensitive) file suffix.
            lowered_path = file_path.lower()
            loaders_by_suffix = (
                (".pdf", PyPDFLoader),
                (".docx", UnstructuredWordDocumentLoader),
            )
            loader_cls = next(
                (cls for suffix, cls in loaders_by_suffix if lowered_path.endswith(suffix)),
                None,
            )
            if loader_cls is None:
                print("❌ Unsupported format. Use PDF/DOCX")
                continue

            pages = loader_cls(file_path).load()
            text = "\n".join(page.page_content for page in pages)

            # Keep a byte-for-byte copy of the upload next to the parsed data.
            stored_name = re.sub(r"[^\w\.-]", "_", os.path.basename(file_path))
            save_path = os.path.join(user_dir, stored_name)
            original_bytes = Path(file_path).read_bytes()
            Path(save_path).write_bytes(original_bytes)

            print(f"✅ CV saved: {save_path}")
            return text

        except Exception as e:
            print(f"🚨 Error processing file: {str(e)}")

# ================== AI Processing Setup ==================

In [4]:

class ProfessionalProfile(BaseModel):
    """Universal professional profile model"""
    # NOTE: the class docstring is kept to one short line on purpose: pydantic
    # places it in the model's JSON schema as the description, and that schema
    # is what the output parser turns into the LLM's format instructions.
    #
    # Every field except full_name/contact_email has a default, so a CV that
    # lacks a section still fits the schema. linkedin, github, soft_skills and
    # languages are declared here because the extraction and merge prompts in
    # this file explicitly ask for them; without them the schema sent to the
    # model contradicted the requested field list.

    # --- Identity / contact -------------------------------------------------
    full_name: str = Field(..., description="Full legal name")
    contact_email: str = Field(..., description="Primary contact email")
    phone: Optional[str] = Field(None, description="Contact phone number")
    summary: Optional[str] = Field(None, description="Professional summary")
    linkedin: Optional[str] = Field(None, description="LinkedIn profile link, if provided")
    github: Optional[str] = Field(None, description="GitHub profile link, if provided")

    # --- History ------------------------------------------------------------
    education: List[Dict] = Field(
        default_factory=list,
        description="List of educational achievements with degrees, institutions, and dates"
    )

    experience: List[Dict] = Field(
        default_factory=list,
        description="Work history with job titles, companies, dates, and key achievements"
    )

    # --- Skills and credentials ---------------------------------------------
    technical_skills: List[str] = Field(
        default_factory=list,
        description="Technical skills relevant to the industry"
    )

    soft_skills: List[str] = Field(
        default_factory=list,
        description="Soft skills such as communication, leadership, or teamwork"
    )

    certifications: List[str] = Field(
        default_factory=list,
        description="Professional certifications and licenses"
    )

    projects: List[Dict] = Field(
        default_factory=list,
        description="Notable projects with descriptions and outcomes"
    )

    languages: List[str] = Field(
        default_factory=list,
        description="Spoken languages, with proficiency if listed"
    )

    industry_preferences: List[str] = Field(
        default_factory=list,
        description="Preferred industries or sectors"
    )


# Initialize AI components

In [26]:

# Parser that derives its format instructions from the profile model.
parser = JsonOutputParser(pydantic_object=ProfessionalProfile)

# Extraction prompt text. Placeholders: {format_instructions} and {today} are
# pre-filled below; {text} is supplied per call with the raw CV text.
_PROFILE_PROMPT_TEMPLATE = """**Professional Profile Analysis Task**
Act as an expert career analyst with deep knowledge across industries (tech, healthcare, finance, engineering). 
Extract structured information while identifying transferable skills and cross-domain competencies.
"Please extract the following fields from the CV and return them in JSON format without any preamble: ..."

**Fields to Extract:**
- full_name
- contact_email
- phone
- summary
- linkedin
- github
- education (list)
- experience (list)
- technical_skills (list)
- soft_skills (list, optional)
- certifications (list)
- projects (list)
- languages (optional)
YOU CAN ADD AND SUBTRACT FIELDS ACCORDING TO PROVIDED CV AND INDUSTRY

**Analysis Guidelines:**
1. Core Identification:
- Extract full legal name from header/contact section
- Verify email format (name@domain.tld)
- Identify phone numbers in international format (+XXX...)
- Extract summary
- linked in link(if provided)
- github link(if provided)

2. Education Analysis:
- Parse degrees with majors/specializations
- Flag accreditation status for institutions
- Convert dates to MM/YYYY format
- Highlight research projects/theses


3. Experience Processing:
- Separate employment history from internships
- Identify technical/soft skill development
- Quantify achievements ("Increased X by Y%")
- Map technologies to industry standards

4. Skill Extraction:
- Categorize skills:
  • Technical (tools/platforms)
  • Methodologies (Agile, Six Sigma)
  • Domain Knowledge (HIPAA, GAAP)
- Identify skill maturity levels:
  (Beginner < 1yr, Intermediate 1-3yr, Expert 3+yr)

5. Cross-Industry Transfer Analysis:
- Identify portable competencies between industries
- Highlight leadership/management patterns
- Extract crisis management evidence
- Flag multilingual capabilities

**Structured Output Requirements:**
{format_instructions}

**Content Processing Rules:**
- Preserve original wording unless ambiguous
- Convert relative dates ("current" → {today})
- Expand acronyms first occurrence (WHO → World Health Organization)
- Handle conflicting info (prioritize most recent)
- Omit sections not explicitly mentioned

**Input Profile:**
{text}"""

# Values fixed once, when this cell runs: the parser's schema instructions and
# the current month/year used to resolve relative dates such as "current".
_prompt_partials = {
    "format_instructions": parser.get_format_instructions(),
    "today": datetime.date.today().strftime("%m/%Y"),
}

prompt = PromptTemplate(
    template=_PROFILE_PROMPT_TEMPLATE,
    input_variables=["text"],
    partial_variables=_prompt_partials,
)



# Configure Groq LLM


In [9]:

# Prompt for the API key (without echoing it) only when the environment does
# not already provide one.
if os.environ.get("GROQ_API_KEY") is None:
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

# Settings for the shared chat model; temperature 0 requests the least random
# sampling the model offers.
_GROQ_SETTINGS = {
    "model_name": "llama3-8b-8192",
    "temperature": 0,
    "max_tokens": 8192,
}

llm = ChatGroq(**_GROQ_SETTINGS)


# ================== Processing Functions =================

In [13]:
def parse_cv(text: str) -> dict:
    """Run CV text through the prompt -> LLM -> JSON parser chain.

    Args:
        text: Raw text extracted from the CV.

    Returns:
        The parsed profile as a plain ``dict``. Any failure while building or
        running the chain is reported on stdout and yields an empty dict.
    """
    try:
        pipeline = prompt | llm | parser
        return dict(pipeline.invoke({"text": text}))
    except Exception as e:
        print(f"⚠️ Error parsing CV: {str(e)}")
    return {}

def save_parsed_data(data: dict, user_dir: str) -> None:
    """Write the profile dict to ``profile_data.json`` inside ``user_dir``.

    Args:
        data: JSON-serialisable profile data.
        user_dir: Existing folder that receives the file; an earlier
            ``profile_data.json`` is overwritten.
    """
    target = Path(user_dir, "profile_data.json")
    with target.open("w") as handle:
        json.dump(data, handle, indent=2)
    print(f"📄 Profile data saved to {target}")

def display_profile_summary(user_dir: str) -> None:
    """Print a short summary of the stored profile for ``user_dir``.

    Reads ``profile_data.json`` from ``user_dir`` and prints the name, contact
    details and the number of education, experience and technical-skill
    entries. Prints a notice and returns when the file does not exist.

    Missing or null fields are tolerated: text fields fall back to ``N/A`` and
    list sections count as zero entries, so a profile in which the parser
    omitted a section (or saved an empty dict) no longer raises ``KeyError``.

    Args:
        user_dir: Folder expected to contain ``profile_data.json``.
    """
    data_file = Path(user_dir) / "profile_data.json"
    if not data_file.exists():
        print("ℹ️ No profile data available")
        return

    with open(data_file) as f:
        data = json.load(f)

    def text_of(key: str) -> str:
        # `or` also covers keys that are present but null/empty.
        return data.get(key) or "N/A"

    def count_of(key: str) -> int:
        return len(data.get(key) or [])

    print("\n🌟 Professional Summary:")
    print(f"Name: {text_of('full_name')}")
    print(f"Contact: {text_of('contact_email')} | {text_of('phone')}")
    print(f"\n🏫 Education ({count_of('education')} entries)")
    print(f"\n💼 Experience ({count_of('experience')} positions)")
    print(f"\n🛠️ Technical Skills ({count_of('technical_skills')} listed)")


# ================== Profile Merging & Management ==================


In [27]:
def _build_merge_prompt(existing: dict, new: dict) -> str:
    """Return the merge instructions with both profiles embedded as JSON."""
    return f"""
You are a helpful assistant tasked with merging two structured professional profiles extracted from CVs. 
Your goal is to intelligently combine the data from both profiles, avoiding redundancy, preserving the most complete and informative entries, and resolving conflicts sensibly.

Act as an expert career analyst with deep cross-industry knowledge (tech, healthcare, finance, engineering). 
You must identify transferable skills, merge overlapping entries, and preserve all unique information, especially for certifications.

Please extract and return the following fields in raw JSON format **only**, without preamble or commentary.

---
**Fields to Extract:**
- full_name
- contact_email
- phone
- summary
- linkedin
- github
- education (list)
- experience (list)
- technical_skills (list)
- soft_skills (list, optional)
- certifications (list)
- projects (list)
- languages (optional)
YOU CAN ADD AND SUBTRACT FIELDS ACCORDING TO PROVIDED CV AND INDUSTRY
---

**Guidelines for Merging and Extraction:**

1. **Core Info:**
   - Extract full legal name from the header or contact block.
   - Emails must be valid (e.g., name@domain.com).
   - Phone numbers must be in international format (+XXX...).
   - Include LinkedIn and GitHub links if found.

2. **Education:**
   - List all degrees and specializations.
   - Include institution name, degree, field, start and end date (MM/YYYY).
   - Highlight research projects or thesis titles if available.
   - Avoid duplication; if same degree exists with more details, keep the more complete version.

3. **Experience:**
   - Distinguish jobs, internships, freelance, and volunteering.
   - Include job title, company, duration, technologies used, and quantifiable outcomes.
   - Keep the most recent or complete version of similar roles.
   - Use consistent date format (MM/YYYY).

4. **Skills:**
   - Group skills into:
     • Technical Skills (tools, platforms, libraries)
     • Methodologies (Agile, Scrum, Six Sigma)
     • Domain Knowledge (e.g., GDPR, HIPAA)
   - Include experience levels if stated (e.g., Expert, Intermediate).

5. **Certifications (✅ Important):**
   - Extract each certification with full name, issuing organization, and date (if available).
   - Only merge certifications if the **exact full name and issuer match**.
   - If titles are slightly different or have extra info (e.g., "AWS Certified Developer – Associate" vs "AWS Developer Cert"), treat them as separate and preserve both.
    
6. **Projects:**
   - Include title, description, technologies used, role (if specified), and duration.
   - Projects may come from personal work, hackathons, university, or freelance.
   - Merge only if titles and descriptions are identical or nearly identical.
   - Preserve all distinct projects — no limit.
7. **Languages (if any):**
   - Include spoken languages and proficiency if listed.

8. **General Rules:**
   - Avoid redundancy and merge smartly.
   - Prioritize clarity, structure, and richness of information.
   - Do not add placeholder or fabricated data.
   - Output should be a valid JSON object.
   

---

**Input Profiles:**

Profile A (existing):
{json.dumps(existing, indent=2)}

Profile B (newly parsed):
{json.dumps(new, indent=2)}

Return ONLY the merged profile in raw JSON format.
Do NOT include explanations or commentary. Just output the final merged JSON.
"""


def _extract_json_object(raw_output: str) -> dict:
    """Parse the outermost ``{...}`` span of ``raw_output`` as JSON.

    Slicing from the first ``{`` to the last ``}`` drops any preamble or
    trailing commentary the model added around the object.

    Raises:
        ValueError: no brace pair was found, or the span is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    json_start = raw_output.find("{")
    json_end = raw_output.rfind("}")
    if json_start == -1 or json_end == -1:
        raise ValueError("No JSON object found in LLM output")
    return json.loads(raw_output[json_start:json_end + 1])


def _merge_profiles_locally(existing: dict, new: dict) -> dict:
    """Merge two profiles without the LLM, never discarding stored data.

    - When both sides hold a list, the lists are concatenated and entries equal
      to one already present are skipped.
    - Otherwise a non-empty value from ``new`` replaces the existing one.
    - A null/empty value from ``new`` is only used when ``existing`` has no
      such key at all.
    """
    merged = dict(existing)
    for key, value in new.items():
        current = merged.get(key)
        if isinstance(current, list) and isinstance(value, list):
            combined = list(current)
            for item in value:
                # Entries may be dicts (unhashable), so compare by equality
                # instead of building a set.
                if item not in combined:
                    combined.append(item)
            merged[key] = combined
        elif value is None or value == "" or value == [] or value == {}:
            merged.setdefault(key, value)
        else:
            merged[key] = value
    return merged


def merge_with_llm(existing: dict, new: dict) -> dict:
    """Use LLM to intelligently merge two structured profile dicts.

    Args:
        existing: Profile already stored for the user.
        new: Profile parsed from the newly uploaded CV.

    Returns:
        ``new`` when ``existing`` is empty and ``existing`` when ``new`` is
        empty (no LLM call in either case). Otherwise the JSON object returned
        by the LLM. If the call or the JSON extraction fails, the failure and
        the raw output are printed and a local, non-lossy merge of the two
        dicts is returned instead.
    """
    if not existing:
        return new
    if not new:
        return existing

    response = None
    try:
        response = llm.invoke(_build_merge_prompt(existing, new))
        return _extract_json_object(response.content.strip())
    except Exception as e:
        # Deliberately broad: client, network and parsing errors must all
        # degrade to the local merge rather than abort the upload.
        print(f"⚠️ LLM merge failed: {e}")
        raw_output = response.content if response is not None else "No response"
        print(f"⚠️ Raw LLM output:\n{raw_output}")
        return _merge_profiles_locally(existing, new)


In [18]:
def manage_cv(user_dir: str) -> None:
    """Interactive menu for an existing profile: upload, delete, view, exit.

    The menu repeats until the user chooses exit.

    - Upload: parses a new CV and merges it into the stored profile (or saves
      it as the first profile). If parsing yields no data, nothing is saved, so
      a failed parse can neither create an empty profile nor be reported as a
      successful merge.
    - Delete: after a ``y`` confirmation, removes every entry directly inside
      ``user_dir``.
    - View: prints the stored profile summary.

    Args:
        user_dir: The user's storage folder.
    """
    while True:
        action = input("\nChoose: [U]pload new, [D]elete, [V]iew, [E]xit: ").strip().lower()

        if action == "u":
            text = upload_cv(user_dir)
            new_parsed = parse_cv(text)

            # parse_cv returns an empty dict on failure; do not persist it.
            if not new_parsed:
                print("⚠️ CV could not be parsed; profile data left unchanged.")
                continue

            # Load existing data if it exists
            existing_file = Path(user_dir) / "profile_data.json"
            if existing_file.exists():
                with open(existing_file) as f:
                    existing_parsed = json.load(f)
                merged_data = merge_with_llm(existing_parsed, new_parsed)
                print("🔄 Merged new CV with existing profile using LLM.")
            else:
                merged_data = new_parsed
                print("🆕 No existing profile found. Saving new data.")

            save_parsed_data(merged_data, user_dir)

        elif action == "d":
            # Strip like the menu input above so stray spaces don't turn an
            # intended "y" into a silent no-op.
            confirm = input("⚠️ Delete ALL profile data? (y/n): ").strip().lower()
            if confirm == "y":
                for item in Path(user_dir).glob("*"):
                    item.unlink()
                print("🗑️ Profile data deleted")
        elif action == "v":
            display_profile_summary(user_dir)
        elif action == "e":
            break
        else:
            print("❌ Invalid option")


# ================== Main Execution Flow ==================

In [28]:

def run_cv_pipeline() -> None:
    """Entry point: identify the user, then onboard or manage their profile.

    When the user's folder already contains anything, the interactive
    management menu is shown. When it is empty, a first CV is uploaded, parsed
    and saved, after which the function returns.
    """
    _, user_dir = get_user_email()

    # Peek at the first directory entry to learn whether the folder is empty.
    folder_has_content = next(Path(user_dir).iterdir(), None) is not None
    if folder_has_content:
        manage_cv(user_dir)
        return

    print("📥 Initial profile setup")
    save_parsed_data(parse_cv(upload_cv(user_dir)), user_dir)


if __name__ == "__main__":
    run_cv_pipeline()

✅ Existing profile found
✅ CV saved: cv_storage\meers_gmail.com\SHAHMEER_KAMRAN_Resume.pdf
🔄 Merged new CV with existing profile using LLM.
📄 Profile data saved to cv_storage\meers_gmail.com\profile_data.json

🌟 Professional Summary:
Name: Shahmeer Gull
Contact: shahmeergull20@gmail.com | +92-309-0654885

🏫 Education (1 entries)

💼 Experience (8 positions)

🛠️ Technical Skills (18 listed)


### TODO: ADD MULTIPLE CV OUTPUTS TO ONE PROFILE
### TODO: MAKE SURE WORD (.docx) FILES ARE BEING READ (USE THE LANGCHAIN DEFAULT LOADER)
### TODO: MAYBE INTEGRATE CHROMA DB