# In [1]:
import re
import json
import os
import csv
from PyPDF2 import PdfReader
import docx2txt

# -------------------------------
# Extraction functions
# -------------------------------
def extract_email(text):
    """Return every email-like substring found in *text*.

    Args:
        text: Arbitrary text to scan.

    Returns:
        A list of matched address strings in order of appearance
        (duplicates are kept); empty when nothing matches.
    """
    # The TLD class is letters only. The previous ``[A-Z|a-z]`` also accepted
    # a literal ``|``, so text like ``x@y.co|uk`` was captured as one address.
    return re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)

def extract_phone(text):
    """Collect phone-number-like substrings from *text*.

    A candidate is an optional ``+``, a digit, at least eight further
    digits/spaces/hyphens, and a closing digit.

    Returns:
        The matched substrings in order of appearance (possibly empty).
    """
    phone_re = re.compile(r'\+?\d[\d -]{8,}\d')
    return [hit.group(0) for hit in phone_re.finditer(text)]

def extract_projects(text):
    """Return the lines of *text* that mention project work.

    A line qualifies when it contains one of the whole-word phrases
    "project", "worked on", "involved in" or "contributed to"
    (case-insensitive). Qualifying lines are returned stripped of
    surrounding whitespace, in their original order.
    """
    keyword_re = re.compile(r'\b(project|worked on|involved in|contributed to)\b', re.IGNORECASE)
    highlights = []
    for raw_line in text.splitlines():
        if keyword_re.search(raw_line) is None:
            continue
        highlights.append(raw_line.strip())
    return highlights

def extract_skill_experience(text, skill_dict):
    """Map each skill mentioned in *text* to the experience stated near it.

    For every synonym of every skill, look for a duration such as
    ``5 years`` / ``3+ yrs`` within 50 characters (on the same line) before
    or after the synonym.

    Args:
        text: Resume text to scan.
        skill_dict: Mapping of canonical skill name -> list of synonym strings.

    Returns:
        Dict of canonical skill name -> matched duration string, or
        ``"Not specified"`` when the skill is mentioned without a nearby
        duration. Skills that never appear are omitted.
    """
    skill_exp = {}
    exp_pattern = r'\d+\+?\-?\d*\s*(?:years|yrs)'
    for skill, synonyms in skill_dict.items():
        for syn in synonyms:
            # Escape the synonym so names like "c++" are matched literally,
            # and use lookarounds instead of \b so synonyms that end in
            # punctuation ("c++", "c#") still get a whole-token match.
            term = rf'(?<!\w){re.escape(syn)}(?!\w)'
            # The gap is lazy so the *nearest* duration wins and a greedy
            # window cannot swallow leading digits ("15 years" -> "5 years").
            near = re.search(
                rf'{term}.{{0,50}}?(?P<after>{exp_pattern})'
                rf'|(?P<before>{exp_pattern}).{{0,50}}?{term}',
                text,
                re.IGNORECASE,
            )
            if near:
                # Exactly one of the two named groups participates in a match.
                skill_exp[skill] = (near.group('after') or near.group('before')).strip()
            elif skill not in skill_exp and re.search(term, text, re.IGNORECASE):
                # Never downgrade a duration found via another synonym.
                skill_exp[skill] = "Not specified"
    return skill_exp

def extract_total_experience(text):
    """Return the largest year count mentioned in *text*.

    Looks for durations such as ``5 years``, ``10+ yrs`` or ``3-5 years``
    (case-insensitive). For a range, the leading number is used.

    Returns:
        The maximum leading number as an ``int``, or the string
        ``"Not specified"`` when no duration is found.
    """
    matches = re.findall(r'(\d+\+?\-?\d*)\s*(?:years|yrs)\s*(?:of)?\s*(?:experience)?', text, re.IGNORECASE)
    # Every captured group starts with at least one digit, so the leading
    # number can be taken directly; no catch-all ``except`` is needed.
    years = [int(re.match(r'\d+', m).group()) for m in matches]
    return max(years) if years else "Not specified"

# -------------------------------
# File text extraction
# -------------------------------
def extract_text_from_pdf(file_path):
    """Read a PDF with ``PdfReader`` and return its text.

    Each page that yields non-empty text contributes that text followed by
    a newline; pages with no extractable text are skipped.
    """
    reader = PdfReader(file_path)
    chunks = []
    for page in reader.pages:
        content = page.extract_text()
        if not content:
            continue
        chunks.append(content + "\n")
    return "".join(chunks)

def extract_text_from_docx(file_path):
    """Return whatever ``docx2txt.process`` produces for the file at *file_path*."""
    document_text = docx2txt.process(file_path)
    return document_text

# -------------------------------
# Skill dictionary
# -------------------------------
# Maps a canonical skill name (the key reported in the parsed output) to the
# list of synonym strings searched for in the resume text. Synonyms are
# written in lowercase; the lookups in extract_skill_experience are
# case-insensitive.
# NOTE: some synonyms are listed under more than one skill (e.g. "redis",
# "rails", "soa", "service mesh", "dynamodb"), so a single mention can be
# reported under several canonical skills.
skill_dict = {
    "python": ["python", "python3", "cpython", "pypy"],
    "java": ["java", "openjdk", "jdk", "jvm"],
    "javascript": ["javascript", "js", "nodejs", "ecmascript"],
    "typescript": ["typescript", "ts"],
    "go": ["go", "golang"],
    "c": ["c"],
    "c++": ["c++", "cpp"],
    "c#": ["c#", "csharp"],
    "ruby": ["ruby", "ruby on rails", "rails"],
    "php": ["php", "laravel", "symfony"],
    "rust": ["rust"],
    "kotlin": ["kotlin"],
    "scala": ["scala"],
    "r": ["r", "r language"],
    "swift": ["swift"],
    "shell": ["shell", "bash", "zsh", "sh"],
    "perl": ["perl"],
    "sql": ["sql", "mysql", "postgresql", "oracle", "sqlite", "mssql"],
    "nosql": ["nosql", "mongodb", "cassandra", "redis", "dynamodb", "couchdb"],
    "big data": ["big data", "hadoop", "spark", "mapreduce", "hive", "pig"],
    "data engineering": ["data engineering", "etl", "data pipeline", "airflow"],
    "data science": ["data science", "machine learning", "ml", "statistics"],
    "deep learning": ["deep learning", "neural networks", "tensorflow", "pytorch", "keras"],
    "mlops": ["mlops", "model deployment", "model serving", "sagemaker", "mlflow"],
    "nlp": ["nlp", "natural language processing", "transformers", "spacy", "nltk"],
    "computer vision": ["computer vision", "cv", "opencv"],
    "analytics": ["analytics", "business intelligence", "bi", "tableau", "power bi"],
    "rest": ["rest", "restful api", "api", "web api"],
    "graphql": ["graphql"],
    "web frameworks": ["django", "flask", "express", "spring", "rails", "fastapi"],
    "frontend frameworks": ["react", "angular", "vue", "svelte", "ember"],
    "html": ["html", "html5"],
    "css": ["css", "css3", "scss", "sass", "less"],
    "webpack": ["webpack", "rollup", "parcel"],
    "microservices": ["microservices", "service oriented architecture", "soa"],
    "devops": ["devops", "ci/cd", "continuous integration", "continuous delivery", "continuous deployment"],
    "docker": ["docker", "containers"],
    "kubernetes": ["kubernetes", "k8s", "kube"],
    "terraform": ["terraform"],
    "ansible": ["ansible"],
    "chef": ["chef"],
    "puppet": ["puppet"],
    "helm": ["helm"],
    "istio": ["istio", "service mesh"],
    "prometheus": ["prometheus", "grafana", "monitoring"],
    "logging": ["elk", "elasticsearch", "logstash", "kibana", "splunk"],
    "cloud aws": ["aws", "amazon web services", "ec2", "s3", "lambda", "cloudformation", "iam", "dynamodb"],
    "cloud azure": ["azure", "microsoft azure", "azure functions", "azure devops", "arm templates"],
    "cloud gcp": ["gcp", "google cloud", "google cloud platform", "gce", "bigquery", "cloud functions"],
    "openstack": ["openstack"],
    "serverless": ["serverless", "faas"],
    "edge computing": ["edge computing"],
    "networking": ["networking", "dns", "http", "tcp/ip"],
    "security": ["security", "kubernetes security", "oauth2", "jwt", "tls", "ssl", "vault"],
    "git": ["git", "gitlab", "github", "bitbucket"],
    "ci tools": ["jenkins", "circleci", "travis ci", "github actions", "gitlab ci", "azure pipelines"],
    "jira": ["jira", "confluence"],
    "slack": ["slack"],
    "docker-compose": ["docker-compose", "compose"],
    "testing": ["testing", "unit test", "integration test", "pytest", "junit", "mocha", "jest"],
    "performance": ["performance", "profiling", "benchmark"],
    "cache": ["cache", "redis", "memcached"],
    "microservices architecture": ["microservices architecture", "soa", "service mesh"],
    "design patterns": ["design patterns", "solid", "ddd", "clean architecture"],
    "architecture": ["architecture", "system design", "scalability", "high availability"],
    "agile": ["agile", "scrum", "kanban"],
    "devsecops": ["devsecops", "security as code", "shift left"],
    "observability": ["observability", "opentelemetry", "logging", "tracing", "metrics"]
}

# -------------------------------
# Main parser
# -------------------------------
def parse_cv(file_path):
    """Parse a resume file into a dictionary of extracted fields.

    Args:
        file_path: Path to a ``.pdf`` or ``.docx`` file (extension check is
            case-insensitive).

    Returns:
        Dict with keys ``Email``, ``Phone``, ``Total_Experience``,
        ``Skills_Experience`` and ``Project_Highlights``.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        ValueError: If the file is neither PDF nor DOCX.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File does not exist: {file_path}")

    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        text = extract_text_from_pdf(file_path)
    elif lowered_path.endswith(".docx"):
        text = extract_text_from_docx(file_path)
    else:
        raise ValueError("Only PDF or DOCX files are supported")

    # NOTE: name extraction (taking the first line of the text when it is
    # shorter than 100 characters) is currently disabled, so no "Name" key
    # is produced.
    found_emails = extract_email(text)
    found_phones = extract_phone(text)
    highlights = extract_projects(text)
    per_skill = extract_skill_experience(text, skill_dict)
    overall_years = extract_total_experience(text)

    return {
        "Email": found_emails,
        "Phone": found_phones,
        "Total_Experience": overall_years,
        "Skills_Experience": per_skill,
        "Project_Highlights": highlights,
    }

# -------------------------------
# Save to JSON and CSV
# -------------------------------
def save_cv(cv_data, json_file, csv_file):
    """Write parsed CV data to a JSON file and a CSV file.

    Args:
        cv_data: Dict with keys ``Email``, ``Phone``, ``Total_Experience``,
            ``Skills_Experience`` and ``Project_Highlights`` (the shape
            built by ``parse_cv``).
        json_file: Destination path for the JSON dump of *cv_data*.
        csv_file: Destination path for the CSV; it gets a header row and
            then one row per entry in ``Skills_Experience``.
    """
    # Save JSON
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(cv_data, f, indent=4)

    # These columns repeat on every row, so build them once. Emails are
    # joined like phones; passing the list straight to the writer produced
    # its Python repr (e.g. "['a@b.com']") in the CSV cell.
    emails = ", ".join(cv_data["Email"])
    phones = ", ".join(cv_data["Phone"])
    highlights = " | ".join(cv_data["Project_Highlights"])

    # Save CSV
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Email", "Phone", "Total_Experience", "Skill", "Skill_Experience", "Project_Highlights"])
        for skill, exp in cv_data["Skills_Experience"].items():
            writer.writerow([
                emails,
                phones,
                cv_data["Total_Experience"],
                skill,
                exp,
                highlights,
            ])

# -------------------------------
# Example usage
# -------------------------------
# Run the example only when this file is executed directly (script or
# notebook cell), so importing the module does not try to parse a
# machine-specific path as a side effect.
if __name__ == "__main__":
    file_path = r"C:\Users\takbh\BDA696\venv2\how-I-met-my-job\UmaTakbhate_Resume.pdf"  # or .docx
    parsed_cv = parse_cv(file_path)

    # Output files
    json_file = "parsed_cv.json"
    csv_file = "parsed_cv.csv"

    save_cv(parsed_cv, json_file, csv_file)

    print(f"JSON saved to {json_file}")
    print(f"CSV saved to {csv_file}")

# Output:
# JSON saved to parsed_cv.json
# CSV saved to parsed_cv.csv
